In [None]:
import pandas as pd
from tqdm import tqdm
from pymongo import MongoClient
import re
import pickle
from datetime import datetime
from utils import LVDoc
import time

In [None]:
# DB Constants
# The URI targets a MongoDB instance on localhost with no credentials and SSL
# disabled; change it here when running against another deployment.
mongo = MongoClient("mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&ssl=false")
db = mongo["delfi_texts"]
source_col = db["lv_news"]

# WARNING: destructive. Any previously stored annotations are dropped every
# time this cell runs. The collection handle stays usable after drop(), so it
# does not need to be looked up again; the next insert re-creates the
# collection.
destination_col = db["lv_news_annotated"]
destination_col.drop()

In [None]:
# Load cabinet data
# `dfl` is later handed to LVAnnotator and iterated record by record in
# resolve_cabinet; the printed length is a quick sanity check of the load.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load a pickle that comes from a trusted source.
with open("Cabinet_data_NER_LV.pkl", "rb") as file:
    dfl = pickle.load(file)
print(len(dfl))

In [None]:
# Helpers


def resolve_cabinet(cabinet_data, datetime_obj):
    """
    Selects the cabinet records whose term of office covers a given moment

    PARAMS:
        cabinet_data:list - cabinet records; each must expose `from` and `to` keys
                            that bound the incumbency
        datetime_obj:datetime - moment to test, e.g. the publication date of an article
    RETURNS:
        list - the records for which `from` <= datetime_obj <= `to` (both ends
               inclusive), in their original order
    """

    return [
        member
        for member in cabinet_data
        if member["from"] <= datetime_obj <= member["to"]
    ]


def cabinet_entities(datetime_obj, entities, cabinet_data):
    """
    Matches entities extracted from an article against the cabinet
    that was in office when the article was published

    PARAMS:
        datetime_obj:datetime - datetime when the article was published
        entities:list - entities extracted from the article text
        cabinet_data:list - cabinet records (names, position, incumbency bounds)

    RETURNS:
        list - one dict per in-office record that has at least one of its `names`
               among the entities, with keys `person_id`, `person_name` and
               `cabinet_ents` (the record's `ministry`)
    """
    mentioned = set(entities)
    in_office = resolve_cabinet(cabinet_data, datetime_obj)
    return [
        {
            "person_id": member["person_id"],
            "person_name": member["person_name"],
            "cabinet_ents": member["ministry"],
        }
        for member in in_office
        if any(alias in mentioned for alias in member["names"])
    ]

def detect_ents_row(text, dfl, timestamp):
    """
    Finds the cabinet members in office at `timestamp` that are mentioned in `text`

    A member counts as mentioned when any entry of its `names` list occurs in
    `text` (plain `in` containment, i.e. a substring test for strings).

    PARAMS:
        text:str - article text to scan
        dfl:list - cabinet records (names, position, incumbency bounds)
        timestamp:datetime - datetime when the article was published

    RETURNS:
        list - unique dicts with keys `person_id`, `cabinet_no` and
               `cabinet_ents` (the record's `ministry`), in order of first match
    """
    found = []

    for member in resolve_cabinet(dfl, timestamp):
        # Every name variant of a member yields the same result dict, so stop
        # at the first variant that matches instead of rebuilding it each time.
        if not any(alias in text for alias in member["names"]):
            continue

        mention = {
            "person_id": member["person_id"],
            "cabinet_no": member["cabinet_no"],
            "cabinet_ents": member["ministry"],
        }
        # Distinct records can still collapse to the same dict, so keep the
        # duplicate check.
        if mention not in found:
            found.append(mention)
    return found


def prime_and_prez(cabinet: list):
    """
    Extracts the prime minister, cabinet number and president from a cabinet

    PARAMS:
        cabinet:list - cabinet records with `ministry` and `person_name` keys
                       (plus `cabinet_no` on the prime minister's record)
    RETURNS:
        tuple - (prime minister name, cabinet number as a string, president name);
                a slot stays an empty string when no matching record exists,
                and the last matching record wins when there are several
    """
    head_of_government, cabinet_number, head_of_state = "", "", ""

    for member in cabinet:
        role = member["ministry"]
        if role == "Premjers":
            head_of_government = member["person_name"]
            cabinet_number = str(member["cabinet_no"])
        elif role in ("Prezidente", "Prezidents"):
            head_of_state = member["person_name"]

    return (head_of_government, cabinet_number, head_of_state)

In [None]:
class LVAnnotator:
    """
    Annotates LV news texts to detect mentions of cabinet members and president

    Relies on the module-level helpers `detect_ents_row`, `resolve_cabinet`
    and `prime_and_prez`.
    """

    # Column order of the frame returned by `detect_entities`.
    _ENTITY_COLUMNS = ["para_id", "cabinet_ents", "date", "source", "text"]

    def __init__(self, cabinet_data=None):
        """
        PARAMS:
            cabinet_data:list - cabinet records (names, position, incumbency bounds);
                                when omitted, the module-level `dfl` is used
        """
        self.cabinet_data = dfl if cabinet_data is None else cabinet_data

    def detect_entities(self, input_data):
        """
        Detects cabinet mentions in each input record and groups them per record

        PARAMS:
            input_data: list [{"_id":IDObject, "text":str, "date":datetime, "source":str},
                              {"_id":IDObject, "text":str, "date":datetime, "source":str}]
        RETURNS:
            annotate_data: pandas.DataFrame with one row per record that has at
                           least one mention and the columns `para_id`,
                           `cabinet_ents` (list of matched ministries), `date`,
                           `source`, `text`; records without mentions are left
                           out, and the frame is empty (same columns) when
                           nothing matched at all
        """

        annotated_raw = []

        for row in tqdm(input_data):
            cab_ents = detect_ents_row(row["text"], self.cabinet_data, row["date"])
            # One flat record per (input record, mention) pair; grouped below.
            for mention in cab_ents:
                record = {
                    "para_id": str(row["_id"]),
                    "date": row["date"],
                    "text": row["text"],
                    "source": row["source"],
                }
                record.update(mention)
                annotated_raw.append(record)

        if not annotated_raw:
            # A frame built from an empty list has no columns, so grouping by
            # "para_id" would raise KeyError; return an empty, well-formed frame.
            return pd.DataFrame(columns=self._ENTITY_COLUMNS)

        grouped = (
            pd.DataFrame(annotated_raw)
            .groupby(["para_id"])
            .agg(
                {
                    "cabinet_ents": list,
                    "date": "first",
                    "text": "first",
                    "source": "first",
                }
            )
            .reset_index()
        )
        # Rebuild from plain lists to get a fresh frame with a default index
        # and the documented column order.
        return pd.DataFrame(
            {column: list(grouped[column]) for column in self._ENTITY_COLUMNS}
        )

    def extend_data_datetime_vars(self, interim_data):
        """
        Adds year, month, day vars to the dataframe

        The new columns are written into `interim_data` itself; the returned
        frame is a copy of it sorted by `date`.

        PARAMS:
            interim_data:DataFrame - a dataframe with `date` column
        RETURNS:
            interim_data:DataFrame - sorted by `date`, with `year`, `month`, `day` added
        """

        # Read the parts straight from the column: a row-wise apply returns a
        # DataFrame instead of a Series on an empty frame, which cannot be
        # assigned to a single column.
        dates = list(interim_data["date"])
        interim_data["year"] = [moment.year for moment in dates]
        interim_data["month"] = [moment.month for moment in dates]
        interim_data["day"] = [moment.day for moment in dates]

        return interim_data.sort_values(by=["date"])

    def extend_data_cabinet_vars(self, interim_data):
        """
        Adds cabinet vars: cabinet, cabinet_no, president

        `cabinet` holds the prime minister's name, `cabinet_no` the cabinet
        number as a string and `president` the president's name, each resolved
        for the row's `date`. The columns are written into `interim_data` itself.

        PARAMS:
            interim_data:DataFrame - a dataframe with `date` column
        RETURNS:
            interim_data:DataFrame - the same frame with the three columns added
        """

        prime_ministers = []
        cabinet_numbers = []
        presidents = []

        for published in tqdm(interim_data.date):
            in_office = resolve_cabinet(self.cabinet_data, published)
            prime_minister, number, president = prime_and_prez(in_office)
            prime_ministers.append(prime_minister)
            cabinet_numbers.append(number)
            presidents.append(president)

        interim_data["cabinet"] = prime_ministers
        interim_data["cabinet_no"] = cabinet_numbers
        interim_data["president"] = presidents

        return interim_data

In [None]:
# Years to process: `begin` up to and including the current calendar year.
begin = 1999
end = datetime.now().year + 1  # exclusive upper bound for range()

years = list(range(begin, end))

anno = LVAnnotator()

In [None]:
# Annotate the source collection one calendar year at a time and store the
# validated results in the destination collection.
for y in years:
    print(y)
    # Half-open interval [Jan 1 of y, Jan 1 of y + 1).
    query = {"date": {"$gte": datetime(y, 1, 1), "$lt": datetime(y + 1, 1, 1)}}
    # The cursor is opened with no_cursor_timeout, so the server will not reap
    # it on its own; the context manager closes it even if reading fails.
    with source_col.find(query, no_cursor_timeout=True) as cursor:
        input_data = list(cursor)
    print(f"Records for this year {len(input_data)}")
    if input_data:

        df = anno.detect_entities(input_data)
        df2 = anno.extend_data_datetime_vars(df)
        df3 = anno.extend_data_cabinet_vars(df2)

        validated_data = [
            LVDoc(**i) for i in df3.to_dict(orient="records")
        ]  # Testing for schema compliance
        validated_data2 = [i.dict() for i in validated_data]
        # insert_many rejects an empty document list, so skip the write when
        # the year produced nothing to store.
        if validated_data2:
            destination_col.insert_many(validated_data2)
    # Pause between yearly batches.
    # NOTE(review): presumably to throttle load on the database - confirm.
    time.sleep(60)