In [6]:
import pickle
import re
import time
from datetime import datetime

import pandas as pd
from pymongo import MongoClient
from tqdm import tqdm

In [7]:
# DB constants: connection, database and the two collections used below
MONGO_URI = "mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&ssl=false"
mongo = MongoClient(MONGO_URI)
db = mongo["delfi_texts"]
source_col = db["lv_news"]

# The annotated collection is dropped every time this cell runs, so whatever
# a previous run stored there is discarded; afterwards take a handle to the
# (now empty) collection of the same name.
db["lv_news_annotated"].drop()
destination_col = db["lv_news_annotated"]

In [8]:
# Load cabinet data
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only load a pickle that comes from a trusted source.
with open("Cabinet_data_NER_LV.pkl", mode="rb") as pkl_file:
    dfl = pickle.load(pkl_file)
print(len(dfl))

255


In [27]:
# Helpers


def resolve_cabinet(cabinet_data, datetime_obj):
    """
    Selects the cabinet records whose incumbency covers a given moment

    PARAMS:
        cabinet_data:list - records carrying at least `from` and `to` bounds
        datetime_obj:datetime - moment to look up (e.g. when an article was published)
    RETURNS:
        list - the records for which `from` <= datetime_obj <= `to`
               (both ends inclusive), in their original order
    """
    return [
        member
        for member in cabinet_data
        if member["from"] <= datetime_obj <= member["to"]
    ]


def cabinet_entities(datetime_obj, entities, cabinet_data):
    """
    Matches entities extracted from a text against the cabinet in office
    at the time the article was published

    PARAMS:
        datetime_obj:datetime - datetime when the article was published
        entities:list - entities extracted from the text
        cabinet_data:list - cabinet members, their positions and incumbency duration

    RETURNS:
        list - one dict (`person_id`, `person_name`, `cabinet_ents`) for every
               in-office record that has at least one of its `names` among
               the entities; `cabinet_ents` holds the record's `ministry`
    """
    mentioned = set(entities)
    in_office = resolve_cabinet(cabinet_data, datetime_obj)

    return [
        {
            "person_id": member["person_id"],
            "person_name": member["person_name"],
            "cabinet_ents": member["ministry"],
        }
        for member in in_office
        # a record matches as soon as one of its name variants was extracted
        if not mentioned.isdisjoint(member["names"])
    ]

def detect_ents_row(text, dfl, timestamp):
    """
    Finds which cabinet members in office at `timestamp` are mentioned in a text

    A member counts as mentioned when any string of their `names` list occurs
    in `text` as a substring.

    PARAMS:
        text:str - article (paragraph) text to scan
        dfl:list - a list of cabinet members, their positions and incumbency duration
        timestamp:datetime - datetime when the article was published

    RETURNS:
        list - one dict per mentioned member with `person_id`, `cabinet_no` and
               `cabinet_ents` (the member's `ministry`); identical dicts are
               reported only once. Empty when `text` is missing, empty or
               not a string.
    """
    # Nothing to scan: `in` on None (or any other non-string) would raise TypeError
    if not isinstance(text, str) or not text:
        return []

    mentions = []
    for member in resolve_cabinet(dfl, timestamp):
        # The resulting dict depends on the member only, so the first matching
        # name variant is enough - no need to test the remaining ones.
        if not any(alias in text for alias in member["names"]):
            continue

        mention = {
            "person_id": member["person_id"],
            "cabinet_no": member["cabinet_no"],
            "cabinet_ents": member["ministry"],
        }
        # Two records may describe the same person/cabinet/ministry; keep one
        if mention not in mentions:
            mentions.append(mention)
    return mentions


def extend_data_datetime_vars(interim_data):

    """
    Adds year, month, day vars to the dataframe

    The three columns are written onto the frame that was passed in; the
    returned frame is a copy of it sorted by `date`.

    PARAMS:
        interim_data:DataFrame - a dataframe with `date` column
    RETURNS:
        interim_data:DataFrame - the extended frame, sorted by `date`
    """

    # Vectorised extraction instead of a row-wise apply: apply(axis=1) on an
    # empty frame hands back a DataFrame (not a Series), which cannot be
    # assigned to a single column. to_datetime also gives an empty or
    # object-typed `date` column a datetime dtype so that `.dt` is available.
    dates = pd.to_datetime(interim_data["date"])

    interim_data["year"] = dates.dt.year
    interim_data["month"] = dates.dt.month
    interim_data["day"] = dates.dt.day

    return interim_data.sort_values(by=["date"])


def prime_and_prez(cabinet: list):
    """
    Picks the prime minister, the cabinet number and the president out of a cabinet

    PARAMS:
        cabinet:list - cabinet records with `ministry` and `person_name`
                       (plus `cabinet_no` on the prime minister's record)
    RETURNS:
        tuple - (prime minister name, cabinet number as str, president name);
                a slot stays "" when no matching record exists and the last
                matching record wins when there are several
    """
    premier_name = ""
    premier_cabinet_no = ""
    president_name = ""

    for member in cabinet:
        role = member["ministry"]
        if role == "Premjers":
            premier_name = member["person_name"]
            premier_cabinet_no = str(member["cabinet_no"])
        elif role in ("Prezidente", "Prezidents"):
            # both grammatical genders of the title are in use
            president_name = member["person_name"]

    return (premier_name, premier_cabinet_no, president_name)

def extend_data_cabinet_vars(interim_data):

    """
    Adds cabinet vars: cabinet, cabinet_no, president

    For every value of the `date` column the cabinet in office is resolved
    from the notebook-level `dfl`; the prime minister's name goes to
    `cabinet`, the cabinet number to `cabinet_no` and the president's name
    to `president`. The columns are written onto the frame that was passed in.

    PARAMS:
        interim_data:DataFrame - a dataframe with `date` column
    RETURNS:
        interim_data:DataFrame - the same frame with the three columns added
    """

    premiers, cabinet_numbers, presidents = [], [], []

    for published in tqdm(interim_data.date):
        premier, number, president = prime_and_prez(resolve_cabinet(dfl, published))
        premiers.append(premier)
        cabinet_numbers.append(number)
        presidents.append(president)

    interim_data["cabinet"] = premiers
    interim_data["cabinet_no"] = cabinet_numbers
    interim_data["president"] = presidents

    return interim_data

In [34]:
class LVAnnotator:
    """
    Annotates LV news texts to detect mentions of cabinet members and president

    The methods are meant to be chained:
    `detect_entities` -> `extend_data_datetime_vars` -> `extend_data_cabinet_vars`.
    """

    # Column layout of the frame returned by `detect_entities`
    _ENTITY_COLUMNS = ("para_id", "cabinet_ents", "date", "source", "text")

    def __init__(self, cabinet_data=None):
        """
        PARAMS:
            cabinet_data:list - cabinet records to annotate against; when
                                omitted the notebook-level `dfl` is used
        """
        # `dfl` is looked up only when nothing is passed in, so the class does
        # not require the pickle to be loaded when data is supplied explicitly.
        self.cabinet_data = dfl if cabinet_data is None else cabinet_data

    def detect_entities(self, input_data):
        """
        Detects cabinet mentions per document and collapses them to one row
        per document.

        PARAMS: 
            input_data: list [{"_id":IDObject, "text":str, "date":datetime, "source":str}, 
                            {{"_id":IDObject, "text":str, "date":datetime, "source":str}}]
        RETURNS:
            annotate_data: pandas.DataFrame with columns para_id, cabinet_ents
                           (list of ministries mentioned), date, source, text;
                           documents without any mention are left out, and the
                           frame is empty (same columns) when nothing matched
        """

        annotated_raw = []

        for row in tqdm(input_data):
            cab_ents = detect_ents_row(row["text"], self.cabinet_data, row["date"])
            # one raw record per (document, mentioned cabinet member)
            for mention in cab_ents:
                record = {
                    "para_id": str(row["_id"]),
                    "date": row["date"],
                    "text": row["text"],
                    "source": row["source"],
                }
                record.update(mention)
                annotated_raw.append(record)

        columns = list(self._ENTITY_COLUMNS)

        # Without any mention the raw frame would have no columns at all and
        # grouping by `para_id` would raise KeyError - hand back an empty
        # frame with the usual layout instead.
        if not annotated_raw:
            return pd.DataFrame(columns=columns)

        df_ent = pd.DataFrame(annotated_raw)

        grouped = df_ent.groupby(["para_id"]).agg(
            {
                "cabinet_ents": list,
                "date": "first",
                "source": "first",
                "text": "first",
            }
        )

        # `para_id` comes back as the index; turn it into a regular column
        return grouped.reset_index().reindex(columns=columns)

    def extend_data_datetime_vars(self, interim_data):

        """
        Adds year, month, day vars to the dataframe

        The three columns are written onto the frame that was passed in; the
        returned frame is a copy of it sorted by `date`.

        PARAMS:
            interim_data:DataFrame - a dataframe with `date` column
        RETURNS:
            interim_data:DataFrame - the extended frame, sorted by `date`
        """

        # Vectorised extraction instead of a row-wise apply: apply(axis=1) on
        # an empty frame hands back a DataFrame (not a Series), which cannot
        # be assigned to a single column. to_datetime also gives an empty or
        # object-typed `date` column a datetime dtype so `.dt` is available.
        dates = pd.to_datetime(interim_data["date"])

        interim_data["year"] = dates.dt.year
        interim_data["month"] = dates.dt.month
        interim_data["day"] = dates.dt.day

        return interim_data.sort_values(by=["date"])

    def extend_data_cabinet_vars(self, interim_data):

        """
        Adds cabinet vars: cabinet, cabinet_no, president

        For every value of the `date` column the cabinet in office is resolved
        from `self.cabinet_data`; the prime minister's name goes to `cabinet`,
        the cabinet number to `cabinet_no` and the president's name to
        `president`. The columns are written onto the frame that was passed in.

        PARAMS:
            interim_data:DataFrame - a dataframe with `date` column
        RETURNS:
            interim_data:DataFrame - the same frame with the three columns added
        """

        premiers = []
        cabinet_numbers = []
        presidents = []

        for published in tqdm(interim_data.date):
            cabinet = resolve_cabinet(self.cabinet_data, published)
            premier, number, president = prime_and_prez(cabinet)
            premiers.append(premier)
            cabinet_numbers.append(number)
            presidents.append(president)

        interim_data["cabinet"] = premiers
        interim_data["cabinet_no"] = cabinet_numbers
        interim_data["president"] = presidents

        return interim_data

In [None]:
# Annotator instance used by the per-year loops in the cells below
anno = LVAnnotator()

In [None]:
# Dry run of the per-year pipeline: annotate one year and validate it against
# the document schema, without writing anything back.
# NOTE(review): `years` is only assigned in a later cell and `LVDoc` is not
# defined in the visible cells - confirm both are in scope before running.
for y in years:
    print(y)
    query = {"date": {"$gte": datetime(y, 1, 1), "$lt": datetime(y + 1, 1, 1)}}
    # A cursor opened with no_cursor_timeout=True is not cleaned up by the
    # server on its own, so close it deterministically - also when reading
    # fails part-way through.
    with source_col.find(query, no_cursor_timeout=True) as cursor:
        input_data = list(cursor)
    print(f"Records for this year {len(input_data)}")
    if len(input_data) > 0:

        df = anno.detect_entities(input_data)
        df2 = anno.extend_data_datetime_vars(df)
        df3 = anno.extend_data_cabinet_vars(df2)

        validated_data = [
            LVDoc(**i) for i in df3.to_dict(orient="records")
        ]  # Testing for schema compliance
        validated_data2 = [i.dict() for i in validated_data]
        # Stop after the first year that has data; the write-back below is
        # commented out.
        break
        #mongo_lt_destination_col.insert_many(validated_data2)
    #time.sleep(60)

In [35]:
# Second annotator instance, used by the 500-document sample run in the next cell
lva = LVAnnotator()

In [36]:
# Sample run: annotate the first 500 articles that carry a date
query = {"date": {"$ne": None}}
cursor = source_col.find(query).limit(500)
data = list(cursor)

# Same three-step chain as the per-year loops
plh = lva.detect_entities(data)
plh2 = lva.extend_data_datetime_vars(plh)
plh3 = lva.extend_data_cabinet_vars(plh2)

100%|██████████| 500/500 [00:00<00:00, 1103.08it/s]
100%|██████████| 77/77 [00:00<00:00, 11631.12it/s]


In [37]:
# Show the annotated sample frame built in the previous cell
plh3


Unnamed: 0,para_id,cabinet_ents,date,source,text,year,month,day,cabinet,cabinet_no,president
66,61ccaf79ce530efb6e9960f2,"[Iekšlietu, Premjers]",2012-02-20 14:19:00,https://www.delfi.lv/news/national/politics/va...,\n Pensionēšanās vecumu plāno pakāp...,2012,2,20,Valdis Dombrovskis,36,Andris Bērziņš
64,61ccaf72ce530efb6e9960ed,[Prezidents],2012-02-20 21:47:00,https://www.delfi.lv/news/national/politics/pr...,"\n Pieļaujot iespēju, ka salīdzinoš...",2012,2,20,Valdis Dombrovskis,36,Andris Bērziņš
51,61ccaf37ce530efb6e9960ad,[Ekonomikas],2012-02-29 10:25:00,https://www.delfi.lv/news/national/politics/la...,\n Latvijā tiek identificēti 10% lī...,2012,2,29,Valdis Dombrovskis,36,Andris Bērziņš
46,61ccaf27ce530efb6e9960a0,[Prezidents],2012-03-29 14:16:00,https://www.delfi.lv/news/national/politics/sa...,\n Saeima ceturtdien neatbalstīja g...,2012,3,29,Valdis Dombrovskis,36,Andris Bērziņš
13,61cca9c61a3facdb78d73ac1,"[Ekonomikas, Izglītības un zinātnes]",2012-03-31 14:40:00,https://www.delfi.lv/news/national/politics/pa...,"\n Nedrīkst solīt, ka visiem visur ...",2012,3,31,Valdis Dombrovskis,36,Andris Bērziņš
...,...,...,...,...,...,...,...,...,...,...,...
2,61cc72ffecb2b47816d7b34e,[Veselības],2021-10-20 15:00:00,https://www.delfi.lv/news/national/politics/pi...,"""DELFI plus"" redaktore\n Šie ""kukuļi"" paši lie...",2021,10,20,Krišjānis Kariņš,40,Eglis Levits
5,61cc8524ecb2b47816d7b370,[Premjers],2021-12-13 16:53:00,https://www.delfi.lv/news/national/politics/el...,Nacionālo ziņu nodaļas žurnālists\n \n ...,2021,12,13,Krišjānis Kariņš,40,Eglis Levits
12,61cca8831a3facdb78d73ab3,[Ārlietu],2021-12-27 14:44:00,https://www.delfi.lv/news/national/politics/pa...,\n Par ārlietu ministra Edgara Rink...,2021,12,27,Krišjānis Kariņš,40,Eglis Levits
8,61cc8e09ecb2b47816d7b37f,[Premjers],2021-12-29 08:17:00,https://www.delfi.lv/news/national/politics/pi...,\n Ministru prezidents Krišjānis Ka...,2021,12,29,Krišjānis Kariņš,40,Eglis Levits


In [4]:
# Calendar years to process: 1999 up to and including the current year
begin = 1999
end = datetime.now().year + 1  # exclusive upper bound

years = list(range(begin, end))




In [None]:
# Full per-year pipeline: annotate, validate against the document schema and
# store the result, pausing for a minute between years.
# NOTE(review): `mongo_lt_source_col`, `mongo_lt_destination_col` and `LTDoc`
# are not defined in the visible cells (the collections set up at the top are
# `source_col` / `destination_col`) - presumably carried over from an LT
# version of this notebook; confirm the intended names before running.
for y in years:
    print(y)
    query = {"date": {"$gte": datetime(y, 1, 1), "$lt": datetime(y + 1, 1, 1)}}
    # A cursor opened with no_cursor_timeout=True is not cleaned up by the
    # server on its own, so close it deterministically - also when reading
    # fails part-way through.
    with mongo_lt_source_col.find(query, no_cursor_timeout=True) as cursor:
        input_data = list(cursor)
    print(f"Records for this year {len(input_data)}")
    if len(input_data) > 0:

        df = anno.detect_entities(input_data)
        df2 = anno.extend_data_datetime_vars(df)
        df3 = anno.extend_data_cabinet_vars(df2)

        validated_data = [
            LTDoc(**i) for i in df3.to_dict(orient="records")
        ]  # Testing for schema compliance
        validated_data2 = [i.dict() for i in validated_data]

        mongo_lt_destination_col.insert_many(validated_data2)
    # runs after every year, whether or not it had any records
    time.sleep(60)

In [5]:
# Show the list of years that the per-year loops iterate over
years

[1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022]