In [1]:
import pandas as pd
from tqdm import tqdm
from pymongo import MongoClient
import re
import pickle
from datetime import datetime
from utils import LVDoc
import time

In [2]:
# DB Constants
mongo = MongoClient("mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&ssl=false")
db = mongo["delfi_texts"]
source_col = db["lv_news"]  # raw articles, read-only in this notebook

# WARNING: destructive. Every run of this cell discards all previously stored
# annotations so that the notebook can rebuild them from scratch.
# The collection handle stays valid after drop(); MongoDB re-creates the
# collection on the first insert, so it does not need to be looked up again.
destination_col = db["lv_news_annotated"]
destination_col.drop()

In [3]:
# Load cabinet data
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file that comes from a trusted source.
with open("Cabinet_data_NER_LV.pkl", "rb") as pkl_handle:
    dfl = pickle.load(pkl_handle)
print(len(dfl))

255


In [4]:
# Helpers


def resolve_cabinet(cabinet_data, datetime_obj):
    """
    Selects the incumbency records whose term of office covers a given moment

    PARAMS:
        cabinet_data:list - incumbency records; every record must provide "from" and "to"
                            values that can be compared with `datetime_obj`
        datetime_obj:datetime - moment to look up (e.g. date when a news article was published)
    RETURNS:
        list - the matching records themselves (not copies, not just names), in their
               original order; both ends of the term are inclusive
    """
    return [
        record
        for record in cabinet_data
        if record["from"] <= datetime_obj <= record["to"]
    ]


def cabinet_entities(datetime_obj, entities, cabinet_data):
    """
    Matches entities extracted from a text against the office holders
    (as selected by `resolve_cabinet`) for the article's publication date

    PARAMS:
        datetime_obj:datetime - datetime when the article was published
        entities:list - entities extracted from the text (must be hashable)
        cabinet_data:list - incumbency records with "names", "person_id",
                            "person_name" and "ministry" fields

    RETURNS:
        list - one dict per office holder with at least one name variant among
               `entities`, with keys "person_id", "person_name" and "cabinet_ents"
               (the latter carries the record's "ministry" value)
    """
    mentioned = set(entities)
    in_office = resolve_cabinet(cabinet_data, datetime_obj)
    return [
        {
            "person_id": member["person_id"],
            "person_name": member["person_name"],
            "cabinet_ents": member["ministry"],
        }
        for member in in_office
        # a member counts as mentioned when any of its name variants was extracted
        if not mentioned.isdisjoint(member["names"])
    ]

def detect_ents_row(text, dfl, timestamp):
    """
    Finds office holders at `timestamp` whose name variants occur in `text`

    A name variant matches when it is contained in `text` (plain `in` test,
    no tokenisation or word-boundary check).

    PARAMS:
        text:str - text of the article/paragraph
        dfl:list - incumbency records with "names", "person_id", "cabinet_no"
                   and "ministry" fields
        timestamp:datetime - publication date used to select the office holders

    RETURNS:
        list - de-duplicated dicts with keys "person_id", "cabinet_no" and
               "cabinet_ents" (the record's "ministry" value), in order of first match
    """
    mentions = []

    for member in resolve_cabinet(dfl, timestamp):
        if not any(variant in text for variant in member["names"]):
            continue

        mention = {
            "person_id": member["person_id"],
            "cabinet_no": member["cabinet_no"],
            "cabinet_ents": member["ministry"],
        }
        # identical dicts can arise from several records; keep the first only
        if mention not in mentions:
            mentions.append(mention)

    return mentions


def prime_and_prez(cabinet: list):
    """
    Extracts the prime minister, the cabinet number and the president from a cabinet

    PARAMS:
        cabinet:list - incumbency records with "ministry", "person_name" and "cabinet_no" fields
    RETURNS:
        tuple - (prime minister's name, cabinet number as str, president's name);
                a position that is not present in `cabinet` yields "".
                If a position occurs more than once, the last record wins.
    """
    prime_minister = cabinet_number = president = ""

    for member in cabinet:
        role = member["ministry"]

        if role == "Premjers":
            prime_minister, cabinet_number = (
                member["person_name"],
                str(member["cabinet_no"]),
            )

        # both the feminine and the masculine form of the title are used in the data
        if role in ("Prezidente", "Prezidents"):
            president = member["person_name"]

    return prime_minister, cabinet_number, president

In [5]:
class LVAnnotator:
    """
    Annotates LV news texts to detect mentions of cabinet members and president
    """

    # Column layout of the frame returned by `detect_entities`.
    _ENTITY_COLUMNS = ["para_id", "cabinet_ents", "date", "source", "text"]

    def __init__(self, cabinet_data=None):
        """
        PARAMS:
            cabinet_data:list - incumbency records; when omitted, the notebook-level
                                `dfl` is used, so `LVAnnotator()` works as before
        """
        self.cabinet_data = dfl if cabinet_data is None else cabinet_data

    def detect_entities(self, input_data):
        """
        Detects cabinet mentions in every document and aggregates them per document

        PARAMS:
            input_data: list [{"_id":IDObject, "text":str, "date":datetime, "source":str},
                              {"_id":IDObject, "text":str, "date":datetime, "source":str}]
        RETURNS:
            annotate_data: pandas.DataFrame with columns para_id, cabinet_ents, date,
                           source, text - one row per document that mentions at least
                           one cabinet member; `cabinet_ents` is the list of matched
                           positions. Documents without mentions are left out; if no
                           document has a mention, the frame is empty but still carries
                           these columns.
        """

        annotated_raw = []

        for row in tqdm(input_data):
            cab_ents = detect_ents_row(row["text"], self.cabinet_data, row["date"])
            # one raw record per (document, mention) pair
            for mention in cab_ents:
                annotated_raw.append(
                    {
                        "para_id": str(row["_id"]),
                        "date": row["date"],
                        "text": row["text"],
                        "source": row["source"],
                        **mention,
                    }
                )

        # Without any mention the raw frame would have no columns at all and
        # groupby("para_id") would raise KeyError.
        if not annotated_raw:
            return pd.DataFrame(columns=self._ENTITY_COLUMNS)

        grouped = (
            pd.DataFrame(annotated_raw)
            .groupby(["para_id"])
            .agg(
                {
                    "cabinet_ents": lambda x: list(x),
                    "date": "first",
                    "text": "first",
                    "source": "first",
                }
            )
        )

        # para_id is the group index; turn it back into a regular column
        return grouped.reset_index().reindex(columns=self._ENTITY_COLUMNS)

    def extend_data_datetime_vars(self, interim_data):

        """
        Adds year, month, day vars to the dataframe and sorts it by date

        The new columns are also written to the frame that was passed in;
        the returned frame is a date-sorted copy of it.

        PARAMS:
            interim_data:DataFrame - a dataframe with `date` column
        RETURNS:
            interim_data:DataFrame
        """

        dates = interim_data["date"]
        # Per-value extraction instead of a row-wise DataFrame.apply: same values
        # for non-empty input, and it also works when the frame has no rows.
        interim_data["year"] = [d.year for d in dates]
        interim_data["month"] = [d.month for d in dates]
        interim_data["day"] = [d.day for d in dates]

        return interim_data.sort_values(by=["date"])

    def extend_data_cabinet_vars(self, interim_data):

        """
        Adds cabinet vars: cabinet, cabinet_no, president

        `cabinet` holds the name of the prime minister in office on the row's
        date, `cabinet_no` the cabinet number as a string and `president` the
        president's name; a value that cannot be resolved is "".
        The columns are added to the frame that was passed in.

        PARAMS:
            interim_data:DataFrame - a dataframe with `date` column
        RETURNS:
            interim_data:DataFrame
        """

        pm = []
        cabinet_no = []
        prezident = []

        for published in tqdm(interim_data["date"]):
            cabinet = resolve_cabinet(self.cabinet_data, published)
            prime_minister, number, president = prime_and_prez(cabinet)
            pm.append(prime_minister)
            cabinet_no.append(number)
            prezident.append(president)

        interim_data["cabinet"] = pm
        interim_data["cabinet_no"] = cabinet_no
        interim_data["president"] = prezident

        return interim_data

In [6]:
# Years to process: from `begin` up to and including the current calendar year
# (`end` is exclusive).
begin = 1999
end = datetime.now().year + 1

years = list(range(begin, end))

# Annotator backed by the cabinet data loaded above
anno = LVAnnotator()

In [7]:
# Seconds to wait after each year's batch.
# NOTE(review): the reason for the pause is not documented - confirm it is still needed.
PAUSE_BETWEEN_YEARS_S = 60

# Annotate the corpus one calendar year at a time and store the result.
for y in years:
    print(y)
    # half-open interval [Jan 1 of y, Jan 1 of y + 1)
    query = {"date": {"$gte": datetime(y, 1, 1), "$lt": datetime(y + 1, 1, 1)}}
    cursor = source_col.find(query, no_cursor_timeout=True)
    try:
        input_data = list(cursor)
    finally:
        # cursors opened with no_cursor_timeout must be closed explicitly
        cursor.close()
    print(f"Records for this year {len(input_data)}")
    if len(input_data) > 0:

        df = anno.detect_entities(input_data)
        df2 = anno.extend_data_datetime_vars(df)
        df3 = anno.extend_data_cabinet_vars(df2)

        validated_data = [
            LVDoc(**i) for i in df3.to_dict(orient="records")
        ]  # Testing for schema compliance
        validated_data2 = [i.dict() for i in validated_data]
        # insert_many rejects an empty document list
        if validated_data2:
            destination_col.insert_many(validated_data2)
    time.sleep(PAUSE_BETWEEN_YEARS_S)

  return Cursor(self, *args, **kwargs)
 48%|████▊     | 158/326 [00:00<00:00, 1572.23it/s]

1999
Records for this year 326


100%|██████████| 326/326 [00:00<00:00, 1584.76it/s]
100%|██████████| 51/51 [00:00<00:00, 18268.81it/s]
  0%|          | 0/2901 [00:00<?, ?it/s]

2000
Records for this year 2901


100%|██████████| 2901/2901 [00:01<00:00, 1762.96it/s]
100%|██████████| 388/388 [00:00<00:00, 18408.14it/s]
  0%|          | 0/3390 [00:00<?, ?it/s]

2001
Records for this year 3390


100%|██████████| 3390/3390 [00:01<00:00, 1697.04it/s]
100%|██████████| 337/337 [00:00<00:00, 18201.12it/s]
  0%|          | 0/3977 [00:00<?, ?it/s]

2002
Records for this year 3977


100%|██████████| 3977/3977 [00:02<00:00, 1467.07it/s]
100%|██████████| 541/541 [00:00<00:00, 17869.19it/s]
  0%|          | 0/4182 [00:00<?, ?it/s]

2003
Records for this year 4182


100%|██████████| 4182/4182 [00:03<00:00, 1245.77it/s]
100%|██████████| 783/783 [00:00<00:00, 14156.45it/s]
  0%|          | 0/3284 [00:00<?, ?it/s]

2004
Records for this year 3284


100%|██████████| 3284/3284 [00:02<00:00, 1208.21it/s]
100%|██████████| 775/775 [00:00<00:00, 17069.71it/s]
  0%|          | 0/4392 [00:00<?, ?it/s]

2005
Records for this year 4392


100%|██████████| 4392/4392 [00:03<00:00, 1242.06it/s]
100%|██████████| 1000/1000 [00:00<00:00, 16680.54it/s]
  0%|          | 0/6320 [00:00<?, ?it/s]

2006
Records for this year 6320


100%|██████████| 6320/6320 [00:05<00:00, 1261.76it/s]
100%|██████████| 1118/1118 [00:00<00:00, 15566.38it/s]
  0%|          | 0/7373 [00:00<?, ?it/s]

2007
Records for this year 7373


100%|██████████| 7373/7373 [00:06<00:00, 1205.15it/s]
100%|██████████| 1662/1662 [00:00<00:00, 14811.73it/s]


2008


  1%|▏         | 103/7596 [00:00<00:07, 1022.99it/s]

Records for this year 7596


100%|██████████| 7596/7596 [00:06<00:00, 1197.77it/s]
100%|██████████| 1765/1765 [00:00<00:00, 14948.20it/s]


2009


  1%|          | 106/9090 [00:00<00:08, 1057.08it/s]

Records for this year 9090


100%|██████████| 9090/9090 [00:07<00:00, 1172.46it/s]
100%|██████████| 2587/2587 [00:00<00:00, 15188.31it/s]


2010


  1%|▏         | 100/7798 [00:00<00:07, 989.01it/s]

Records for this year 7798


100%|██████████| 7798/7798 [00:06<00:00, 1152.49it/s]
100%|██████████| 1894/1894 [00:00<00:00, 15491.65it/s]


2011


  1%|          | 96/9658 [00:00<00:10, 952.18it/s]

Records for this year 9658


100%|██████████| 9658/9658 [00:08<00:00, 1081.56it/s]
100%|██████████| 2663/2663 [00:00<00:00, 15239.22it/s]


2012


  1%|▏         | 107/7282 [00:00<00:06, 1069.21it/s]

Records for this year 7282


100%|██████████| 7282/7282 [00:06<00:00, 1067.33it/s]
100%|██████████| 1811/1811 [00:00<00:00, 14924.71it/s]


2013


  1%|▏         | 96/6852 [00:00<00:07, 958.62it/s]

Records for this year 6852


100%|██████████| 6852/6852 [00:06<00:00, 1076.85it/s]
100%|██████████| 1555/1555 [00:00<00:00, 14249.79it/s]


2014


100%|██████████| 6527/6527 [00:05<00:00, 1116.47it/s]
100%|██████████| 1192/1192 [00:00<00:00, 10548.72it/s]


Records for this year 6527
2015


  1%|▏         | 123/8549 [00:00<00:06, 1220.92it/s]

Records for this year 8549


100%|██████████| 8549/8549 [00:07<00:00, 1144.59it/s]
100%|██████████| 1464/1464 [00:00<00:00, 12434.77it/s]


2016


  2%|▏         | 118/6961 [00:00<00:05, 1177.17it/s]

Records for this year 6961


100%|██████████| 6961/6961 [00:05<00:00, 1171.60it/s]
100%|██████████| 1341/1341 [00:00<00:00, 14114.05it/s]
  0%|          | 0/5294 [00:00<?, ?it/s]

2017
Records for this year 5294


100%|██████████| 5294/5294 [00:04<00:00, 1115.61it/s]
100%|██████████| 865/865 [00:00<00:00, 13111.75it/s]


2018


  2%|▏         | 105/5649 [00:00<00:05, 1049.19it/s]

Records for this year 5649


100%|██████████| 5649/5649 [00:05<00:00, 1078.62it/s]
100%|██████████| 956/956 [00:00<00:00, 13557.92it/s]
  0%|          | 0/5101 [00:00<?, ?it/s]

2019
Records for this year 5101


100%|██████████| 5101/5101 [00:05<00:00, 997.25it/s] 
100%|██████████| 1054/1054 [00:00<00:00, 13260.41it/s]


2020


  1%|▏         | 97/6627 [00:00<00:06, 969.39it/s]

Records for this year 6627


100%|██████████| 6627/6627 [00:06<00:00, 975.38it/s] 
100%|██████████| 1084/1084 [00:00<00:00, 13537.63it/s]


2021


  1%|▏         | 108/8139 [00:00<00:07, 1074.53it/s]

Records for this year 8139


100%|██████████| 8139/8139 [00:08<00:00, 990.63it/s] 
100%|██████████| 1669/1669 [00:00<00:00, 13699.34it/s]
  0%|          | 0/1272 [00:00<?, ?it/s]

2022
Records for this year 1272


100%|██████████| 1272/1272 [00:01<00:00, 1043.75it/s]
100%|██████████| 309/309 [00:00<00:00, 13396.45it/s]
