In [1]:
import pandas as pd
from tqdm import tqdm
from pymongo import MongoClient
import re
import pickle
from datetime import datetime

# Prep cabinet entities

In [14]:
# Load the pre-built cabinet records into `dfl`; every later cell that resolves
# a cabinet reads from this object.
# NOTE(review): pickle.load can execute arbitrary code stored in the file - only
# load this pickle if it comes from a trusted source.
with open("Cabinet_data_NER_LV.pkl", "rb") as file:
    dfl = pickle.load(file)
print(len(dfl))

255


In [15]:
dfl[70]

{'cabinet_no': 31,
 'person_id': 'k_karins',
 'ministry': 'Ekonomikas',
 'from': Timestamp('2004-12-02 00:00:00'),
 'to': Timestamp('2006-04-07 00:00:00'),
 'names': ['Krišjānis Kariņš',
  'K. Kariņš',
  'Krišjānja Kariņa',
  'K. Kariņa',
  'Krišjānim Kariņam',
  'K. Kariņam',
  'Krišjāni Kariņu',
  'K. Kariņu',
  'Krišjānī Kariņā',
  'K. Kariņā'],
 'person_name': 'Krišjānis Kariņš'}

In [16]:


def resolve_cabinet(cabinet_data, datetime_obj):
    """
    Selects the cabinet records whose term of office covers a given moment.

    PARAMS:
        cabinet_data:list - cabinet records; each one needs `from` and `to` keys
        datetime_obj:datetime - moment of interest, e.g. when an article was published
    RETURNS:
        list - the records for which `from` <= datetime_obj <= `to`
               (both bounds inclusive), in their original order
    """

    return [
        member
        for member in cabinet_data
        if member["from"] <= datetime_obj <= member["to"]
    ]


def cabinet_entities(datetime_obj, entities, cabinet_data):
    """
    Finds which office holders of a given moment appear among extracted entities.

    PARAMS:
        datetime_obj:datetime - datetime when the article was published
        entities:list - entities extracted from the article text
        cabinet_data:list - cabinet records with their name forms and term of office

    RETURNS:
        rel_mentions:list - one dict (`person_id`, `person_name`, `cabinet_ents`)
                            per in-office record that has at least one of its
                            name forms among the entities
    """
    mentioned = set(entities)
    in_office = resolve_cabinet(cabinet_data, datetime_obj)
    return [
        {
            "person_id": member["person_id"],
            "person_name": member["person_name"],
            "cabinet_ents": member["ministry"],
        }
        for member in in_office
        if any(form in mentioned for form in member["names"])
    ]


In [17]:
#resolve_cabinet(dfl, datetime.now())

# Load data from MongoDB

In [18]:
# Connect to MongoDB and pick the database / collection to read from.
# NOTE(review): the connection string is hard-coded to localhost:27017 - adjust
# it when running against a different server.
mongo = MongoClient("mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&ssl=false")
db = mongo["delfi_texts"]
col = db["lv_news"]

In [19]:
# Fetch at most 5000 documents (Mongo's `_id` field excluded) into a DataFrame.
query = {}
cursor = col.find(query, {"_id": 0}).limit(5000)
data = list(cursor)

df = pd.DataFrame(data)
print(df.shape)
df.head()

(5000, 3)


Unnamed: 0,source,text,date
0,https://www.delfi.lv/news/national/politics/pa...,DELFI žurnālists\n 1933. gadā vīrs vārdā Freds...,2021-07-22 00:02:00
1,https://www.delfi.lv/news/national/politics/ra...,rus.delfi.lv žurnāliste\n Olga Sukonnikova lat...,2015-07-21 04:16:00
2,https://www.delfi.lv/news/national/politics/ka...,"DELFI žurnālists\n ""Paliec sveiks, mans mazais...",2021-07-18 00:00:00
3,https://www.delfi.lv/news/national/politics/bl...,Nacionālo ziņu nodaļas žurnāliste\n Mēdz teikt...,2021-03-13 09:16:00
4,https://www.delfi.lv/news/national/politics/da...,"""DELFI plus"" žurnāliste\n No bildēm pretim ver...",2021-10-10 00:00:00


# Detect ents

In [27]:
def detect_ents(text, timestamp):
    """
    Finds the office holders of a given moment whose names occur in a text.

    Reads the cabinet records from the notebook-level `dfl`.

    PARAMS:
        text:str - article text, searched by plain substring matching
        timestamp:datetime - moment used to pick the records in office
    RETURNS:
        list - dicts with `person_id`, `cabinet_no` and `ministry`, one per
               matching record, without duplicates
    """
    found = []

    for member in resolve_cabinet(dfl, timestamp):
        # A record matches as soon as any one of its name forms is in the text.
        if not any(form in text for form in member["names"]):
            continue
        hit = {
            "person_id": member["person_id"],
            "cabinet_no": member["cabinet_no"],
            "ministry": member["ministry"],
        }
        if hit not in found:
            found.append(hit)
    return found


def extend_data_datetime_vars(interim_data):

    """
    Adds year, month and day columns derived from `date` and sorts by `date`.

    PARAMS:
        interim_data:DataFrame - a dataframe with a `date` column
    RETURNS:
        interim_data:DataFrame - a copy sorted by `date` (ascending) that carries
                                 the `year`, `month` and `day` columns; the three
                                 columns are also added to the frame passed in
    """

    # Vectorised `.dt` access instead of three row-wise `apply` passes: faster,
    # and it also works on an empty frame. Missing dates (NaT) yield NaN in all
    # three columns.
    dates = pd.to_datetime(interim_data["date"])
    interim_data["year"] = dates.dt.year
    interim_data["month"] = dates.dt.month
    interim_data["day"] = dates.dt.day

    return interim_data.sort_values(by=["date"])


def prime_and_prez(cabinet: list):
    """
    Extracts the prime minister, cabinet number and president from cabinet records.

    PARAMS:
        cabinet:list - cabinet records with `ministry`, `person_name` and
                       `cabinet_no` keys
    RETURNS:
        tuple - (prime minister name, cabinet number as str, president name);
                a slot stays "" when no matching record exists, and the last
                matching record wins when there are several
    """
    premier, number, president = "", "", ""

    for member in cabinet:
        role = member["ministry"]
        if role == "Premjers":
            # The cabinet number is taken from the prime minister's record.
            premier, number = member["person_name"], str(member["cabinet_no"])
        elif role in ("Prezidente", "Prezidents"):
            president = member["person_name"]

    return (premier, number, president)

def extend_data_cabinet_vars(interim_data):

    """
    Adds cabinet columns: `cabinet` (prime minister), `cabinet_no`, `president`.

    For every value of the `date` column the cabinet in office is resolved from
    the notebook-level `dfl` and reduced with `prime_and_prez`.

    PARAMS:
        interim_data:DataFrame - a dataframe with a `date` column
    RETURNS:
        interim_data:DataFrame - the same frame with the three columns added
    """

    # One (prime minister, cabinet number, president) tuple per row, in row order.
    summaries = [
        prime_and_prez(resolve_cabinet(dfl, published))
        for published in tqdm(interim_data.date)
    ]

    interim_data["cabinet"] = [summary[0] for summary in summaries]
    interim_data["cabinet_no"] = [summary[1] for summary in summaries]
    interim_data["president"] = [summary[2] for summary in summaries]

    return interim_data

In [28]:
# Detect office-holder mentions for every article. Iterating the two needed
# columns directly avoids building a Series per row (as `iterrows` does), and
# passing `total` lets tqdm draw a real progress bar instead of a bare counter.
entities = [
    detect_ents(text, published)
    for text, published in tqdm(zip(df["text"], df["date"]), total=len(df))
]

# `entities` is in the current row order, so attach it before the frame is
# re-sorted by date in extend_data_datetime_vars.
df["entities"] = entities
df = extend_data_datetime_vars(df)
df = extend_data_cabinet_vars(df)

df.head()


5000it [00:02, 1702.59it/s]
100%|██████████| 5000/5000 [00:00<00:00, 20271.86it/s]


Unnamed: 0,source,text,date,entities,year,month,day,cabinet,cabinet_no,president
1715,https://www.delfi.lv/news/national/politics/la...,\n Lauku atbalsta dienesta (LAD) di...,2006-01-03 00:26:00,[],2006.0,1.0,3.0,Aigars Kalvītis,31,Vaira Viķe Freiberga
2961,https://www.delfi.lv/news/national/politics/pr...,\n Pret Ventspils mēru Aivaru Lembe...,2006-08-30 10:10:00,[],2006.0,8.0,30.0,Aigars Kalvītis,31,Vaira Viķe Freiberga
4081,https://www.delfi.lv/news/national/politics/sa...,\n Kopumā 9. Saeimā ievēlēti 19 daž...,2006-10-17 10:13:00,[],2006.0,10.0,17.0,Aigars Kalvītis,31,Vaira Viķe Freiberga
3022,https://www.delfi.lv/news/national/politics/en...,\n Satversmes tiesas (ST) tiesnesis...,2007-01-31 12:48:00,[],2007.0,1.0,31.0,Aigars Kalvītis,32,Vaira Viķe Freiberga
4848,https://www.delfi.lv/news/national/politics/ve...,\n Jau veiktie un tuvākajā nākotnē ...,2009-10-09 19:22:00,"[{'person_id': 'r_vejonis', 'cabinet_no': 34, ...",2009.0,10.0,9.0,Valdis Dombrovskis,34,Valdis Zatlers


In [29]:
df.head()

Unnamed: 0,source,text,date,entities,year,month,day,cabinet,cabinet_no,president
1715,https://www.delfi.lv/news/national/politics/la...,\n Lauku atbalsta dienesta (LAD) di...,2006-01-03 00:26:00,[],2006.0,1.0,3.0,Aigars Kalvītis,31,Vaira Viķe Freiberga
2961,https://www.delfi.lv/news/national/politics/pr...,\n Pret Ventspils mēru Aivaru Lembe...,2006-08-30 10:10:00,[],2006.0,8.0,30.0,Aigars Kalvītis,31,Vaira Viķe Freiberga
4081,https://www.delfi.lv/news/national/politics/sa...,\n Kopumā 9. Saeimā ievēlēti 19 daž...,2006-10-17 10:13:00,[],2006.0,10.0,17.0,Aigars Kalvītis,31,Vaira Viķe Freiberga
3022,https://www.delfi.lv/news/national/politics/en...,\n Satversmes tiesas (ST) tiesnesis...,2007-01-31 12:48:00,[],2007.0,1.0,31.0,Aigars Kalvītis,32,Vaira Viķe Freiberga
4848,https://www.delfi.lv/news/national/politics/ve...,\n Jau veiktie un tuvākajā nākotnē ...,2009-10-09 19:22:00,"[{'person_id': 'r_vejonis', 'cabinet_no': 34, ...",2009.0,10.0,9.0,Valdis Dombrovskis,34,Valdis Zatlers


In [31]:
df.tail(100)

Unnamed: 0,source,text,date,entities,year,month,day,cabinet,cabinet_no,president
3179,https://www.delfi.lv/news/national/politics/ri...,Nacionālo ziņu nodaļas žurnāliste\n Fotoredakt...,2021-12-03 12:52:00,"[{'person_id': 'e_rinkevics', 'cabinet_no': 40...",2021.0,12.0,3.0,Krišjānis Kariņš,40,Eglis Levits
4092,https://www.delfi.lv/news/national/politics/bv...,\n Būvniecības valsts kontroles bir...,2021-12-03 18:38:00,[],2021.0,12.0,3.0,Krišjānis Kariņš,40,Eglis Levits
3595,https://www.delfi.lv/news/national/politics/se...,\n Sestdienas rītā visā Latvijā sni...,2021-12-04 09:11:00,[],2021.0,12.0,4.0,Krišjānis Kariņš,40,Eglis Levits
4113,https://www.delfi.lv/news/national/politics/so...,\n Šonedēļ vakcīnas pret Covid-19 t...,2021-12-04 10:03:00,[],2021.0,12.0,4.0,Krišjānis Kariņš,40,Eglis Levits
4286,https://www.delfi.lv/news/national/politics/vi...,"\n Starptautisko skolu tīkls ""Unite...",2021-12-04 13:57:00,[],2021.0,12.0,4.0,Krišjānis Kariņš,40,Eglis Levits
...,...,...,...,...,...,...,...,...,...,...
158,https://www.delfi.lv/news/national/politics/re...,\n Ekspremjera Einara Repšes jauno ...,NaT,[],,,,,,
159,https://www.delfi.lv/news/national/politics/ta...,\n Tautas skaitīšana Latvijā būs iz...,NaT,[],,,,,,
161,https://www.delfi.lv/news/national/politics/no...,"\n Nokrītot pa kāpnēm, traumas guvi...",NaT,[],,,,,,
162,https://www.delfi.lv/news/national/politics/st...,"\n Rīgā, Strēlnieku ielā 8, kur pat...",NaT,[],,,,,,


In [33]:
# Flag articles with at least one detected office holder, then show the share
# of such articles. Working on the single `entities` column avoids a row-wise
# `apply` over the whole frame.
df["has_entities"] = df["entities"].map(len) > 0
df["has_entities"].mean()

0.2112

In [34]:
# Keep only the articles that mention at least one office holder.
df2 = df.loc[df["has_entities"]]
print(df2.shape)
df2.head()

(1056, 11)


Unnamed: 0,source,text,date,entities,year,month,day,cabinet,cabinet_no,president,has_entities
4848,https://www.delfi.lv/news/national/politics/ve...,\n Jau veiktie un tuvākajā nākotnē ...,2009-10-09 19:22:00,"[{'person_id': 'r_vejonis', 'cabinet_no': 34, ...",2009.0,10.0,9.0,Valdis Dombrovskis,34,Valdis Zatlers,True
4163,https://www.delfi.lv/news/national/politics/le...,\n Simtiem leģionāru piemiņas gājie...,2011-03-16 12:15:00,"[{'person_id': 'l_murniece', 'cabinet_no': 35,...",2011.0,3.0,16.0,Valdis Dombrovskis,35,Valdis Zatlers,True
3366,https://www.delfi.lv/news/national/politics/pi...,\n Leģionāru atbalstītāju rindas šo...,2011-03-16 12:50:00,"[{'person_id': 'l_murniece', 'cabinet_no': 35,...",2011.0,3.0,16.0,Valdis Dombrovskis,35,Valdis Zatlers,True
4696,https://www.delfi.lv/news/national/politics/sa...,\n Saeima ceturtdien ārkārtas sēdē ...,2011-05-26 14:33:00,"[{'person_id': 'v_zatlers', 'cabinet_no': 7, '...",2011.0,5.0,26.0,Valdis Dombrovskis,35,Valdis Zatlers,True
3895,https://www.delfi.lv/news/national/politics/sa...,\n Divdesmit gadus pēc neatkarības ...,2011-06-09 05:46:00,"[{'person_id': 'a_pabriks', 'cabinet_no': 35, ...",2011.0,6.0,9.0,Valdis Dombrovskis,35,Valdis Zatlers,True


In [21]:
# Spot check: resolve the records in office at the current time and reduce them
# to (prime minister, cabinet number, president).
cab = resolve_cabinet(dfl, datetime.now())

prime_and_prez(cab)

('Krišjānis Kariņš', '40', 'Eglis Levits')

In [None]:
# Print the detected office holders for every article that has any, each
# followed by a blank line.
for index, row in df.iterrows():
    res = detect_ents(row["text"], row["date"])
    if res:
        print(res, end="\n\n")

In [None]:
# Run the detector on a single article (the 11th row in the current order).
text = df["text"].iloc[10]
timestamp = df["date"].iloc[10]

detect_ents(text, timestamp)

In [None]:
cab

In [None]:
# P