In [None]:
import pandas as pd
from tqdm import tqdm
from pymongo import MongoClient
import re
import pickle
from datetime import datetime

# Prep cabinet entities

In [None]:
# Load the pickled cabinet data into `dfl`; the cells below pass it to the
# helpers as `cabinet_data`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open("Cabinet_data_NER_LV.pkl", "rb") as file:
    dfl = pickle.load(file)
print(len(dfl))

In [None]:
# Display one entry of the loaded data (index 70) to check its structure.
dfl[70]

In [None]:


def resolve_cabinet(cabinet_data, datetime_obj):
    """
    Select the cabinet rows whose term of office covers a given moment.

    PARAMS:
        cabinet_data:list - rows (dict-like) carrying at least `from` and `to` bounds
        datetime_obj:datetime - the moment to look up, e.g. an article's publication date
    RETURNS:
        list - the rows with `from` <= datetime_obj <= `to` (both bounds inclusive),
               in the order they appear in `cabinet_data`
    """
    return [
        member
        for member in cabinet_data
        if member["from"] <= datetime_obj <= member["to"]
    ]


def cabinet_entities(datetime_obj, entities, cabinet_data):
    """
    Find which office holders of the day are named among an article's entities.

    PARAMS:
        datetime_obj:datetime - datetime when the article was published
        entities:list - entities extracted from the article text
        cabinet_data:list - cabinet members, their positions and incumbency duration

    RETURNS:
        list - one dict (`person_id`, `person_name`, `cabinet_ents`) for every row in
               office at `datetime_obj` that has at least one of its `names` among
               `entities`; `cabinet_ents` carries the row's `ministry` value
    """
    # Set membership: a name counts only when it equals an entity exactly.
    mentioned = set(entities)
    in_office = resolve_cabinet(cabinet_data, datetime_obj)
    return [
        {
            "person_id": member["person_id"],
            "person_name": member["person_name"],
            "cabinet_ents": member["ministry"],
        }
        for member in in_office
        if any(alias in mentioned for alias in member["names"])
    ]


In [None]:
#resolve_cabinet(dfl, datetime.now())

# Load data from MongoDB

In [None]:
# Connect to the MongoDB server on localhost and select the `lv_news`
# collection of the `delfi_texts` database.
mongo = MongoClient("mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&ssl=false")
db = mongo["delfi_texts"]
col = db["lv_news"]

In [None]:
# Fetch at most 50,000 documents (empty filter, `_id` field excluded) and put
# them into a DataFrame.
query = {}
cursor = col.find(query, {"_id": 0}).limit(50000)
data = list(cursor)

df = pd.DataFrame(data)
print(df.shape)
df.head()

# Detect ents

In [None]:
def detect_ents(text, timestamp):
    """
    Find the office holders of the day whose names occur in an article text.

    Looks up, in the notebook-level `dfl` data, the cabinet rows in office at
    `timestamp` and reports every row that has at least one of its `names`
    occurring as a substring of `text`.

    PARAMS:
        text:str - article text; any non-string value (e.g. None, or NaN where
                   a document has no text) is treated as "no text"
        timestamp:datetime - datetime when the article was published
    RETURNS:
        list - unique dicts with `person_id`, `cabinet_no` and `ministry`,
               in the order the matching rows appear in the cabinet data
    """
    # A missing text cannot mention anybody; without this guard the substring
    # test below raises TypeError on None / NaN.
    if not isinstance(text, str):
        return []

    cabinet = resolve_cabinet(dfl, timestamp)

    plh = []

    for item in cabinet:
        # The record depends only on the row, so one matching name is enough;
        # any() stops scanning the remaining names after the first hit.
        if any(entry in text for entry in item["names"]):
            d = {"person_id" : item["person_id"],
                 "cabinet_no" : item["cabinet_no"],
                 "ministry" : item["ministry"],
                 }
            # Dicts are unhashable, hence list membership for de-duplication.
            if d not in plh:
                plh.append(d)
    return plh


def extend_data_datetime_vars(interim_data):

    """
    Adds year, month, day vars to the dataframe

    The three columns are written onto the frame that was passed in; the
    returned frame is a copy of it sorted by `date` in ascending order.

    PARAMS:
        interim_data:DataFrame - a dataframe with a `date` column of datetimes
    RETURNS:
        interim_data:DataFrame - the dataframe with `year`, `month` and `day`
                                 columns added, sorted by `date`
    """

    # Vectorised `.dt` access instead of three row-wise apply() passes.
    # to_datetime returns a datetime64 column as is and converts an object
    # column of datetimes, so `.dt` is available in both cases.
    dates = pd.to_datetime(interim_data["date"])
    interim_data["year"] = dates.dt.year
    interim_data["month"] = dates.dt.month
    interim_data["day"] = dates.dt.day
    interim_data = interim_data.sort_values(by=["date"])

    return interim_data


def prime_and_prez(cabinet: list):
    """
    Pick the prime minister, the cabinet number and the president out of cabinet rows.

    PARAMS:
        cabinet:list - rows with a `ministry` key; rows whose ministry is "Premjers"
                       also need `person_name` and `cabinet_no`, rows whose ministry
                       is "Prezidente" or "Prezidents" also need `person_name`
    RETURNS:
        tuple - (prime minister name, cabinet number as str, president name); a slot
                stays "" when no matching row exists, and the last matching row wins
                when there are several
    """
    presidential_titles = ("Prezidente", "Prezidents")
    prime_minister, number, head_of_state = "", "", ""

    for member in cabinet:
        if member["ministry"] == "Premjers":
            prime_minister = member["person_name"]
            # The cabinet number is taken from the prime minister's row only.
            number = str(member["cabinet_no"])

        if member["ministry"] in presidential_titles:
            head_of_state = member["person_name"]

    return (prime_minister, number, head_of_state)

def extend_data_cabinet_vars(interim_data):

    """
    Adds cabinet vars: cabinet, cabinet_no, president

    For every value of the `date` column the rows in office at that moment are
    looked up in the notebook-level `dfl` data and reduced with `prime_and_prez`.
    The columns are written onto the frame that was passed in.

    PARAMS:
        interim_data:DataFrame - a dataframe with `date` column
    RETURNS:
        interim_data:DataFrame - the same dataframe with `cabinet` (prime minister
                                 name), `cabinet_no` and `president` columns
    """

    summaries = [
        prime_and_prez(resolve_cabinet(dfl, published))
        for published in tqdm(interim_data.date)
    ]

    # Each summary is a (prime minister, cabinet number, president) tuple.
    interim_data["cabinet"] = [summary[0] for summary in summaries]
    interim_data["cabinet_no"] = [summary[1] for summary in summaries]
    interim_data["president"] = [summary[2] for summary in summaries]

    return interim_data

In [None]:
# Detect cabinet mentions for every article. Iterating the two columns directly
# avoids building a Series per row (as iterrows does), and passing `total`
# lets tqdm show a percentage and ETA, which it cannot infer from an iterator.
entities = [
    detect_ents(text, published)
    for text, published in tqdm(zip(df["text"], df["date"]), total=len(df))
]

df["entities_full"] = entities
# Keep only the ministry label of every detected mention.
df["entities"] = [[ent["ministry"] for ent in ents] for ents in entities]


# Add year/month/day (this also sorts by date), then the cabinet-level columns.
df = extend_data_datetime_vars(df)
df = extend_data_cabinet_vars(df)

df.head()


In [None]:
# Show the last rows of the extended frame.
df.tail()

In [None]:
# Flag articles with at least one detected mention, then show their share.
# A column-wise map replaces the row-wise apply: it does not build a Series per
# row and still yields a boolean column when the frame is empty.
df["has_entities"] = df["entities"].map(len) > 0
df["has_entities"].mean()

In [None]:
# Keep only the articles that mention at least one cabinet member. The explicit
# copy makes df2 an independent frame, so the indicator columns assigned to it
# further down do not raise SettingWithCopyWarning.
df2 = df[df["has_entities"]==1].copy()
print(df2.shape)
df2.head()

In [None]:
# Number of rows in df2 per value of the `year` column.
df2.year.value_counts()

In [None]:
# Count how often each ministry label occurs across the entity lists of df2,
# then order the labels from most to least frequent.
ministries_i = {}
for item in df2["entities"]:
    for i in item:
        ministries_i[i] = ministries_i.get(i, 0) + 1

s_min = sorted(ministries_i.items(), key=lambda pair: pair[1], reverse=True)

s_min

In [None]:
# Every ministry label except the two presidential ones. discard() is used for
# both so the cell also runs when either label is absent from the data
# (remove() raises KeyError in that case).
ministries = set(ministries_i.keys())
ministries.discard("Prezidents")
ministries.discard("Prezidente")

presidential = ("Prezidents", "Prezidente")

# Sorted iteration keeps the column and print order the same between runs;
# plain set iteration order for strings can change from one kernel to the next.
for item in sorted(ministries, key=str):
    plh1 = []
    plh2 = []

    for row in df2["entities"]:
        mentioned = item in row
        # 1 when the ministry is mentioned in the article, else 0.
        plh1.append(int(mentioned))
        # 1 when the ministry and a president label are mentioned together.
        plh2.append(int(mentioned and any(i in row for i in presidential)))
    df2[item] = plh1
    df2["Prezident_"+item] = plh2
    print("Prezident_"+item, sum(plh2))

print(df2.shape)
df2.head()

In [None]:
# Number of articles whose entity list has both "Premjers" and a president label.
df2["Prezident_Premjers"].sum()

In [None]:
# Articles whose entity list has both "Aizsardzības" and a president label.
plh = df2[df2["Prezident_Aizsardzības"]==1]
plh.head()

In [None]:
# Show the `source` value of one row of that subset (position 50); this raises
# IndexError when the subset has fewer than 51 rows.
list(plh.source)[50]

In [None]:
# Sanity check: resolve the rows in office at the current time and reduce them
# to (prime minister, cabinet number, president).
cab = resolve_cabinet(dfl, datetime.now())

prime_and_prez(cab)

In [None]:
# Print the detected mentions of every article that has any, each followed by
# an empty line.
for index, row in df.iterrows():
    res = detect_ents(row["text"], row["date"])
    if res:
        print(res, end="\n\n")

In [None]:
# Run the detection on a single article (position 10 of the frame) as a spot check.
text = list(df.text)[10]
timestamp = list(df.date)[10]

detect_ents(text, timestamp)

In [None]:
# Display the rows resolved for the current time in the earlier cell.
cab

In [None]:
# P