In [1]:
import pandas as pd
from tqdm import tqdm
from pymongo import MongoClient
import re
import pickle
from datetime import datetime

# Prep cabinet entities

In [2]:
with open("Cabinet_data_NER_LV.pkl", "rb") as file:
    dfl = pickle.load(file)
print(len(dfl))

255


In [3]:
dfl[70]

{'cabinet_no': 31,
 'person_id': 'k_karins',
 'ministry': 'Ekonomikas',
 'from': Timestamp('2004-12-02 00:00:00'),
 'to': Timestamp('2006-04-07 00:00:00'),
 'names': ['Krišjānis Kariņš',
  'K. Kariņš',
  'Krišjānja Kariņa',
  'K. Kariņa',
  'Krišjānim Kariņam',
  'K. Kariņam',
  'Krišjāni Kariņu',
  'K. Kariņu',
  'Krišjānī Kariņā',
  'K. Kariņā'],
 'person_name': 'Krišjānis Kariņš'}

In [4]:


def resolve_cabinet(cabinet_data, datetime_obj):
    """
    Assembles list of cabinet member names for a particular datetime
    
    PARAMS:
        cabinet_data:list - a list of cabinet members, their positions and incumbency duration
        datetime_obj:datetime - date when a news article was published
    RETURNS:
        cabinet - a list of people who were ministers and the president for that time
    """

    cabinet = []

    for row in cabinet_data:
        if row["from"] <= datetime_obj <= row["to"]:
            cabinet.append(row)
    return cabinet


def cabinet_entities(datetime_obj, entities, cabinet_data):
    """
    Cross-references a list of entities extracted from text
    with a list of cabinet ministers in office at the time the article was published
    
    PARAMS:
        datetime_obj:datetime - datetime when the article was published
        entities:list - a list of entities extracted from text
        cabinet_data:list - a list of cabinet members, their positions and incumbency duration
    
    RETURNS:
        rel_mentions:list - a list of enities mentioned in text that were the cabinet 
                            members at the time and the president
    """
    entities = set(entities)
    relevant_cabinet = resolve_cabinet(cabinet_data, datetime_obj)
    rel_mentions = []
    for row in relevant_cabinet:
        if any(name in entities for name in row["names"]):
            plh = {
                "person_id": row["person_id"],
                "person_name": row["person_name"],
                "cabinet_ents": row["ministry"],
            }
            rel_mentions.append(plh)
    return rel_mentions


In [5]:
#resolve_cabinet(dfl, datetime.now())

# Load Data from momgo

In [6]:
mongo = MongoClient("mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&ssl=false")
db = mongo["delfi_texts"]
col = db["lv_news"]

In [7]:
query = {}
cursor = col.find(query, {"_id" :0}).limit(50000)
data = [i for i in cursor]

df = pd.DataFrame(data)
print(df.shape)
df.head()

(50000, 3)


Unnamed: 0,source,text,date
0,https://www.delfi.lv/news/national/politics/pa...,DELFI žurnālists\n 1933. gadā vīrs vārdā Freds...,2021-07-22 00:02:00
1,https://www.delfi.lv/news/national/politics/ra...,rus.delfi.lv žurnāliste\n Olga Sukonnikova lat...,2015-07-21 04:16:00
2,https://www.delfi.lv/news/national/politics/ka...,"DELFI žurnālists\n ""Paliec sveiks, mans mazais...",2021-07-18 00:00:00
3,https://www.delfi.lv/news/national/politics/bl...,Nacionālo ziņu nodaļas žurnāliste\n Mēdz teikt...,2021-03-13 09:16:00
4,https://www.delfi.lv/news/national/politics/da...,"""DELFI plus"" žurnāliste\n No bildēm pretim ver...",2021-10-10 00:00:00


# Detect ents

In [10]:
def detect_ents(text, timestamp):
    cabinet = resolve_cabinet(dfl, timestamp)
    
    plh = []
    
    for item in cabinet:
        for entry in item["names"]:
            if entry in text:
                d = {"person_id" : item["person_id"], 
                    "cabinet_no" : item["cabinet_no"], 
                    "ministry" : item["ministry"], 
                    }
                if d not in plh:
                    plh.append(d)
    return plh


def extend_data_datetime_vars(interim_data):

    """
    Adds year, mothm day vars to the dataframe

    PARAMS:
        interim_data:DataFrame - a dataframe with `date` column
    RETURNS:
        interim_data:DataFrame
    """

    interim_data["year"] = interim_data.apply(lambda x: x["date"].year, axis=1)
    interim_data["month"] = interim_data.apply(lambda x: x["date"].month, axis=1)
    interim_data["day"] = interim_data.apply(lambda x: x["date"].day, axis=1)
    interim_data = interim_data.sort_values(by=["date"])

    return interim_data


def prime_and_prez(cabinet: list):
    pm = ""
    cabinet_no = ""
    prezident = ""

    for item in cabinet:
        if item["ministry"] == "Premjers":
            pm = item["person_name"]
            cabinet_no = str(item["cabinet_no"])

        if item["ministry"] in ("Prezidente", "Prezidents"):
            prezident = item["person_name"]
    return (pm, cabinet_no, prezident)

def extend_data_cabinet_vars(interim_data):

    """
    Adds cabinet vars: cabinet, cabinet_number, president

    PARAMS:
        interim_data:DataFrame - a dataframe with `date` column
    RETURNS:
        interim_data:DataFrame
    """

    pm = []
    cabinet_no = []
    prezident = []

    for row in tqdm(interim_data.date):
        cabinet = resolve_cabinet(dfl, row)
        plh = prime_and_prez(cabinet)
        pm.append(plh[0])
        cabinet_no.append(plh[1])
        prezident.append(plh[2])

    interim_data["cabinet"] = pm
    interim_data["cabinet_no"] = cabinet_no
    interim_data["president"] = prezident

    return interim_data

In [11]:
entities = []

for index, row in tqdm(df.iterrows()):
    ents = detect_ents(row["text"], row["date"])
    entities.append(ents)
    
df["entities_full"] = entities
df["entities"] = df.apply(lambda x : [i["ministry"] for i in x["entities_full"]], axis = 1)


df = extend_data_datetime_vars(df)
df = extend_data_cabinet_vars(df)

df.head()


50000it [01:04, 778.32it/s] 
100%|██████████| 50000/50000 [00:08<00:00, 5672.56it/s]


Unnamed: 0,source,text,date,entities_full,entities,year,month,day,cabinet,cabinet_no,president
18547,https://www.delfi.lv/news/national/politics/gu...,\n Vidzemes apgabaltiesas pastāvīgā...,1999-11-29 13:03:00,[],[],1999.0,11.0,29.0,Andris Šķēle,27,Vaira Viķe Freiberga
29970,https://www.delfi.lv/news/national/politics/da...,\n Par darba aizsardzības likuma pā...,1999-12-10 09:09:00,[],[],1999.0,12.0,10.0,Andris Šķēle,27,Vaira Viķe Freiberga
28844,https://www.delfi.lv/news/national/politics/dr...,\n Valdība otrdien Drošības policij...,1999-12-21 19:52:00,[],[],1999.0,12.0,21.0,Andris Šķēle,27,Vaira Viķe Freiberga
14355,https://www.delfi.lv/news/national/politics/sk...,\n Ministru prezidents Andris Šķēle...,2000-04-11 09:42:00,"[{'person_id': 'a_skele', 'cabinet_no': 27, 'm...",[Premjers],2000.0,4.0,11.0,Andris Šķēle,27,Vaira Viķe Freiberga
16722,https://www.delfi.lv/news/national/politics/la...,"\n Naktī uz trešdienu Latvijā, izle...",2000-11-15 15:53:00,[],[],2000.0,11.0,15.0,Andris Bērziņš,28,Vaira Viķe Freiberga


In [12]:
df.tail()

Unnamed: 0,source,text,date,entities_full,entities,year,month,day,cabinet,cabinet_no,president
32946,https://www.delfi.lv/news/national/politics/pi...,"""Katrā pašvaldībā mums ir jāpārliecina, ka tie...",NaT,[],[],,,,,,
35883,https://www.delfi.lv/news/national/politics/la...,Stingrāks valsts valodas likums | Atkāpjas Jeļ...,NaT,[],[],,,,,,
37591,https://www.delfi.lv/news/national/politics/la...,Brīvības pieminekļa lielais remonts | Nola zel...,NaT,[],[],,,,,,
37996,https://www.delfi.lv/news/national/politics/ga...,,NaT,[],[],,,,,,
43628,https://www.delfi.lv/news/national/politics/la...,'Konkorde' beidz lidot | 'Samsung' izgudro fot...,NaT,[],[],,,,,,


In [13]:
df["has_entities"] = df.apply(lambda x : len(x["entities"])>0, axis = 1)
df["has_entities"].mean()

0.2287

In [14]:
df2 = df[df["has_entities"]==1]
print(df2.shape)
df2.head()

(11435, 12)


Unnamed: 0,source,text,date,entities_full,entities,year,month,day,cabinet,cabinet_no,president,has_entities
14355,https://www.delfi.lv/news/national/politics/sk...,\n Ministru prezidents Andris Šķēle...,2000-04-11 09:42:00,"[{'person_id': 'a_skele', 'cabinet_no': 27, 'm...",[Premjers],2000.0,4.0,11.0,Andris Šķēle,27,Vaira Viķe Freiberga,True
21128,https://www.delfi.lv/news/national/politics/ap...,\n Šodien Saeima ārkārtas sēdē izte...,2002-11-07 10:41:00,"[{'person_id': 'g_kristovskis', 'cabinet_no': ...","[Aizsardzības, Ekonomikas, Finanšu, Izglītības...",2002.0,11.0,7.0,Einars Repše,29,Vaira Viķe Freiberga,True
20476,https://www.delfi.lv/news/national/politics/am...,\n Aizsardzības ministrija (AM) pēc...,2002-11-29 05:00:00,"[{'person_id': 'g_kristovskis', 'cabinet_no': ...",[Aizsardzības],2002.0,11.0,29.0,Einars Repše,29,Vaira Viķe Freiberga,True
38512,https://www.delfi.lv/news/national/politics/di...,"\n Iztērējot aptuveni 100 000 latu,...",2004-09-16 10:19:00,"[{'person_id': 'a_radzevics', 'cabinet_no': 30...",[Reģionālās attīstības un pašvaldību lietu],2004.0,9.0,16.0,Indulis Emsis,30,Vaira Viķe Freiberga,True
20292,https://www.delfi.lv/news/national/politics/ci...,"\n Lai apkarotu latvāņus, kuri šobr...",2005-08-25 16:03:00,"[{'person_id': 'r_vejonis', 'cabinet_no': 31, ...",[Vides],2005.0,8.0,25.0,Aigars Kalvītis,31,Vaira Viķe Freiberga,True


In [15]:
df2.year.value_counts()

2021.0    1406
2012.0    1283
2015.0    1207
2013.0    1167
2016.0    1065
2014.0     935
2011.0     932
2020.0     855
2019.0     852
2018.0     769
2017.0     677
2010.0     226
2009.0      23
2008.0      15
2006.0       8
2007.0       6
2005.0       5
2002.0       2
2000.0       1
2004.0       1
Name: year, dtype: int64

In [16]:
ministries_i= {}
for item in list(df2.entities):
    for i in item:
        if i not in ministries_i:
            ministries_i[i]=0
        ministries_i[i]+=1

s_min = sorted(ministries_i.items(), key = lambda kv : kv[1], reverse = True)        

s_min

[('Premjers', 3380),
 ('Prezidents', 2361),
 ('Veselības', 1349),
 ('Izglītības un zinātnes', 1235),
 ('Tieslietu', 1005),
 ('Ārlietu', 992),
 ('Iekšlietu', 982),
 ('Aizsardzības', 959),
 ('Vides aizsardzības un reģionālās attīstības', 868),
 ('Labklājības', 837),
 ('Ekonomikas', 702),
 ('Finanšu', 558),
 ('Zemkopības', 341),
 ('Satiksmes', 323),
 ('Kultūras', 320),
 ('Reģionālās attīstības un pašvaldību lietu', 33),
 ('Vides', 22),
 ('Bērnu, ģimenes un sabiedrības integrācijas lietu', 1)]

In [19]:
ministries = set(ministries_i.keys())
ministries.remove("Prezidents")
if "Prezidente" in ministries:
    ministries.remove("Prezidente")

for item in ministries:
    plh1 = []
    plh2 = []
    
    for row in list(df2.entities):
        if item in row:
            plh1.append(1)
        else:
            plh1.append(0)
        if item in row and any(i in row for i in ["Prezidents", "Prezidente"]):
            plh2.append(1)
        else:
            plh2.append(0)
    df2[item] = plh1
    df2["Prezident_"+item] = plh2
    print("Prezident_"+item, sum(plh2))
    
print(df2.shape)
df2.head()

Prezident_Finanšu 74
Prezident_Reģionālās attīstības un pašvaldību lietu 2
Prezident_Vides aizsardzības un reģionālās attīstības 45
Prezident_Satiksmes 53
Prezident_Iekšlietu 75
Prezident_Veselības 43
Prezident_Ārlietu 149
Prezident_Vides 2
Prezident_Aizsardzības 160
Prezident_Ekonomikas 53
Prezident_Zemkopības 81
Prezident_Tieslietu 55
Prezident_Premjers 385
Prezident_Bērnu, ģimenes un sabiedrības integrācijas lietu 0
Prezident_Labklājības 82
Prezident_Kultūras 60
Prezident_Izglītības un zinātnes 47
(11435, 46)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[item] = plh1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["Prezident_"+item] = plh2


Unnamed: 0,source,text,date,entities_full,entities,year,month,day,cabinet,cabinet_no,...,Premjers,Prezident_Premjers,"Bērnu, ģimenes un sabiedrības integrācijas lietu","Prezident_Bērnu, ģimenes un sabiedrības integrācijas lietu",Labklājības,Prezident_Labklājības,Kultūras,Prezident_Kultūras,Izglītības un zinātnes,Prezident_Izglītības un zinātnes
14355,https://www.delfi.lv/news/national/politics/sk...,\n Ministru prezidents Andris Šķēle...,2000-04-11 09:42:00,"[{'person_id': 'a_skele', 'cabinet_no': 27, 'm...",[Premjers],2000.0,4.0,11.0,Andris Šķēle,27,...,1,0,0,0,0,0,0,0,0,0
21128,https://www.delfi.lv/news/national/politics/ap...,\n Šodien Saeima ārkārtas sēdē izte...,2002-11-07 10:41:00,"[{'person_id': 'g_kristovskis', 'cabinet_no': ...","[Aizsardzības, Ekonomikas, Finanšu, Izglītības...",2002.0,11.0,7.0,Einars Repše,29,...,1,0,0,0,0,0,1,0,1,0
20476,https://www.delfi.lv/news/national/politics/am...,\n Aizsardzības ministrija (AM) pēc...,2002-11-29 05:00:00,"[{'person_id': 'g_kristovskis', 'cabinet_no': ...",[Aizsardzības],2002.0,11.0,29.0,Einars Repše,29,...,0,0,0,0,0,0,0,0,0,0
38512,https://www.delfi.lv/news/national/politics/di...,"\n Iztērējot aptuveni 100 000 latu,...",2004-09-16 10:19:00,"[{'person_id': 'a_radzevics', 'cabinet_no': 30...",[Reģionālās attīstības un pašvaldību lietu],2004.0,9.0,16.0,Indulis Emsis,30,...,0,0,0,0,0,0,0,0,0,0
20292,https://www.delfi.lv/news/national/politics/ci...,"\n Lai apkarotu latvāņus, kuri šobr...",2005-08-25 16:03:00,"[{'person_id': 'r_vejonis', 'cabinet_no': 31, ...",[Vides],2005.0,8.0,25.0,Aigars Kalvītis,31,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df2["Prezident_Premjers"].sum()

385

In [20]:
plh = df2[df2["Prezident_Aizsardzības"]==1]
plh.head()

Unnamed: 0,source,text,date,entities_full,entities,year,month,day,cabinet,cabinet_no,...,Premjers,Prezident_Premjers,"Bērnu, ģimenes un sabiedrības integrācijas lietu","Prezident_Bērnu, ģimenes un sabiedrības integrācijas lietu",Labklājības,Prezident_Labklājības,Kultūras,Prezident_Kultūras,Izglītības un zinātnes,Prezident_Izglītības un zinātnes
34108,https://www.delfi.lv/news/national/politics/ra...,\n Amati jaunajā pašreizējā premjer...,2010-10-10 21:19:00,"[{'person_id': 'v_zatlers', 'cabinet_no': 7, '...","[Prezidents, Aizsardzības, Ekonomikas, Iekšlie...",2010.0,10.0,10.0,Valdis Dombrovskis,34,...,0,0,0,0,1,1,1,1,0,0
13912,https://www.delfi.lv/news/national/politics/sa...,\n 10.Saeimas vairākums trešdien no...,2010-11-03 11:15:00,"[{'person_id': 'v_zatlers', 'cabinet_no': 7, '...","[Prezidents, Aizsardzības, Ekonomikas, Finanšu...",2010.0,11.0,3.0,Valdis Dombrovskis,35,...,0,0,0,0,1,1,1,1,1,1
36262,https://www.delfi.lv/news/national/politics/at...,"\n Trešdien atbrīvoti Sudānā, Dārfū...",2010-12-08 18:41:00,"[{'person_id': 'v_zatlers', 'cabinet_no': 7, '...","[Prezidents, Aizsardzības, Premjers, Ārlietu]",2010.0,12.0,8.0,Valdis Dombrovskis,35,...,1,1,0,0,0,0,0,0,0,0
34018,https://www.delfi.lv/news/national/politics/pa...,\n Valsts prezidentam Valdim Zatler...,2011-01-31 09:29:00,"[{'person_id': 'v_zatlers', 'cabinet_no': 7, '...","[Prezidents, Aizsardzības]",2011.0,1.0,31.0,Valdis Dombrovskis,35,...,0,0,0,0,0,0,0,0,0,0
31159,https://www.delfi.lv/news/national/politics/zi...,\n Apkopojot vēlētāju ievilktos plu...,2011-09-19 20:00:00,"[{'person_id': 'a_berzins', 'cabinet_no': 8, '...","[Prezidents, Aizsardzības, Finanšu, Premjers, ...",2011.0,9.0,19.0,Valdis Dombrovskis,35,...,1,1,0,0,0,0,0,0,0,0


In [24]:
list(plh.source)[50]

'https://www.delfi.lv/news/national/politics/jaungada-uzrunu-iedzivotajiem-teiks-dombrovskis-les-brigmanis.d?id=43890602'

In [None]:
cab = resolve_cabinet(dfl, datetime.now())

prime_and_prez(cab)

In [None]:
for index, row in df.iterrows():
    res = detect_ents(row["text"], row["date"])
    if res != []:
        print(res)
        print()

In [None]:
text = list(df.text)[10]
timestamp = list(df.date)[10]

detect_ents(text, timestamp)

In [None]:
cab

In [None]:
# P