## Import dependencies

In [1]:
import sys
import os
# Root of the sem-covid checkout inside the notebook container. It is put on
# the import path so the `sem_covid` package can be imported, and it becomes
# the working directory for the rest of the notebook.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Append only when missing: this keeps sys.path free of a duplicate entry when
# the cell is re-run, without rebuilding the list through a set (which would
# scramble the order in which Python searches for modules).
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

os.chdir(PROJECT_ROOT)

import numpy as np
from gensim.models import KeyedVectors
from sem_covid.services.model_registry import ClassificationModel

from sem_covid.services.data_registry import Dataset, LanguageModel
from sem_covid.services.sc_wrangling.mean_vectorizer import text_to_vector
import pandas as pd
from pycaret.classification import predict_model
from sem_covid.services.store_registry import StoreRegistry



## Define constants

In [2]:
# Text-bearing columns of the EU timeline dataset; they are concatenated into
# a single document per row before vectorisation.
EU_TIMELINE_TEXT_COLUMNS = [
    'title',
    'abstract',
    'detail_content',
    'detail_title',
]

# Columns added to the dataset by this notebook. Each name is also used as the
# class name when looking up the corresponding classification model.
EU_TIMELINE_NEW_COLUMNS = [
    'businesses',
    'citizens',
    'workers',
]

## Load dataset and language model

In [3]:
# Fetch the EU action timeline dataset through the project's data registry.
df = Dataset.EU_ACTION_TIMELINE.fetch()

# Fetch the Law2Vec language model, then ask the registry for the path of its
# locally cached file.
# NOTE(review): `law2vec` itself is not used below; the fetch presumably
# populates the local cache that the next line points at -- confirm.
law2vec = LanguageModel.LAW2VEC.fetch()
law2vec_path = LanguageModel.LAW2VEC.path_to_local_cache()

# Parse the cached file as word2vec-format vectors and expose them as a plain
# {word: vector} dictionary for the mean vectoriser.
law2vec_format = KeyedVectors.load_word2vec_format(law2vec_path, encoding="utf-8")
l2v_dict = dict(zip(law2vec_format.index_to_key, law2vec_format.vectors))

100% (171 of 171) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


## Prepare dataset

In [4]:
def _detail_title_to_text(value):
    """Flatten one `detail_title` cell into a single space-separated string.

    :param value: a sequence of title fragments, a missing value (None/NaN),
        or a string that has already been flattened.
    :return: the fragments joined with single spaces; an empty string for a
        missing value; the value unchanged if it is already a string.
    """
    if isinstance(value, str):
        # Already flattened (e.g. the cell was executed before). Joining a
        # string would insert a space between every character.
        return value
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return " ".join(value)


# Idempotent: running this cell again leaves the column unchanged.
df['detail_title'] = df['detail_title'].apply(_detail_title_to_text)

# Build a fresh working frame holding only the text columns: missing values
# become empty strings so they can be joined, and the index is reset to a
# simple 0..n-1 range. Nothing is modified in place, so `df` is not touched
# and re-running the cell always rebuilds `text_df` from scratch.
text_df = (
    df[EU_TIMELINE_TEXT_COLUMNS]
    .fillna('')
    .reset_index(drop=True)
)

# One document per row: all text columns concatenated with single spaces.
text_df['text'] = text_df[EU_TIMELINE_TEXT_COLUMNS].agg(' '.join, axis=1)

# Embed every document with the Law2Vec word -> vector dictionary.
text_df['emb'] = text_df['text'].apply(lambda text: text_to_vector(text, l2v_dict))

## Enrich dataset with new columns

In [5]:
# Expand the per-row embedding vectors into a feature matrix: one row per
# document, one column per embedding component.
matrix_df = pd.DataFrame(list(text_df['emb'].values))

# NOTE: enriched_df is the same object as df, so the predicted columns are
# added to df as well.
enriched_df = df

for new_column in EU_TIMELINE_NEW_COLUMNS:
    # Give every class its own copy of the feature matrix. Writing the
    # placeholder column straight into matrix_df would leave it there for the
    # next iteration, so later models would receive the placeholder columns of
    # earlier classes as part of their input data.
    dataset = matrix_df.copy()
    dataset[new_column] = "no_data"  # placeholder for the target column

    model = ClassificationModel.pwdb_by_class_name(class_name=new_column)
    predicted_df = predict_model(model, data=dataset)

    # Assign by position rather than by index label: predictions come back on
    # the feature matrix's 0..n-1 index, which need not match df's index, and
    # a label-aligned assignment would then yield NaN/misplaced values.
    # Assumes predict_model returns one row per input row, in input order
    # -- TODO confirm against the installed PyCaret version.
    labels = predicted_df['Label'].to_numpy()
    text_df[new_column] = labels
    enriched_df[new_column] = labels


## Show result

In [6]:
# Show each title next to the newly predicted class columns.
result_columns = ['title', *EU_TIMELINE_NEW_COLUMNS]
text_df[result_columns]

Unnamed: 0,title,businesses,citizens,workers
0,EU countries adopt guidelines on proof of vacc...,0,0,0
1,Commission unveils EU vaccines strategy,1,0,0
2,European roadmap shows path towards common lif...,0,0,0
3,Commission puts forward rules on rapid antigen...,0,0,0
4,EU leaders agree on the recovery plan and mult...,1,0,0
5,Council adopts Recovery and Resilience Facility,1,1,0
6,Commission presents “Staying safe from COVID-1...,0,0,0
7,Commitment to international cooperation reaffi...,0,0,0
8,rescEU medical stockpile expands in four Membe...,1,0,0
9,Commission authorises third safe and effective...,0,0,0


## Analyse results

In [7]:
# Share of rows predicted negative / positive for every new class column.
# Despite the "n_" labels in the printed text, the reported values are
# fractions of the total row count, rounded to two decimals.
n_total = len(text_df)
for new_column in EU_TIMELINE_NEW_COLUMNS:
    # Rows labelled 1 count as positive; everything else counts as negative.
    n_poz = int((text_df[new_column] == 1).sum())
    n_neg = n_total - n_poz
    share_neg = round(n_neg / n_total, 2)
    share_poz = round(n_poz / n_total, 2)
    print(f"{new_column}: n_neg = {share_neg}, n_poz = {share_poz}")

businesses: n_neg = 0.43, n_poz = 0.57
citizens: n_neg = 0.95, n_poz = 0.05
workers: n_neg = 0.99, n_poz = 0.01


## Save enriched dataset

In [8]:
# Write the enriched dataset to the 'ds_eu_timeline_enriched' index through the
# project's index store. The call is the cell's last expression, so its return
# value is shown as the cell output.
es_store = StoreRegistry.es_index_store()
es_store.put_dataframe(
    index_name='ds_eu_timeline_enriched',
    content=enriched_df,
)

N/A% (0 of 171) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--

171