## Import dependencies

In [3]:
import sys
import os
# Make the project importable and resolve relative paths against its root.
# NOTE(review): absolute, machine-specific path — consider reading it from configuration.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
# De-duplicate while preserving the search order. Going through set() would
# shuffle the entries and make import precedence arbitrary between kernel runs.
sys.path = list(dict.fromkeys(sys.path))
os.chdir(PROJECT_ROOT)
import numpy as np
from gensim.models import KeyedVectors
from sem_covid.services.model_registry import ClassificationModel
from sem_covid.services.sc_wrangling.mean_vectorizer import text_to_vector
from sem_covid.services.data_registry import Dataset, LanguageModel
import pandas as pd
from pycaret.classification import predict_model
from sem_covid.services.store_registry import StoreRegistry

## Define constants

In [4]:
# Text columns of the Ireland timeline that are combined into one document per row.
IRELAND_TIMELINE_TEXT_COLUMNS = [
    'title',
    'content',
    'keyword',
]

# Label columns added to the dataset; one classifier is applied per name.
IRELAND_TIMELINE_NEW_COLUMNS = [
    'businesses',
    'citizens',
    'workers',
]

## Load dataset and language model

In [5]:
# Fetch the Ireland action-timeline dataset; it is indexed by column names below.
df = Dataset.IRELAND_ACTION_TIMELINE.fetch()
# NOTE(review): the value bound to `law2vec` is not used in this notebook; presumably the
# fetch is kept so the model file exists in the local cache before it is loaded — confirm.
law2vec = LanguageModel.LAW2VEC.fetch()
law2vec_path = LanguageModel.LAW2VEC.path_to_local_cache()
# Load the word vectors from the cached file (word2vec format, read as UTF-8).
l2v_dict = KeyedVectors.load_word2vec_format(law2vec_path, encoding="utf-8")

100% (410 of 410) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


## Prepare dataset

In [5]:
# Build one text document per row from the selected columns, then embed it.
# fillna('') blanks out missing cells so ' '.join never receives a NaN. It replaces
# replace(np.nan, '', regex=True, inplace=True): a regex replacement expects a string
# pattern, and the in-place mutation made the cell harder to re-run safely.
text_df = df[IRELAND_TIMELINE_TEXT_COLUMNS].fillna('')
text_df['text'] = text_df.agg(' '.join, axis=1)
# Drop the source index so rows line up positionally with the embedding matrix built later.
text_df = text_df.reset_index(drop=True)
text_df["emb"] = text_df["text"].apply(lambda x: text_to_vector(x, l2v_dict))


## Enrich dataset with new columns

In [7]:
# One row per document, one column per embedding component.
matrix_df = pd.DataFrame(list(text_df['emb'].values))
# Work on a copy so the fetched frame is not modified and this cell can be re-run.
enriched_df = df.copy()
for new_column in IRELAND_TIMELINE_NEW_COLUMNS:
    # Fresh copy per class: the dummy target column must not accumulate in matrix_df,
    # otherwise later models would receive the earlier classes' columns as extra inputs.
    dataset = matrix_df.copy()
    # The class column is filled with a placeholder before prediction.
    # NOTE(review): presumably the stored pipeline expects the target column to exist — confirm.
    dataset[new_column] = 0
    model = ClassificationModel.pwdb_by_class_name(class_name=new_column)
    predicted_df = predict_model(model, data=dataset)
    text_df[new_column] = predicted_df['Label']
    # Assign by position: text_df has a fresh 0..n-1 index while enriched_df keeps the
    # index of the fetched dataset, so an index-aligned assignment could yield NaN.
    # Row order is the same because text_df was derived from df without reordering.
    enriched_df[new_column] = text_df[new_column].to_numpy()

## Show result

In [8]:
# Titles next to the predicted label columns.
text_df[['title', *IRELAND_TIMELINE_NEW_COLUMNS]]

Unnamed: 0,title,businesses,citizens,workers
0,Briefing on the government's response to COVID...,0,0,0
1,Air passenger rights in the European Union,0,0,0
2,Minister for Health enables secure access to C...,0,0,0
3,Tánaiste Simon Coveney and Minister Helen McEn...,0,0,0
4,Air traffic rights in Ireland,1,1,0
5,Minister Brophy announces further assistance o...,1,1,0
6,"Minister Harris announces roll-out of €300,000...",0,0,0
7,Briefing on the government's response to COVID...,0,0,0
8,Minister Donohoe publishes Stability Programme...,1,0,0
9,Update on COVID-19 vaccine deliveries - 31 Mar...,0,1,0


## Analyse results

In [9]:
# Print, per label column, the fraction of rows predicted negative and positive.
n_total = len(text_df)
for new_column in IRELAND_TIMELINE_NEW_COLUMNS:
    # Count the rows whose predicted label equals 1.
    n_poz = int((text_df[new_column] == 1).sum())
    n_neg = n_total - n_poz
    neg_share = round(n_neg / n_total, 2)
    poz_share = round(n_poz / n_total, 2)
    print(f"{new_column}: n_neg = {neg_share}, n_poz = {poz_share}")

businesses: n_neg = 0.57, n_poz = 0.43
citizens: n_neg = 0.82, n_poz = 0.18
workers: n_neg = 0.73, n_poz = 0.27


## Save enriched dataset

In [10]:
# Store the enriched Ireland timeline through the index store.
# NOTE(review): this cell previously wrote to 'ds_eu_timeline_enriched', an index named
# after the EU timeline, although the data comes from Dataset.IRELAND_ACTION_TIMELINE.
# The name below is an assumption — confirm it against the project's index naming
# configuration before running.
ENRICHED_INDEX_NAME = 'ds_ireland_timeline_enriched'
es_store = StoreRegistry.es_index_store()
es_store.put_dataframe(index_name=ENRICHED_INDEX_NAME, content=enriched_df)



N/A% (0 of 410) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--

410