## Import dependencies

In [1]:
import sys
import os

# Make the project checkout importable and run the notebook from its root so
# that project-relative paths resolve.
# NOTE(review): absolute path tied to the jovyan Docker image — consider
# reading it from an environment variable.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"
sys.path.append(PROJECT_ROOT)
# Drop duplicate entries while keeping the original search order: going
# through set() would shuffle sys.path and change which module wins an import.
sys.path = list(dict.fromkeys(sys.path))
os.chdir(PROJECT_ROOT)

import numpy as np
from gensim.models import KeyedVectors

from pycaret.classification import predict_model
from sem_covid.services.model_registry import ClassificationModel
from sem_covid.services.store_registry import StoreRegistry
from sem_covid.services.data_registry import Dataset, LanguageModel
from sem_covid.services.sc_wrangling.mean_vectorizer import text_to_vector
import pandas as pd
import regex as re
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

from sem_covid.services.sc_wrangling.data_cleaning import clean_fix_unicode, clean_to_ascii, clean_to_lower,\
        clean_remove_stopwords, clean_remove_line_breaks, clean_remove_urls, clean_remove_emails

## Define constants

In [5]:
# Columns whose text is concatenated and embedded for classification.
# Only the title is used; 'content' is currently excluded.
EU_CELLAR_TEXT_COLUMNS = [
    'title',
]

# Label columns added to the dataset, one per classifier.
EU_CELLAR_NEW_COLUMNS = [
    'businesses',
    'citizens',
    'workers',
]

## Load dataset and language model

In [3]:
# Fetch the EU Cellar dataset through the project's data registry.
# NOTE(review): presumably returns a pandas DataFrame (later cells index it by
# column name and call .apply on it) — confirm against the registry.
df = Dataset.EU_CELLAR.fetch()

100% (9792 of 9792) |####################| Elapsed Time: 0:00:37 Time:  0:00:37


In [4]:
# Fetch the Law2Vec language model through the project's data registry.
# NOTE(review): `law2vec` is not used again in this notebook; presumably the
# call is kept so the local cache read on the next line is populated — confirm.
law2vec = LanguageModel.LAW2VEC.fetch()
law2vec_path = LanguageModel.LAW2VEC.path_to_local_cache()
# Load the cached file as gensim keyed vectors (word2vec format, UTF-8).
l2v_dict = KeyedVectors.load_word2vec_format(law2vec_path, encoding="utf-8")

## Prepare dataset

In [71]:
def clean_pipeline(document: str) -> str:
    """Run the project's text-cleaning helpers over one document.

    The helpers are applied in a fixed order: unicode fixing, ASCII
    conversion, lower-casing, line-break removal, URL removal, e-mail
    removal and finally stop-word removal. URLs and e-mail addresses are
    replaced with the empty string.

    :param document: the raw text to clean
    :return: the text after every cleaning step has been applied
    """
    cleaning_steps = (
        clean_fix_unicode,
        clean_to_ascii,
        clean_to_lower,
        clean_remove_line_breaks,
        lambda text: clean_remove_urls(text, replace_with=''),
        lambda text: clean_remove_emails(text, replace_with=''),
        clean_remove_stopwords,
    )
    for apply_step in cleaning_steps:
        document = apply_step(document)
    return document

In [72]:
# Missing documents become empty strings up front: str(NaN) would otherwise
# inject the literal token "nan" into the cleaned text.
cleaned_content = df['content'].fillna('').astype(str)
print("Text length before cleaning :", cleaned_content.str.len().sum())
cleaned_content = (
    cleaned_content
    .apply(clean_pipeline)
    # Remove any newline / tab characters left after the pipeline.
    .str.replace('\n', '', regex=False)
    .str.replace('\t', '', regex=False)
    # Collapse runs of spaces into a single space.
    .str.replace(' +', ' ', regex=True)
)
df['content'] = cleaned_content
print("Text length after cleaning :", df['content'].str.len().sum())

Text length before cleaning : 1077670023
Text length after cleaning : 724122960


In [35]:
# Configure the pysummarization objects used to summarise each document.
auto_abstractor = AutoAbstractor()
auto_abstractor.tokenizable_doc = SimpleTokenizer()
# Delimiters handed to the abstractor.
# NOTE(review): presumably these mark sentence boundaries; the cleaning cell
# above already strips "\n" from the content, so likely only "." ever
# matches — confirm.
auto_abstractor.delimiter_list = [".", "\n"]
# Passed as the second argument to `auto_abstractor.summarize` in the next cell.
# NOTE(review): presumably selects the top-ranked sentences — confirm against
# the pysummarization documentation.
abstractable_doc = TopNRankAbstractor()

In [None]:
def _summarise_document(text):
    """Summarise one document and join the resulting sentences with spaces."""
    summary = auto_abstractor.summarize(str(text), abstractable_doc)
    return ' '.join(summary["summarize_result"])


# Replace every document's content with its summary.
df['content'] = df['content'].apply(_summarise_document)

In [6]:
# Build the text frame without mutating `df`: fill missing values, force
# every cell to str so the join below cannot fail, and use a fresh 0..n-1
# index so rows line up positionally with the prediction output later on.
text_df = (
    df[EU_CELLAR_TEXT_COLUMNS]
    .fillna('')
    .astype(str)
    .reset_index(drop=True)
)
# Concatenate the selected text columns into a single 'text' column.
text_df['text'] = text_df[EU_CELLAR_TEXT_COLUMNS].agg(' '.join, axis=1)

In [6]:
# Preview the first five concatenated texts.
text_df['text'].head(5)

0               Lighten the load : healthy workplaces.
1    Executive summary of the opinion ethics of gen...
2    European research on environment and health : ...
3    Opinion No 6/2020 (pursuant to Article 287(4) ...
4    Case T-718/20: Action brought on 5 December 20...
Name: text, dtype: object

In [None]:
# Total number of characters across all concatenated texts.
text_df['text'].str.len().sum()

In [8]:
# Total number of characters once every run of non-word characters has been
# collapsed into a single space.
# The pattern is a raw string: '\W' in a normal string literal is an invalid
# escape sequence and makes Python emit a warning.
text_df['text'].apply(lambda x: ' '.join(re.split(r'\W+', x))).str.len().sum()

  text_df['text'].apply(lambda x: ' '.join(re.split('\W+', x))).str.len().sum()


1509473

In [7]:
# Embed each document's text using the Law2Vec keyed vectors.
# NOTE(review): `text_to_vector` is imported from the project's mean_vectorizer
# module, so presumably it averages the word vectors of the text — confirm.
text_df["emb"] = text_df["text"].apply(lambda x: text_to_vector(x,l2v_dict))

## Enrich dataset with new columns

In [8]:
# Feature matrix built from the embeddings, one row per document.
# NOTE(review): assumes every 'emb' entry is a fixed-length vector — confirm.
matrix_df = pd.DataFrame(list(text_df['emb'].values))
# Shallow copy: the label columns are added to `enriched_df` without changing
# the columns of `df`, and the large text data is not duplicated.
enriched_df = df.copy(deep=False)
for new_column in EU_CELLAR_NEW_COLUMNS:
    # Fresh copy per class, so the placeholder target column of one class
    # never leaks into the features passed to the next class's model.
    dataset = matrix_df.copy()
    # Placeholder value for the target column in the frame handed to predict_model.
    dataset[new_column] = "no_data"
    model = ClassificationModel.pwdb_by_class_name(class_name=new_column)
    predicted_df = predict_model(model, data=dataset)
    # Assign by position, not by index: `predicted_df` carries a 0..n-1 index,
    # while `df` keeps whatever index the fetch produced, so index alignment
    # could silently fill the new column with NaN.
    predicted_labels = predicted_df['Label'].values
    text_df[new_column] = predicted_labels
    enriched_df[new_column] = predicted_labels

## Show result

In [10]:
# Show each title next to its predicted labels.
text_df[['title', *EU_CELLAR_NEW_COLUMNS]]

Unnamed: 0,title,businesses,citizens,workers
0,Lighten the load : healthy workplaces.,1,1,0
1,Executive summary of the opinion ethics of gen...,1,0,0
2,European research on environment and health : ...,1,0,0
3,Opinion No 6/2020 (pursuant to Article 287(4) ...,0,0,0
4,Case T-718/20: Action brought on 5 December 20...,0,0,0
...,...,...,...,...
9787,2018-3 CEF telecom : calls for proposals.,1,0,0
9788,Lebanese Hezbollah’s experience in Syria.,0,0,1
9789,Application Programming Interfaces in governme...,0,0,0
9790,The SME instrument in action : an effective an...,0,0,0


## Analyse results

In [11]:
# Print, for every new label column, the share of documents predicted
# negative and positive (rounded to two decimals).
total_rows = len(text_df)
for new_column in EU_CELLAR_NEW_COLUMNS:
    positive_rows = int((text_df[new_column] == 1).sum())
    negative_rows = total_rows - positive_rows
    negative_share = round(negative_rows / total_rows, 2)
    positive_share = round(positive_rows / total_rows, 2)
    print(f"{new_column}: n_neg = {negative_share}, n_poz = {positive_share}")

businesses: n_neg = 0.71, n_poz = 0.29
citizens: n_neg = 0.9, n_poz = 0.1
workers: n_neg = 0.91, n_poz = 0.09


## Save enriched dataset

In [12]:
# Persist the enriched dataset through the project's index store
# (presumably Elasticsearch, going by the `es_` prefix).
# NOTE(review): whether an existing 'ds_eu_cellar_enriched' index is
# overwritten or appended to is not visible here — confirm before re-running.
es_store = StoreRegistry.es_index_store()
es_store.put_dataframe(index_name='ds_eu_cellar_enriched',content= enriched_df)

 98% (9667 of 9792) |################### | Elapsed Time: 0:00:49 ETA:   0:00:14

9792