## Import dependencies

In [69]:
import sys
import os

sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))
os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import numpy as np
from gensim.models import KeyedVectors
import mlflow
from pycaret.classification import predict_model
from IPython.core.display import display, Markdown

from sem_covid.services.sc_wrangling.evaluation_metrics import model_evaluation_metrics

from sem_covid.services.model_registry import ClassificationModel, get_best_model_from_ml_flow
from sem_covid.services.store_registry import StoreRegistry
from sem_covid.services.data_registry import Dataset, LanguageModel
from sem_covid.services.sc_wrangling.mean_vectorizer import text_to_vector
import pandas as pd
import regex as re
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

from sem_covid.services.sc_wrangling.data_cleaning import clean_fix_unicode, clean_to_ascii, clean_to_lower,\
        clean_remove_stopwords, clean_remove_line_breaks, clean_remove_urls, clean_remove_emails

## Define consts

In [70]:
EU_CELLAR_TEXT_COLUMNS = ['title', 'content']

EU_CELLAR_NEW_COLUMNS = ['businesses', 'citizens', 'workers']

## Load dataset and language model

In [63]:
df = Dataset.EU_CELLAR.fetch()

In [64]:
law2vec = LanguageModel.LAW2VEC.fetch()
law2vec_path = LanguageModel.LAW2VEC.path_to_local_cache()
l2v_dict = KeyedVectors.load_word2vec_format(law2vec_path, encoding="utf-8")

## Prepare dataset

In [71]:
def clean_pipeline(document: str) -> str:
    document = clean_fix_unicode(document)
    document = clean_to_ascii(document)
    document = clean_to_lower(document)
    document = clean_remove_line_breaks(document)
    document = clean_remove_urls(document, replace_with='')
    document = clean_remove_emails(document, replace_with='')
    document = clean_remove_stopwords(document)
    return document

In [72]:
print("Text length before cleaning :", df['content'].str.len().sum())
df['content'] = df['content'].apply(lambda x: clean_pipeline(str(x)))
df['content'] = df['content'].str.replace('\n', '')
df['content'] = df['content'].str.replace('\t', '')
df['content'] = df['content'].apply(lambda x: re.sub(' +', ' ', str(x)))
print("Text length after cleaning :", df['content'].str.len().sum())

Text length before cleaning : 1077670023
Text length after cleaning : 724122960


In [35]:
auto_abstractor = AutoAbstractor()
auto_abstractor.tokenizable_doc = SimpleTokenizer()
auto_abstractor.delimiter_list = [".", "\n"]
abstractable_doc = TopNRankAbstractor()

In [None]:
df['content'] = df['content'].apply(
    lambda x: ' '.join(auto_abstractor.summarize(str(x), abstractable_doc)["summarize_result"]))

In [73]:
result_temp = df['content'].iloc[0:100].apply(
    lambda x: ' '.join(auto_abstractor.summarize(str(x), abstractable_doc)["summarize_result"]))


In [74]:
result_temp

_id
0007dea694fcf194ad9af6ec8b597a5185330549e826f04c6ca3e91f8bad383b    healthy workplaces - lighten load healthy work...
0013e1224a6385d0315875724287f0a044ec2050a8e710c14a479fa037eb9010    ethics genome editing #ethicsgroup_eu european...
00178f1c4c60992043347f38986f824c78096958d5bff593b8a1c79d058641f0    eur xxxx en environment european research proj...
002086bd15c9aba8b8b3cdf88498e25735cf66043ddab8877aca317256615aff    2020 en official journal european union c 350/...
002b8d0e8dcdbbb3ad87c517e97e1632ced062ad5aeb0c0d8727a9791d560e65    2021 en official journal european union c 53/4...
0033a9597090f698de7cbc177391594f28c072c99585f518458d9a57cd23bbf0                                                     
004b02ea0760b049799dc752d6378917cc4e7d2f0b0fa093a9fad2b0ac47bdd6    2020 en official journal european union c 343/...
0056ce1cc4915727f41bffbc5886a56788de84e75a0efa4e03a100e76f52d00f    a6_cordis_energy_efficency_hr c m y cm cy cmy ...
0066750e14340543b8cf2417f35d24e245adbd7731b4fb03fdc7

In [32]:
def my_text_to_vector(text: str, word2vec):
    dim = len(word2vec[next(iter(word2vec))])
    return np.mean([np.zeros(dim)] + [word2vec[word] for word in text.split() if word in word2vec], axis=0)

In [34]:
text_df = pd.DataFrame(df[EU_CELLAR_TEXT_COLUMNS])
text_df.replace(np.nan, '', regex=True, inplace=True)
text_df['text'] = text_df.agg(' '.join, axis=1)
text_df.reset_index(drop=True, inplace=True)

In [32]:
text_df['text'].head(5)

NameError: name 'text_df' is not defined

In [38]:
text_df['text'].str.len().sum()

2156889442

In [43]:
text_df['text'].apply(lambda x: ' '.join(re.split('\W+', x))).str.len().sum()

1545996497

In [None]:
text_df["emb"] = text_df["text"].apply(lambda x: my_text_to_vector(x, l2v_dict))

In [8]:
text_df[text_df['emb'].apply(lambda x: type(x) == np.float64)]

Unnamed: 0,title,content,text,emb
15,,,,
41,,,,
64,,,,
65,,,,
75,,,,
...,...,...,...,...
9742,,,,
9746,,,,
9751,,,,
9767,,,,


In [12]:
matrix_df = pd.DataFrame(list(text_df['emb'].values))

TypeError: 'numpy.float64' object is not iterable

In [7]:
matrix_df

NameError: name 'matrix_df' is not defined

## Enrich dataset with new columns

In [None]:
for new_column in EU_CELLAR_NEW_COLUMNS:
    model = get_best_model_from_ml_flow(experiment_ids=["1"], class_name=new_column)
    text_df[new_column] = model.predict(list(text_df["emb"]))


## Show result

In [None]:
text_df[['title'] + EU_CELLAR_NEW_COLUMNS[:]]

## Analyse results

In [None]:
for new_column in EU_CELLAR_NEW_COLUMNS:
    n_poz = len(text_df[text_df[new_column] == 1])
    n_total = len(text_df)
    n_neg = n_total - n_poz
    print(f"{new_column}: n_neg = {round(n_neg / n_total, 2)}, n_poz = {round(n_poz / n_total, 2)}")

