In [1]:
import sys

sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))
import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')
import numpy as np
from numpy import array
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

from sem_covid.services.sc_wrangling.evaluation_metrics import model_evaluation_metrics
from sem_covid.services.sc_wrangling.mean_vectorizer import MeanEmbeddingVectorizer


from sem_covid.services.data_registry import Dataset, LanguageModel
import pandas as pd
from gensim.models import KeyedVectors



Debug BinaryDataSource:  law2vec/law2vec-200d.txt
Debug BinaryDataSource:  jrc-word2vec/vectors-300-2019-11.txt


# Define constants

In [357]:
BUSINESSES = {'Companies providing essential services', 'Contractors of a company', 'Larger corporations',
              'One person or microenterprises', 'Other businesses', 'SMEs', 'Sector specific set of companies',
              'Solo-self-employed', 'Start-ups'}

CITIZENS = {'Children (minors)', 'Disabled', 'Migrants', 'Older citizens', 'Other groups of citizens', 'Parents',
            'People in care facilities', 'Refugees', 'Single parents', 'The COVID-19 risk group', 'Women',
            'Youth (18-25)'}

WORKERS = {'Cross-border commuters', 'Disabled workers', 'Employees in standard employment', 'Female workers',
           'Migrants in employment', 'Older people in employment (aged 55+)', 'Other groups of workers',
           'Parents in employment', 'Particular professions', 'Platform workers', 'Posted workers',
           'Refugees in employment', 'Seasonal workers', 'Self-employed', 'Single parents in employment',
           'The COVID-19 risk group at the workplace', 'Undeclared workers', 'Unemployed', 'Workers in care facilities',
           'Workers in essential services', 'Workers in non-standard forms of employment',
           'Youth (18-25) in employment'}

TEXTUAL_COLUMNS = ['title', 'background_info_description', 'content_of_measure_description',
                   'use_of_measure_description', 'involvement_of_social_partners_description']

CLASS_COLUMNS = ['businesses', 'citizens', 'workers']

In [358]:
def target_group_refactoring(pwdb_dataframe: pd.DataFrame,
                             target_group_column_name: str = 'target_groups') -> pd.DataFrame:
    """
        The target group available in the original dataset is very granular. For the purpose of this exercise
        we would benefit from aggregating the target groups into a more generic sets. As a result we will obtain
        target groups on two levels: L1, L2.
        L1: workers, businesses, citizens
        L2: the original set of categories

        :return: the given dataset with an extra column containing the aggregated (L1) values
    """
    new_columns = {'businesses': BUSINESSES, 'citizens': CITIZENS, 'workers': WORKERS}
    refactored_pwdb_df = pwdb_dataframe[target_group_column_name]
    for column, class_set in new_columns.items():
        pwdb_dataframe[column] = refactored_pwdb_df.apply(lambda x: any(item in class_set for item in x))
        pwdb_dataframe[column].replace({True: 1, False: 0}, inplace=True)
    return pwdb_dataframe

In [2]:
df = Dataset.PWDB.fetch()
law2vec = LanguageModel.LAW2VEC.fetch()

100% (1288 of 1288) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [3]:
type(law2vec)

bytes

In [5]:
law2vec_path = LanguageModel.LAW2VEC.path_to_local_cache()
law2vec_format = KeyedVectors.load_word2vec_format(law2vec_path, encoding="utf-8")
l2v_dict = {w: vec for w, vec in zip(law2vec_format.index_to_key, law2vec_format.vectors)}

In [264]:
df = target_group_refactoring(df)
df = df.set_index(df.columns[0])



identifier
1297    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
864     [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1228    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
183     [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1550    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
                              ...                        
451     [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
889     [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
658     [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1115    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1049    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: text_data, Length: 1288, dtype: object

In [331]:
df["text_data"] = df[TEXTUAL_COLUMNS].agg(" ".join, axis=1)

In [332]:
df["text_data"] = df["text_data"].apply(lambda x: text_to_vector(x))


In [328]:
def text_to_vector(text: str):
    return list(np.mean([l2v_dict[word] for word in text.split() if word in l2v_dict], axis=0))

In [285]:
text_to_vector(df["text_data"].values[0])

[ 0.02545931  0.06653013  0.02339859  0.06355191 -0.01755772  0.10998537
  0.0063006  -0.2567684   0.05525973  0.00441868 -0.06974691 -0.16998082
 -0.16435541 -0.10855313  0.05459474  0.19679487 -0.24379154  0.02307621
  0.12596805 -0.20902404  0.00382822 -0.19417858  0.08447942  0.05687957
 -0.17167118 -0.04357034  0.15568979 -0.06187607 -0.11538222 -0.15128207
  0.35370547 -0.11019868 -0.23292814 -0.00622284  0.03836507  0.08357991
  0.05910455  0.05823036 -0.13042156 -0.00968329 -0.0725482  -0.0980014
  0.12952572  0.11194758  0.0283453   0.00061622 -0.12272678  0.12974879
  0.01398437  0.20155644 -0.02581969  0.1963244  -0.01750274 -0.04084431
 -0.13908987 -0.00317987 -0.21313736 -0.07175594  0.11640672 -0.04132283
 -0.01584359  0.04143398 -0.04355407  0.14294216 -0.00199252 -0.02342331
  0.06641872 -0.27542517  0.13968018 -0.01161421 -0.08233781 -0.11091176
  0.07202396 -0.24298942 -0.02012934 -0.03967017  0.04766052  0.08584447
 -0.14345793  0.03357938  0.00665812 -0.00816896 -0.

In [333]:
x_train_list = list()
x_test_list = list()
y_train_list = list()
y_test_list = list()
for column in CLASS_COLUMNS:
    x_train, x_test, y_train, y_test = model_selection.train_test_split(df["text_data"].values, df[column].values,
                                                                        random_state=42, test_size=0.3,
                                                                        shuffle=True)
    x_train_list.append(list(x_train))
    x_test_list.append(list(x_test))
    y_train_list.append(list(y_train))
    y_test_list.append(list(y_test))
train_test_dataset = {"x_train": x_train_list, "x_test": x_test_list, "y_train": y_train_list, "y_test": y_test_list,
                      "class_name": CLASS_COLUMNS}

In [339]:
tmp = pd.DataFrame.from_dict(train_test_dataset)
tmp.head()  #["workers"]
#tmp_dict = tmp.to_dict()
#print(x_train.values)


Unnamed: 0,x_train,x_test,y_train,y_test,class_name
0,"[[0.05028788, 0.11401862, -0.012204234, 0.0503...","[[0.02076964, 0.06914482, -0.00045922917, 0.08...","[1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, ...","[1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, ...",businesses
1,"[[0.05028788, 0.11401862, -0.012204234, 0.0503...","[[0.02076964, 0.06914482, -0.00045922917, 0.08...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",citizens
2,"[[0.05028788, 0.11401862, -0.012204234, 0.0503...","[[0.02076964, 0.06914482, -0.00045922917, 0.08...","[1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, ...","[1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, ...",workers


Unnamed: 0,hey
a,a


In [335]:
train_test_dataset["x_train"][0]
print("t")


t


In [None]:
linear_svc_l2v_1 = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(l2v_dict)),
    ("Multi-label classifier", RandomForestClassifier())])

In [352]:
t = pd.DataFrame()
for iter, row in tmp.iterrows():
    model = RandomForestClassifier()
    result = model.fit(row['x_train'],
                       row['y_train'])
    result.score(row['x_train'],
                 row['y_train'])
    prediction = result.predict(row['x_test'])
    evaluation = model_evaluation_metrics(
        row['y_test'], prediction)
    class_name =row["class_name"]
    t[class_name] = pd.Series(evaluation)

t

Unnamed: 0,businesses,citizens,workers
Accuracy,0.764858,0.945736,0.744186
Precision,0.755991,0.724026,0.745771
Recall,0.689913,0.522443,0.74242
F1-Score,0.703543,0.529497,0.74264
Mean Absolute Error,0.235142,0.054264,0.255814
Mean Squared Error,0.235142,0.054264,0.255814


In [206]:
l2v_target_groups_l1 = linear_svc_l2v_1.fit(train_test_dataset['x_train'],
                                            train_test_dataset['y_train']['businesses'])

In [207]:
l2v_target_groups_l1.score(train_test_dataset['x_train'],
                           train_test_dataset['y_train']['businesses'])

1.0

In [208]:
l2v_tg_l1_prediction = l2v_target_groups_l1.predict(train_test_dataset['x_test'])


In [209]:
l2v_tg_l1_evaluation = model_evaluation_metrics(
    train_test_dataset['y_test']['businesses'], l2v_tg_l1_prediction)

In [210]:
l2v_tg_l1_evaluation

{'Accuracy': 0.7209302325581395,
 'Precision': 0.675,
 'Recall': 0.6261086474501109,
 'F1-Score': 0.6335227272727273,
 'Mean Absolute Error': 0.27906976744186046,
 'Mean Squared Error': 0.27906976744186046}