## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline (+ ireland_timeline)
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract nouns and noun phrases from each dataset's text
#### Train a word2vec model on each dataset's textual data

### Import libraries

In [1]:
import sys

# Make the project package importable from this notebook.
sys.path.append("/home/jovyan/work/sem-covid/")
# De-duplicate while preserving the search order: dict keys keep insertion
# order, whereas a round-trip through set() reorders sys.path arbitrarily
# (differently per process) and can change which module an import resolves to.
# Keeping the first occurrence also makes re-running this cell a no-op.
sys.path = list(dict.fromkeys(sys.path))

import os
# NOTE(review): the return value is discarded (this is not the last expression
# of the cell), so this call has no visible effect.
os.getcwd()
# Run the rest of the notebook from the project root, so relative paths
# (e.g. the docs/word-similarity-web/ graph output directory seen in the cell
# outputs) resolve under it.
os.chdir('/home/jovyan/work/sem-covid/')

import warnings
# Suppress all Python warnings from here on; note this also hides
# deprecation notices from the libraries imported below.
warnings.filterwarnings("ignore")

import numpy as np

from sem_covid.services.data_registry import Dataset
from sem_covid.services.store_registry import store_registry
from sem_covid.services.language_model_execution_steps import LanguageModelExecutionSteps
from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.graph_handling import create_graph_for_language_model_key_words

2021-08-26 12:59:36.913122: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-26 12:59:36.913140: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


# economic
## Define constants

In [2]:
# Column names paired with the PWDB dataset in the training source configs.
PWDB_TEXTUAL_CLASS = [
    'title',
    'background_info_description',
    'content_of_measure_description',
    'use_of_measure_description',
    'involvement_of_social_partners_description',
]

# Column names paired with every other dataset in the training source configs.
DEFAULT_TEXTUAL_COLUMN = ['title']

# Key words used for the "keywords" graph group (see MODEL3_WORDS).
KEY_WORDS_FOR_ALL_MODELS = [
    "eu", "national", "work", "aid",
    "coronavirus", "covid19", "measures", "vaccine",
    "minister", "government", "organisations", "agreement",
    "unemployment", "insurance", "reorientation", "economy",
    "economic", "innovation", "research", "development",
    "risk", "transport",
]

# Terms plotted under the 'countries' graph group.
COUNTRIES = [
    'austria', 'belgium', 'bulgaria', 'croatia', 'cyprus',
    'czechia', 'denmark', 'estonia', 'european_union', 'finland',
    'france', 'germany', 'greece', 'hungary', 'ireland',
    'italy', 'latvia', 'lithuania', 'luxembourg', 'malta',
    'netherlands', 'norway', 'poland', 'portugal', 'romania',
    'slovakia', 'slovenia', 'spain', 'sweden', 'united_kingdom',
]

# Terms plotted under the 'category' graph group.
CATEGORY = [
    'retention', 'workplace', 'labour', 'recovery',
    'adaptation', 'protection', 'essential', 'business_continuity',
    'services', 'social', 'market',
]

# Terms plotted under the 'subcategory' graph group.
SUBCATEGORY = [
    'safety', 'arrangements', 'health', 'spending', 'working',
    'support', 'occupational', 'stimulus_packages', 'access', 'time',
    'finance', 'remote', 'flexibility', 'essential_services', 'remuneration',
]

# Terms plotted under the 'target_groups_l1' graph group.
TARGET_GROUPS_L1 = [
    'businesses',
    'workers',
    'citizens',
]

# Terms plotted under the 'target_groups_l2' graph group.
TARGET_GROUPS_L2 = [
    'company', 'older', 'people', 'female', 'aged', 'corporations',
    'single', 'person', 'forms', 'smes', 'ups', 'single_parents',
    'citizens', 'professions', 'parents', 'groups', 'youth', 'sector',
    'women', 'unemployed', 'care', 'facilities', 'standard', 'specific',
    'contractors', 'children', 'border', 'refugees', 'minors', 'platform',
    'employment', 'seasonal', 'disabled', 'migrants', 'risk_group', 'commuters',
]

# Terms plotted under the 'funding' graph group.
FUNDING = [
    'companies', 'national_funds', 'employer', 'funds', 'european_funds',
    'no_special_funding_required', 'regional_funds', 'local_funds',
    'employers_organization', 'employees',
]



# Word groups graphed for models 1 and 2: each key is a group name (passed on
# as the graph's column name), each value the list of key words of that group.
MODEL1_AND_2_WORDS = dict(
    category=CATEGORY,
    subcategory=SUBCATEGORY,
    countries=COUNTRIES,
    target_groups_l1=TARGET_GROUPS_L1,
    target_groups_l2=TARGET_GROUPS_L2,
    funding=FUNDING,
)

# Word groups graphed for model 3: a single 'keywords' group.
MODEL3_WORDS = dict(keywords=KEY_WORDS_FOR_ALL_MODELS)

# MinIO bucket the similarity matrices are loaded from when plotting graphs.
SIMILARITY_MATRIX_BUCKET_NAME = 'semantic-similarity-matrices'

# Model 1: model name, language-model file name, cosine similarity matrix file name.
NR1_MODEL_NAME = 'model1'
MODEL1_FILE_NAME = 'model1_language_model.model'
MODEL1_COSINE_SIMILARITY_MATRIX = 'model1_cosine_matrix.pkl'

# Model 2: same three identifiers.
NR2_MODEL_NAME = 'model2'
MODEL2_FILE_NAME = 'model2_language_model.model'
MODEL2_COSINE_SIMILARITY_MATRIX = 'model2_cosine_matrix.pkl'

# Model 3: same three identifiers.
NR3_MODEL_NAME = 'model3'
MODEL3_FILE_NAME = 'model3_language_model.model'
MODEL3_COSINE_SIMILARITY_MATRIX = 'model3_cosine_matrix.pkl'

## Data preprocessing
- data cleanup
- convert the corpus into spaCy documents


## Experiment No. 1: language model based on:
- PWDB
- eu-timeline
- ireland-timeline

## Experiment No. 2: language model based on:
- eu-cellar

## Experiment No. 3: language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar


In [3]:
# Fetch the four source datasets through the project's data registry.
# NOTE(review): what fetch() returns (and where it reads from) is defined in
# sem_covid.services.data_registry — confirm there before relying on its type.
ds_pwdb = Dataset.PWDB.fetch()
ds_eu_action_timeline = Dataset.EU_ACTION_TIMELINE.fetch()
ds_ireland_action_timeline = Dataset.IRELAND_ACTION_TIMELINE.fetch()
# The "enriched" variant of the EU Cellar dataset is the one used for models 2 and 3.
ds_eu_cellar = Dataset.EU_CELLAR_ENRICHED.fetch()

100% (1288 of 1288) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (210 of 210) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1859 of 1859) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (2653 of 2653) |####################| Elapsed Time: 0:00:01 Time:  0:00:01


In [4]:
# Each config is a list of (dataset, column names) pairs handed to
# train_language_model(); the column lists name the fields used from each dataset.

# Model 1: PWDB + EU action timeline + Ireland action timeline.
model1_dataset_sources_config = [
    (ds_pwdb, PWDB_TEXTUAL_CLASS),
    (ds_eu_action_timeline, DEFAULT_TEXTUAL_COLUMN),
    (ds_ireland_action_timeline, DEFAULT_TEXTUAL_COLUMN)
]

# Model 2: EU Cellar only.
model2_dataset_sources_config = [
    (ds_eu_cellar, DEFAULT_TEXTUAL_COLUMN),
]

# Model 3: all four datasets.
# NOTE(review): EU Cellar is listed before the Ireland timeline here, so this is
# not simply model 1's list followed by model 2's — confirm whether source
# order matters to training.
model3_dataset_sources_config = [
    (ds_pwdb, PWDB_TEXTUAL_CLASS),
    (ds_eu_action_timeline, DEFAULT_TEXTUAL_COLUMN),
    (ds_eu_cellar, DEFAULT_TEXTUAL_COLUMN),
    (ds_ireland_action_timeline, DEFAULT_TEXTUAL_COLUMN)
]

In [5]:
%%time

# Build the model-1 pipeline, configured with its language-model file name and model name.
model1_execution_steps = LanguageModelExecutionSteps(language_model_file_name=MODEL1_FILE_NAME, model_name=NR1_MODEL_NAME)
# Train on the model-1 sources (PWDB, EU action timeline, Ireland action timeline).
model1_execution_steps.train_language_model(model1_dataset_sources_config)
# NOTE(review): presumably computes and stores the similarity matrices that the
# graph cells later load from MinIO — confirm in LanguageModelExecutionSteps.
model1_execution_steps.train_similarity_matrices()

CPU times: user 3min 56s, sys: 681 ms, total: 3min 56s
Wall time: 3min 49s


In [6]:
%%time

# Build the model-2 pipeline, configured with its language-model file name and model name.
model2_execution_steps = LanguageModelExecutionSteps(language_model_file_name=MODEL2_FILE_NAME, model_name=NR2_MODEL_NAME)
# Train on the model-2 source (EU Cellar only).
model2_execution_steps.train_language_model(model2_dataset_sources_config)
# NOTE(review): presumably computes and stores the similarity matrices that the
# graph cells later load from MinIO — confirm in LanguageModelExecutionSteps.
model2_execution_steps.train_similarity_matrices()

CPU times: user 1min 5s, sys: 32.2 ms, total: 1min 5s
Wall time: 1min 4s


In [7]:
%%time

# Build the model-3 pipeline, configured with its language-model file name and model name.
model3_execution_steps = LanguageModelExecutionSteps(language_model_file_name=MODEL3_FILE_NAME, model_name=NR3_MODEL_NAME)
# Train on the model-3 sources (all four datasets).
model3_execution_steps.train_language_model(model3_dataset_sources_config)
# NOTE(review): presumably computes and stores the similarity matrices that the
# graph cells later load from MinIO — confirm in LanguageModelExecutionSteps.
model3_execution_steps.train_similarity_matrices()

CPU times: user 4min 51s, sys: 312 ms, total: 4min 51s
Wall time: 4min 43s


### Generate D3 Graphs

#### Cosine similarity graph

In [8]:
def plot_graphs(pipeline: LanguageModelExecutionSteps, model_name: str, model_file_name: str,
                threshold: float, word_graph_configs: dict, normalize_func) -> None:
    """
    Generate the D3 similarity graphs for every word group of a language model.

    The similarity matrix is loaded from the MinIO feature store, normalized
    element-wise once, and then handed to the graph builder together with the
    selected key words of each word group.

    Args:
        pipeline: language model execution steps, used to filter the model's
            words and select the key words of each group
        model_name: the name of the model, forwarded to the graph builder
        model_file_name: file name of the similarity matrix stored in the
            SIMILARITY_MATRIX_BUCKET_NAME MinIO bucket (not the word2vec model file)
        threshold: the minimum similarity value, forwarded as ``metric_threshold``
        word_graph_configs: dictionary mapping a group (column) name to its
            list of key words
        normalize_func: function applied to every cell of the similarity matrix
    """
    feature_store = store_registry.minio_feature_store(SIMILARITY_MATRIX_BUCKET_NAME)
    model_cosine_matrix = feature_store.get_features(model_file_name)
    if not word_graph_configs:
        # No word groups, no graphs: skip the element-wise normalization entirely.
        return

    # The normalized matrix is the same for every word group, so compute it once
    # instead of re-running the element-wise pass on each loop iteration.
    # NOTE(review): assumes the graph builder does not modify the matrix it
    # receives — confirm in create_graph_for_language_model_key_words.
    normalized_matrix = model_cosine_matrix.applymap(normalize_func)

    for column_name, key_words in word_graph_configs.items():
        selected_key_words = pipeline.filter_language_model_words().select_key_words(key_words=key_words)
        create_graph_for_language_model_key_words(normalized_matrix,
                                                  selected_key_words,
                                                  model_name=model_name,
                                                  metric_threshold=threshold,
                                                  column_name=column_name)

In [10]:
# Progress marker printed before the graphs are generated.
print('start')
# Generate the model-1 graphs for every word group in MODEL1_AND_2_WORDS.
# threshold=0.6 is forwarded as the graph's minimum similarity value.
# normalize_func maps every matrix value v to 1 - v.
# NOTE(review): presumably this turns a stored cosine distance into a similarity — confirm.
plot_graphs(pipeline=model1_execution_steps,
            model_name=NR1_MODEL_NAME,
            model_file_name=MODEL1_COSINE_SIMILARITY_MATRIX,
            threshold=0.6,
            word_graph_configs=MODEL1_AND_2_WORDS,
            normalize_func=lambda x: 1 - x)

start
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/retention.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/workplace.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/labour.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/recovery.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/adaptation.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/protection.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/essential.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/services.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/social.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/market.html
Writing /home

In [9]:
# Progress marker printed before the graphs are generated.
print('start')
# Generate the model-2 graphs for every word group in MODEL1_AND_2_WORDS.
# threshold=0.6 is forwarded as the graph's minimum similarity value.
# normalize_func maps every matrix value v to 1 - v.
# NOTE(review): presumably this turns a stored cosine distance into a similarity — confirm.
plot_graphs(pipeline=model2_execution_steps,
            model_name=NR2_MODEL_NAME,
            model_file_name=MODEL2_COSINE_SIMILARITY_MATRIX,
            threshold=0.6,
            word_graph_configs=MODEL1_AND_2_WORDS,
            normalize_func=lambda x: 1 - x)

start
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/recovery.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/protection.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/services.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/social.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/market.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/subcategory/safety.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/subcategory/arrangements.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/subcategory/health.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/subcategory/working.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/subcategory/access.html
Writ

In [11]:
# Progress marker printed before the graphs are generated.
print('start')
# Generate the model-3 graphs for the single 'keywords' group in MODEL3_WORDS.
# threshold=0.6 is forwarded as the graph's minimum similarity value.
# normalize_func maps every matrix value v to 1 - v.
# NOTE(review): presumably this turns a stored cosine distance into a similarity — confirm.
plot_graphs(pipeline=model3_execution_steps,
            model_name=NR3_MODEL_NAME,
            model_file_name=MODEL3_COSINE_SIMILARITY_MATRIX,
            threshold=0.6,
            word_graph_configs=MODEL3_WORDS,
            normalize_func=lambda x: 1 - x)

start
[d3graph] >Creating directory [docs/word-similarity-web/model3_graphs/keywords/]
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/keywords/aid.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/keywords/covid19.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/keywords/measures.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/keywords/vaccine.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/keywords/government.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/keywords/organisations.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/keywords/agreement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/keywords/unemployment.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/keywords/insurance.html
Writing /home/jovyan/work