## Word2vec model training
#### Three models are trained, each on the text data of a different combination of datasets:
- M1: pwdb + eu_timeline + ireland_timeline
- M2: ds_eu_cellar
- M3: M1 + M2

#### Extract NOUNs and NOUN PHRASES from the text data of each dataset
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [2]:
import sys
sys.path.append("/home/jovyan/work/sem-covid/")
# Drop duplicate entries while keeping the search order intact: going through
# set() would shuffle sys.path and could change which module an import resolves to.
sys.path = list(dict.fromkeys(sys.path))

import os
# Work from the same directory that was just made importable.
os.chdir('/home/jovyan/work/sem-covid/')

import warnings
warnings.filterwarnings("ignore")

from typing import List

import numpy as np
import pandas as pd

from sem_covid.services.data_registry import Dataset
from sem_covid.services.store_registry import store_registry
from sem_covid.services.language_model_execution_steps import LanguageModelExecutionSteps
from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.graph_handling import create_graph_for_language_model_key_words

2021-08-25 09:27:31.212273: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-25 09:27:31.212325: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


## Define constants

In [3]:
# Columns of the PWDB dataset that are treated as textual data
# (paired with Dataset.PWDB in the model source configs).
PWDB_TEXTUAL_CLASS = [
    "title",
    "background_info_description",
    "content_of_measure_description",
    "use_of_measure_description",
    "involvement_of_social_partners_description",
]

# Textual column used for every dataset other than PWDB.
DEFAULT_TEXTUAL_COLUMN = ["title"]

# Key words passed to the graph configs of model 3.
KEY_WORDS_FOR_ALL_MODELS = [
    "work", "aid", "coronavirus", "covid19", "health", "measures", "vaccine",
    "spain", "italy", "ireland", "government", "sector", "organisations",
    "companies", "businesses", "workers", "citizens", "employees", "unemployment",
    "protection", "support", "insurance", "reorientation", "adaptation", "economy",
    "economic", "social", "funds", "innovation", "research", "development",
    "risk", "transport",
]

# Key words passed to the graph configs of model 1 and model 2.
KEY_WORDS_FOR_MODEL1_AND_2 = [
    "work", "agreement", "measures", "temporary", "covid19",
    "public", "national", "statement", "announce", "minister", "coronavirus",
    "vaccine", "commission",
]



# Word groups offered to the graph generation; each list becomes one entry of
# the dictionary built by call_word_graph_configs.
COUNTRIES = [
    "austria", "belgium", "bulgaria", "croatia", "cyprus", "czechia",
    "denmark", "estonia", "european_union", "finland", "france", "germany",
    "greece", "hungary", "ireland", "italy", "latvia", "lithuania",
    "luxembourg", "malta", "netherlands", "norway", "poland", "portugal",
    "romania", "slovakia", "slovenia", "spain", "sweden", "united_kingdom",
]

CATEGORY = [
    "retention", "workplace", "labour", "recovery", "economic", "adaptation",
    "businesses", "protection", "essential", "workers", "business_continuity",
    "services", "social", "market",
]

SUBCATEGORY = [
    "safety", "arrangements", "health", "spending", "working", "support",
    "occupational", "stimulus_packages", "access", "time", "finance", "remote",
    "flexibility", "workers", "essential_services", "remuneration",
]

TARGET_GROUPS_L1 = ["businesses", "workers", "citizens"]

TARGET_GROUPS_L2 = [
    "company", "older", "people", "female", "aged", "corporations", "businesses",
    "single", "person", "forms", "smes", "ups", "single_parents",
    "citizens", "professions", "parents", "groups", "youth", "workers",
    "essential_services", "sector", "women", "workplace", "unemployed", "care",
    "facilities", "standard", "specific", "companies", "contractors", "children",
    "border", "refugees", "minors", "platform", "employment", "seasonal",
    "disabled", "migrants", "risk_group", "commuters", "employees",
]

FUNDING = [
    "companies", "national_funds", "employer", "funds", "european_funds",
    "no_special_funding_required", "regional_funds", "local_funds",
    "employers_organization", "employees",
]

def call_word_graph_configs(key_words_list: List[str]) -> dict:
    """
        Assemble the word groups used for graph generation.
    Args:
        key_words_list: words stored under the 'key_words' entry
    Returns:
        dictionary mapping each group name to its list of words; every group
        except 'key_words' comes from the module-level constants
    """
    # keyword arguments keep their order, so the groups are iterated as listed here
    return dict(category=CATEGORY,
                key_words=key_words_list,
                subcategory=SUBCATEGORY,
                countries=COUNTRIES,
                target_groups_l1=TARGET_GROUPS_L1,
                target_groups_l2=TARGET_GROUPS_L2,
                funding=FUNDING)

# Names of the three language models (one per experiment)
NR1_MODEL_NAME, NR2_MODEL_NAME, NR3_MODEL_NAME = 'model1', 'model2', 'model3'

# File names of the language models, in the same order
MODEL1_FILE_NAME, MODEL2_FILE_NAME, MODEL3_FILE_NAME = (
    'model1_language_model.model',
    'model2_language_model.model',
    'model3_language_model.model',
)

# Names of the similarity metrics
COSINE_METRIC_NAME, EUCLIDEAN_METRIC_NAME, HAMMING_METRIC_NAME = (
    'cosine_matrix',
    'euclidean_matrix',
    'hamming_matrix',
)

## Data preprocessing
- data cleanup
- turn corpus into spacy document


In [4]:
def plot_graphs(pipeline: LanguageModelExecutionSteps, model_name: str, model_file_name: str, metric_name: str,
                threshold: np.float64, word_graph_configs: dict, normalize_func) -> None:
    """
        Generate one d3 graph per word group from a stored similarity matrix.

        The matrix is fetched from the 'semantic-similarity-matrices' MinIO object store,
        parsed from JSON into a DataFrame and normalized cell by cell. A graph is then
        created for every entry of word_graph_configs.
    Args:
        pipeline: pipeline of language model execution steps, used to select the key words
        model_name: the name of the model
        model_file_name: name of the similarity matrix JSON object in MinIO
        metric_name: the name of the used metric
        threshold: the minimum similarity value (forwarded as metric_threshold)
        word_graph_configs: maps a group name (forwarded as column_name) to its list of key words
        normalize_func: function applied to every cell of the similarity matrix
    """
    stored_matrix = store_registry.minio_object_store('semantic-similarity-matrices').get_object(model_file_name)
    # Parsing and normalizing do not depend on the word group, so do both once
    # instead of repeating them for every graph.
    normalized_matrix = pd.read_json(stored_matrix).applymap(normalize_func)
    for group_name, group_words in word_graph_configs.items():
        selected_words = pipeline.filter_language_model_words().select_key_words(key_words=group_words)
        # Hand each graph its own copy so that no graph sees changes made by another one.
        create_graph_for_language_model_key_words(normalized_matrix.copy(),
                                                  selected_words,
                                                  model_name=model_name, metrics_name=metric_name,
                                                  metric_threshold=threshold, column_name=group_name)

## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline

## Experiment Nr#2 language model based on:
- eu-cellar

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar


In [5]:
# Each source pairs a dataset with the columns that hold its textual data.
_pwdb_source = (Dataset.PWDB, PWDB_TEXTUAL_CLASS)
_eu_timeline_source = (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
_ireland_timeline_source = (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
_eu_cellar_source = (Dataset.EU_CELLAR_ENRICHED, DEFAULT_TEXTUAL_COLUMN)

# Experiment Nr#1: PWDB and both timelines
model1_dataset_sources_config = [_pwdb_source, _eu_timeline_source, _ireland_timeline_source]

# Experiment Nr#2: eu-cellar only
model2_dataset_sources_config = [_eu_cellar_source]

# Experiment Nr#3: all four datasets
model3_dataset_sources_config = [_pwdb_source, _eu_timeline_source, _eu_cellar_source, _ireland_timeline_source]

In [6]:
# Execution steps for experiment Nr#1. The training calls are left commented out;
# presumably the model and its similarity matrices are already stored - TODO confirm.
model1_execution_steps = LanguageModelExecutionSteps(
    model_name=NR1_MODEL_NAME,
    language_model_file_name=MODEL1_FILE_NAME,
)
# model1_execution_steps.train_language_model(model1_dataset_sources_config)
# model1_execution_steps.train_similarity_matrices()

In [7]:
# Execution steps for experiment Nr#2. The training calls are left commented out;
# presumably the model and its similarity matrices are already stored - TODO confirm.
model2_execution_steps = LanguageModelExecutionSteps(
    model_name=NR2_MODEL_NAME,
    language_model_file_name=MODEL2_FILE_NAME,
)
# model2_execution_steps.train_language_model(model2_dataset_sources_config)
# model2_execution_steps.train_similarity_matrices()

In [8]:
# Execution steps for experiment Nr#3. The training calls are left commented out;
# presumably the model and its similarity matrices are already stored - TODO confirm.
model3_execution_steps = LanguageModelExecutionSteps(
    model_name=NR3_MODEL_NAME,
    language_model_file_name=MODEL3_FILE_NAME,
)
# model3_execution_steps.train_language_model(model3_dataset_sources_config)
# model3_execution_steps.train_similarity_matrices()

### Generate D3 Graphs

#### Cosine similarity graph

In [None]:
# Cosine graphs for model 1: every stored matrix value x is replaced by 1 - x,
# and 0.6 is forwarded as the graph threshold.
plot_graphs(
    pipeline=model1_execution_steps,
    model_name=NR1_MODEL_NAME,
    model_file_name='model1_cosine_matrix.json',
    metric_name=COSINE_METRIC_NAME,
    threshold=0.6,
    word_graph_configs=call_word_graph_configs(KEY_WORDS_FOR_MODEL1_AND_2),
    normalize_func=lambda x: 1 - x,
)

In [None]:
# Cosine graphs for model 2: every stored matrix value x is replaced by 1 - x,
# and 0.6 is forwarded as the graph threshold.
plot_graphs(
    pipeline=model2_execution_steps,
    model_name=NR2_MODEL_NAME,
    model_file_name='model2_cosine_matrix.json',
    metric_name=COSINE_METRIC_NAME,
    threshold=0.6,
    word_graph_configs=call_word_graph_configs(KEY_WORDS_FOR_MODEL1_AND_2),
    normalize_func=lambda x: 1 - x,
)

In [None]:
# Cosine graphs for model 3, using the key words shared by all models: every stored
# matrix value x is replaced by 1 - x, and 0.6 is forwarded as the graph threshold.
plot_graphs(
    pipeline=model3_execution_steps,
    model_name=NR3_MODEL_NAME,
    model_file_name='model3_cosine_matrix.json',
    metric_name=COSINE_METRIC_NAME,
    threshold=0.6,
    word_graph_configs=call_word_graph_configs(KEY_WORDS_FOR_ALL_MODELS),
    normalize_func=lambda x: 1 - x,
)