## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract nouns and noun phrases from each dataset's text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [1]:
import sys

# Make the project checkout importable from the notebook kernel.
sys.path.append("/home/jovyan/work/sem-covid/")
# De-duplicate while preserving the search order: dict.fromkeys keeps the first
# occurrence of every entry, whereas set() would shuffle sys.path and could
# change which module wins when two locations provide the same name.
sys.path = list(dict.fromkeys(sys.path))

import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import warnings

warnings.filterwarnings("ignore")

import numpy as np

from sem_covid.services.store_registry import store_registry
from sem_covid.services.language_model_execution_steps import LanguageModelExecutionSteps
from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.graph_handling import (
    create_graph_for_language_model_key_words)
from typing import List
import time

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


# economic
## Define constants

In [2]:
# Column names paired with each dataframe in MODEL_DATASET_SOURCES_CONFIGS.
DEFAULT_TEXTUAL_CLASS = ['Title', 'Content']

# Per-dataset textual column names.
PWDB_TEXTUAL_CLASS = [
    'title', 'background_info_description', 'content_of_measure_description',
    'country', 'category', 'subcategory', 'target_groups',
]

EU_CELLAR_TEXTUAL_CLASS = [
    'title', 'content', 'eurovoc_concept_labels',
    'subject_matter_labels', 'directory_codes_labels',
]

IRELAND_ACTION_TIMELINE_CLASS = ['title', 'content', 'keyword']

EU_ACTION_TIMELINE_CLASS = ['abstract', 'title', 'topics', 'detail_content']

# Key-word groups; each group is used as one entry of a words pack below.
KEY_WORDS_FOR_ALL_MODELS = [
    'eu', 'national', 'work', 'aid', 'coronavirus', 'covid19',
    'measures', 'vaccine', 'minister', 'government', 'organisations',
    'agreement', 'unemployment', 'insurance', 'reorientation',
    'economy', 'economic', 'innovation', 'research', 'development',
    'risk', 'transport',
]

COUNTRIES = [
    'austria', 'belgium', 'bulgaria', 'croatia', 'cyprus', 'czechia',
    'denmark', 'estonia', 'european_union', 'finland', 'france', 'germany',
    'greece', 'hungary', 'ireland', 'italy', 'latvia', 'lithuania',
    'luxembourg', 'malta', 'netherlands', 'norway', 'poland', 'portugal',
    'romania', 'slovakia', 'slovenia', 'spain', 'sweden', 'united_kingdom',
]

CATEGORY = [
    'retention', 'workplace', 'labour', 'recovery', 'adaptation', 'protection',
    'essential', 'business_continuity', 'services', 'social', 'market',
]

SUBCATEGORY = [
    'safety', 'arrangements', 'health', 'spending', 'working', 'support',
    'occupational', 'stimulus_packages', 'access', 'time', 'finance',
    'remote', 'flexibility', 'essential_services', 'remuneration',
]

TARGET_GROUPS_L1 = ['businesses', 'workers', 'citizens']

TARGET_GROUPS_L2 = [
    'company', 'older', 'people', 'female', 'aged', 'corporations', 'single',
    'person', 'forms', 'smes', 'ups', 'single_parents', 'citizens',
    'professions', 'parents', 'groups', 'youth', 'sector', 'women',
    'unemployed', 'care', 'facilities', 'standard', 'specific', 'contractors',
    'children', 'border', 'refugees', 'minors', 'platform', 'employment',
    'seasonal', 'disabled', 'migrants', 'risk_group', 'commuters',
]

FUNDING = [
    'companies', 'national_funds', 'employer', 'funds', 'european_funds',
    'no_special_funding_required', 'regional_funds', 'local_funds',
    'employers_organization', 'employees',
]

# Words packs map a graph column name to the key words plotted for it.
WORDS_PACK1 = dict(
    category=CATEGORY,
    subcategory=SUBCATEGORY,
    countries=COUNTRIES,
    target_groups_l1=TARGET_GROUPS_L1,
    target_groups_l2=TARGET_GROUPS_L2,
    funding=FUNDING,
)

WORDS_PACK2 = dict(keywords=KEY_WORDS_FOR_ALL_MODELS)

# One words pack per model, in MODEL_NAMES order.
# NOTE(review): model1 and model2 both use WORDS_PACK1 - confirm this is intended.
MODEL_WORDS_PACKS = (WORDS_PACK1, WORDS_PACK1, WORDS_PACK2)

MODEL_NAMES = ('model1', 'model2', 'model3')

# Per-model artefact names, derived from MODEL_NAMES so the three stay aligned.
FILE_NAMES = tuple(f'{name}_language_model.model' for name in MODEL_NAMES)

SIMILARITY_MATRIX_BUCKET_NAME = 'semantic-similarity-matrices'

COSINE_SIMILARITY_MATRICES = tuple(f'{name}_cosine_matrix.pkl' for name in MODEL_NAMES)

## Data preprocessing
- data cleanup
- turn the corpus into spaCy documents


## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline

## Experiment Nr#2 language model based on:
- eu-cellar

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar


In [3]:
# Fetch the 'ds_unified_datasets' index as a dataframe
# (the "es" store is presumably Elasticsearch - confirm in store_registry).
unified_index_store = store_registry.es_index_store()
ds_unified = unified_index_store.get_dataframe('ds_unified_datasets')

100% (4126 of 4126) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [4]:
# Experiment 3: the whole unified dataset (an alias of ds_unified, not a copy).
model3_df = ds_unified
# Experiment 2: only the rows whose Document_source is "eu_cellar".
model2_df = ds_unified.query('Document_source == "eu_cellar"')
# Experiment 1: every row whose Document_source is not "eu_cellar".
model1_df = ds_unified.query('Document_source != "eu_cellar"')

In [5]:
# One entry per model, in MODEL_NAMES order: a single-item list holding the
# (dataframe, textual columns) pair for that model.
MODEL_DATASET_SOURCES_CONFIGS = tuple(
    [(model_df, DEFAULT_TEXTUAL_CLASS)]
    for model_df in (model1_df, model2_df, model3_df)
)

### Generate D3 Graphs

#### Cosine similarity graph

In [6]:
def plot_graphs(pipeline: LanguageModelExecutionSteps, model_name: str, model_file_name: str,
                threshold: np.float64, word_graph_configs: dict, normalize_func) -> None:
    """
    Create one D3 graph per key-word group from a stored similarity matrix.

    The matrix is fetched from the MinIO feature store bucket
    ``SIMILARITY_MATRIX_BUCKET_NAME``, every cell is passed through
    ``normalize_func``, and a graph is then generated for each entry of
    ``word_graph_configs``.

    Args:
        pipeline: language model execution steps, used to select the key words
        model_name: the name of the model, forwarded to the graph builder
        model_file_name: name of the feature file to fetch from the bucket
        threshold: forwarded to the graph builder as ``metric_threshold``
        word_graph_configs: maps a graph column name to its list of key words
        normalize_func: function applied element-wise to the similarity matrix
    """
    feature_store = store_registry.minio_feature_store(SIMILARITY_MATRIX_BUCKET_NAME)
    similarity_matrix = feature_store.get_features(model_file_name).applymap(normalize_func)

    for column_name, key_words in word_graph_configs.items():
        # The word filter is rebuilt for every group, exactly once per graph.
        selected_words = pipeline.filter_language_model_words().select_key_words(key_words=key_words)
        create_graph_for_language_model_key_words(similarity_matrix,
                                                  selected_words,
                                                  model_name=model_name,
                                                  metric_threshold=threshold,
                                                  column_name=column_name)

In [12]:
def execute_language_model_pipeline(model_file_name: str,
                                    model_name: str,
                                    model_dataset_sources_config: List[tuple],
                                    ) -> None:
    """
    Train one language model and its similarity matrices, printing the elapsed time.

    Args:
        model_file_name: passed to LanguageModelExecutionSteps as
            ``language_model_file_name``
        model_name: the name of the model; also used in the progress messages
        model_dataset_sources_config: list of (dataframe, textual columns)
            pairs handed to ``train_language_model``
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during a long training run (time.time can be).
    start = time.perf_counter()
    print(f'Start execution for {model_name}:')
    model_execution_steps = LanguageModelExecutionSteps(language_model_file_name=model_file_name,
                                                        model_name=model_name)
    model_execution_steps.train_language_model(model_dataset_sources_config)
    model_execution_steps.train_similarity_matrices()
    # NOTE: graph generation is currently disabled. It used to call plot_graphs
    # here with a words pack, the cosine-matrix file name, threshold=0.6 and
    # normalize_func=lambda x: 1 - x.
    # Drop the local reference before reporting; presumably so the pipeline
    # object can be reclaimed as early as possible.
    del model_execution_steps
    elapsed = time.perf_counter() - start
    print(f'Execution finish for {model_name} in:')
    print(round(elapsed, 4), 'seconds')



In [None]:
# The [1:] slices skip the first entry of each tuple, so only model2 and
# model3 are trained by this cell. Words packs and cosine-matrix names are not
# passed because the pipeline function does not take them at present.
for model_file_name, model_name, model_dataset_sources_config in zip(
    FILE_NAMES[1:], MODEL_NAMES[1:], MODEL_DATASET_SOURCES_CONFIGS[1:]
):
    execute_language_model_pipeline(
        model_file_name=model_file_name,
        model_name=model_name,
        model_dataset_sources_config=model_dataset_sources_config,
    )


Start execution for model2:
Start computing similarity matrix.


In [9]:
# Scratch cell: prints a literal marker string (presumably a quick check that
# the kernel is responsive); it does not touch the models or the data.
print('12')

12


In [None]:
# NOTE(review): in gensim 4.x this class is exposed from ``gensim.similarities``
# rather than ``gensim.models`` - confirm against the installed gensim version.
from gensim.models import WordEmbeddingSimilarityIndex

# Bare expression: as the last statement of a notebook cell it displays the
# class object as the cell output.
WordEmbeddingSimilarityIndex