## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract nouns and noun phrases from each dataset's text
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [1]:
import sys

# Make the project root importable from the notebook.
sys.path.append("/home/jovyan/work/sem-covid/")
# Drop duplicate entries while keeping the first-seen order: sys.path is searched
# in order, so de-duplicating through a set would scramble import resolution.
sys.path = list(dict.fromkeys(sys.path))

import os

# NOTE(review): the return value is discarded, so this call has no effect.
os.getcwd()
# Run from the project root; the relative 'docs/word-similarity-web/...' output
# folder used when writing graphs below is resolved against this directory.
os.chdir('/home/jovyan/work/sem-covid/')
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

import re
import pickle
from typing import List, Tuple

import spacy

# Shared spaCy pipeline, used below to parse the documents and the model vocabulary.
nlp = spacy.load('en_core_web_sm')
# Raise the maximum text length spaCy accepts to 1,500,000 characters.
nlp.max_length = 1500000

import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from d3graph import d3graph

from sem_covid.services.data_registry import Dataset
from sem_covid.services.store_registry import store_registry
from sem_covid.adapters.data_source import IndexTabularDataSource

from sem_covid.entrypoints.notebooks.topic_modeling.topic_modeling_wrangling.token_management import select_pos

from sem_covid.services.sc_wrangling.data_cleaning import (clean_text_from_specific_characters, clean_fix_unicode,
                                                           clean_remove_currency_symbols, clean_remove_emails,
                                                           clean_remove_urls, clean_remove_stopwords)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.similarity_calculus import build_similarity_matrix

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.document_handling_tools import (
    document_atomization_noun_phrases, lemmatize_document)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


## Define constants

In [2]:
# Textual columns of the PWDB dataset that are joined into one document per row.
PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'use_of_measure_description', 'involvement_of_social_partners_description']

# Textual column used for every other dataset.
DEFAULT_TEXTUAL_COLUMN = ['title']

# Word2Vec hyper-parameters passed to the model constructor.
WINDOW = 5
MIN_COUNT = 1
VECTOR_SIZE = 300

# NOTE(review): the constants below are not referenced by any code visible in this notebook.
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

# Word groups used as graph roots. Each word produces one graph file named after it,
# so every word is listed once per group (a repeated word only rebuilds and
# overwrites the same file).
KEY_WORDS = ['work', 'agreement', 'working', 'companies', 'workers',
             'measures', 'temporary', 'social', 'support', 'covid19',
             '2020', 'public', 'national', 'ireland', 'statement', '2021',
             'announce', 'health', 'minister', 'new', 'billion', 'coronavirus',
             'vaccine', 'eur', 'million', 'commission', 'eu']

COUNTRIES = ['austria', 'belgium', 'bulgaria', 'croatia', 'cyprus', 'czechia', 'denmark', 'estonia',
             'european_union', 'finland', 'france', 'germany', 'greece', 'hungary', 'ireland', 'italy',
             'latvia', 'lithuania', 'luxembourg', 'malta', 'netherlands', 'norway', 'poland', 'portugal',
             'romania', 'slovakia', 'slovenia', 'spain', 'sweden', 'united', 'kingdom']

CATEGORY = ['retention', 'workplace', 'labour', 'recovery', 'economic', 'adaptation',
            'businesses', 'protection', 'essential', 'workers', 'business_continuity',
            'services', 'social', 'market']

SUBCATEGORY = ['safety', 'arrangements', 'health', 'spending', 'working', 'support', 'occupational',
               'stimulus_packages', 'access', 'time', 'finance', 'remote', 'flexibility', 'workers',
               'essential_services', 'remuneration']

TARGET_GROUPS_L1 = ['businesses', 'workers', 'citizens']

TARGET_GROUPS_L2 = ['company', 'older', 'people', 'female', 'aged', 'corporations', 'businesses',
                    'single', 'person', 'larger', 'forms', 'smes', 'ups', 'non', 'single_parents',
                    'citizens', 'professions', 'parents', 'groups', 'youth', 'workers', 'essential_services',
                    'sector', 'women', 'workplace', 'unemployed', 'care', 'facilities', 'other', 'standard',
                    'specific', 'companies', 'self', 'contractors', 'children', 'border', 'solo', 'refugees',
                    'minors', 'cross', 'platform', 'employment', 'seasonal', 'disabled', 'migrants', 'start',
                    'risk_group', 'commuters', 'employees']

FUNDING = ['companies', 'national_funds', 'employer', 'funds']

# Maps the output sub-folder name of each graph group to its root words.
WORD_GRAPH_CONFIGS = {'category': CATEGORY,
                      'subcategory': SUBCATEGORY,
                      'key_words': KEY_WORDS,
                      'countries': COUNTRIES,
                      'target_groups_l1': TARGET_GROUPS_L1,
                      'target_groups_l2': TARGET_GROUPS_L2,
                      'funding': FUNDING
                      }

# Short model names, used in matrix file names and graph folder names.
NR1_MODEL_NAME = 'model1'
NR2_MODEL_NAME = 'model2'
NR3_MODEL_NAME = 'model3'

# Object names under which the trained models are stored.
MODEL1_FILE_NAME = 'model1_language_model.model'
MODEL2_FILE_NAME = 'model2_language_model.model'
MODEL3_FILE_NAME = 'model3_language_model.model'

## Data preprocessing
- data cleanup
- turn corpus into spacy document

In [3]:
def add_space_between_dots_and_commas(text: str):
    """
    Inserts a single space after every dot or comma that is immediately
    followed by a non-whitespace character.
    Args:
        text: the text to fix

    Returns: the text with the missing spaces added
    """
    # Zero-width match: just after '.' or ',' and just before a non-space character.
    missing_space = re.compile(r'(?<=[.,])(?=[^\s])')
    return missing_space.sub(r' ', text)


def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    Runs every document of the corpus through the cleaning steps, in a fixed order:
    strip unwanted characters, fix unicode, remove URLs, e-mails, currency symbols
    and stopwords, then add the missing space after dots and commas.
    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    unused_characters = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                         "[", "]", "-", "_", "\r", '®', '..']

    # The first step is the only one that takes an extra argument.
    cleaned_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=unused_characters)

    # The remaining steps are plain text -> text functions applied one after another.
    remaining_steps = (clean_fix_unicode,
                       clean_remove_urls,
                       clean_remove_emails,
                       clean_remove_currency_symbols,
                       clean_remove_stopwords,
                       add_space_between_dots_and_commas)
    for cleaning_step in remaining_steps:
        cleaned_corpus = cleaned_corpus.apply(cleaning_step)

    return cleaned_corpus


def generate_graph(similarity_matrix: pd.DataFrame, graph: nx.Graph, root_word: str,
                   top_words: int, threshold: np.float64 = 0.8, deep_level: int = 0,
                   max_deep_level: int = 2, deep_map: dict = None, color_map: dict = None) -> nx.Graph:
    """
        Recursively grows a word graph: the root word is linked to its most similar
        words, and each of those words is expanded in turn until the maximum depth
        is passed.
    Args:
        similarity_matrix: Dataframe with word similarity, indexed by word on both axes
        graph: networkx graph that receives the edges (modified in place)
        root_word: the word to expand; must be a column of the similarity matrix,
                   otherwise the column lookup raises KeyError
        top_words: number of most similar words considered for each expanded word
        threshold: minimum similarity for an edge to be added
        deep_level: the depth of the current word (0 for the starting word)
        max_deep_level: words deeper than this level are recorded but not expanded
        deep_map: dictionary word -> (level, color), filled in place with the shallowest
                  level at which each word was reached; a new one is created when omitted
        color_map: dictionary level -> color; when omitted the stored color is None

    Returns: the same networkx graph that was passed in
    """
    # The default is None: start a fresh map instead of failing on the first lookup.
    if deep_map is None:
        deep_map = {}

    # Record the word, keeping the shallowest level at which it was reached.
    if root_word not in deep_map or deep_map[root_word][0] > deep_level:
        level_color = color_map[deep_level] if color_map is not None else None
        deep_map[root_word] = (deep_level, level_color)

    if deep_level > max_deep_level:
        return graph

    # Rank the column once and use it for both the neighbour names and their weights.
    # NOTE(review): if a word's similarity to itself is the column maximum, the word is
    # its own first neighbour and gets a self-loop - confirm this is intended.
    closest_words = similarity_matrix[root_word].sort_values(ascending=False)[:top_words]
    for neighbour, weight in closest_words.items():
        if weight >= threshold:
            graph.add_edge(root_word, neighbour)
            generate_graph(similarity_matrix, graph, neighbour, top_words, threshold, deep_level + 1,
                           max_deep_level,
                           deep_map=deep_map, color_map=color_map)

    return graph


def create_graph_for_language_model_key_words(similarity_matrix: pd.DataFrame, language_model_words: list,
                                              model_name: str, column_name: str,
                                              metric_threshold: np.float64) -> None:
    """
    !!! This is not reusable function. It was made for a single thing !!!

    For every selected language model word it builds a similarity graph rooted at
    that word and writes it with d3graph into
    'docs/word-similarity-web/<model_name>_graphs/<column_name>/', using the word
    as the file name.
    Args:
        similarity_matrix: Dataframe with word similarity
        language_model_words: root words, one graph is generated per word
        model_name: the name of the model, used in the output folder name
        column_name: the name of the word group, used as output sub-folder
        metric_threshold: minimum similarity for an edge to be drawn

    Returns: nothing, the graphs are written to disk
    """
    graph_folder_path = f'docs/word-similarity-web/{model_name}_graphs/{column_name}/'
    # One color per depth level, from the root (darkest) to the deepest leaves.
    color_map = {0: '#a70000',
                 1: '#ff0000',
                 2: '#ff7b7b',
                 3: '#ffbaba'}
    for root_word in language_model_words:
        # Filled by generate_graph with word -> (level, color).
        deep_map = {}
        graph = generate_graph(similarity_matrix, nx.Graph(), root_word,
                               top_words=4, threshold=metric_threshold,
                               max_deep_level=2, deep_map=deep_map, color_map=color_map)
        network_adjacency_matrix = pd.DataFrame(data=nx.adjacency_matrix(graph).todense(),
                                                index=graph.nodes(), columns=graph.nodes())
        # Nodes are colored by their depth level (the first item of the deep_map entry).
        node_color_list = [deep_map[node][0] for node in graph.nodes()]
        d3graph(network_adjacency_matrix, savepath=graph_folder_path, savename=root_word,
                node_color=node_color_list,
                width=1920, height=1080, edge_width=5,
                edge_distance=60, directed=True)


In [4]:
class LanguageModelPipeline:
    """
        This pipeline executes the steps for word2vec language training:
        download, text extraction, cleaning, spaCy parsing, feature extraction,
        training and storing of the model.
    """

    def __init__(self, dataset_sources: List[Tuple[IndexTabularDataSource, List[str]]], language_model_name: str):
        """
            :param dataset_sources: list of (data source, textual column names) pairs.
            :param language_model_name: object name under which the trained model is stored.
        """
        self.dataset_sources = dataset_sources
        self.language_model_name = language_model_name
        # Explicit dtype: a bare pd.Series() has a version-dependent default dtype
        # and raises a deprecation warning on older pandas releases.
        self.documents_corpus = pd.Series(dtype=object)
        self.word2vec = None

    def download_datasets(self):
        """
            Fetches every dataset from its data source.
            After this step "dataset_sources" no longer holds the data sources: it holds
            (textual column names, fetched dataset) pairs, so this step must run only once.
        """
        self.dataset_sources = [(dataset_columns, dataset_source.fetch())
                                for dataset_source, dataset_columns in self.dataset_sources]

    def extract_textual_data(self):
        """
            Builds one document per dataset row: the selected textual columns are taken,
            NaN values are replaced with an empty string and the columns are joined
            with '. '. The documents of all datasets are concatenated into a single
            series with a fresh index.
        """
        self.documents_corpus = pd.concat([dataset[columns]
                                          .fillna(value="")
                                          .agg('. '.join, axis=1)
                                          .reset_index(drop=True)
                                           for columns, dataset in self.dataset_sources
                                           ], ignore_index=True)

    def clean_textual_data(self):
        """
            Cleans every document with "apply_cleaning_functions".
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)

    def transform_to_spacy_doc(self):
        """
            Parses every clean document with the shared spaCy pipeline.
        """
        self.documents_corpus = self.documents_corpus.apply(nlp)

    def extract_features(self):
        """
            Extends the corpus: after the parsed documents it appends the result of
            "document_atomization_noun_phrases" and of "lemmatize_document" for every
            document. Each entry is then converted into a list of strings.
        """
        self.documents_corpus = pd.concat([self.documents_corpus,
                                           self.documents_corpus.apply(document_atomization_noun_phrases),
                                           self.documents_corpus.apply(lemmatize_document)]
                                          , ignore_index=True)

        # Word2Vec expects sentences as lists of string tokens.
        self.documents_corpus = self.documents_corpus.apply(lambda x: list(map(str, x)))

    def model_training(self):
        """
            Trains the Word2Vec model on the prepared corpus.
        """
        # NOTE(review): no number of epochs is passed here, so the library default is
        # used; the EPOCHS constant of this notebook is not applied - confirm intent.
        self.word2vec = Word2Vec(sentences=self.documents_corpus, window=WINDOW,
                                 min_count=MIN_COUNT, size=VECTOR_SIZE)

    def save_language_model(self):
        """
            Pickles the trained model and stores it in the 'mdl-language' MinIO
            object store under 'word2vec/<language_model_name>'.
        """
        minio = store_registry.minio_object_store('mdl-language')
        minio.put_object('word2vec/' + self.language_model_name, pickle.dumps(self.word2vec))

    def execute(self):
        """
            Runs all the steps of the pipeline, in order.
        """
        self.download_datasets()
        self.extract_textual_data()
        self.clean_textual_data()
        self.transform_to_spacy_doc()
        self.extract_features()
        self.model_training()
        self.save_language_model()


class LanguageModelWordsFilter:
    """
        Filters the vocabulary of a word2vec model by part of speech and gives access
        to the selected words, their vocabulary indexes and their embeddings.
    """

    def __init__(self, word2vec_model: Word2Vec, key_words: List[str], pos: List[str]) -> None:
        """
            Args:
                word2vec_model: trained word2vec model
                key_words: words of interest, see "select_key_words"
                pos: parts of speech to keep, passed to "select_pos"
        """
        self.word2vec_model = word2vec_model
        self.key_words = key_words
        self.pos = pos
        # The whole vocabulary is parsed as one space-separated text so that spaCy
        # can tag it; only the tokens with the requested parts of speech are kept.
        self.word2vec_document = nlp(' '.join(self.word2vec_model.wv.index2word))
        self.word2vec_document = select_pos(self.word2vec_document, self.pos)
        self._extract_pos = list(map(str, self.word2vec_document))

    def extract_pos(self) -> List[str]:
        """
            Returns the vocabulary tokens selected by part of speech, as strings.
        """
        # NOTE(review): tokens that are not in the model vocabulary are kept here but
        # skipped by "select_pos_index"/"select_pos_embeddings", so the two results
        # can differ in length - confirm callers that pair them handle this.
        return self._extract_pos

    def select_key_words(self) -> List[str]:
        """
            Returns the key words that are present in the list of extracted
            parts of speech, in the order of the key words.
        """
        extracted_words = set(self._extract_pos)
        return [word for word in self.key_words if word in extracted_words]

    def select_pos_index(self) -> List[int]:
        """
            Returns the vocabulary index of every extracted token that is
            part of the model vocabulary.
        """
        # One pass to build the lookup instead of a linear list search per token;
        # setdefault keeps the first position, like list.index does.
        word_to_index = {}
        for position, word in enumerate(self.word2vec_model.wv.index2word):
            word_to_index.setdefault(word, position)
        return [word_to_index[token] for token in self._extract_pos if token in word_to_index]

    def select_pos_embeddings(self) -> List[np.ndarray]:
        """
            Returns the embedding of every extracted token that is part of
            the model vocabulary.
        """
        vectors = self.word2vec_model.wv.vectors
        return [vectors[index] for index in self.select_pos_index()]


class LanguageModelExecutionSteps:
    def __init__(self, language_model_file_name: str, model_name: str) -> None:
        """
            This class executes three steps to view the similarity between words from word2vec model:
            (1) trains language model
            (2) filters language model words
            (3) trains similarity matrices
            Args:
                language_model_file_name: word2vec model filename to be stored in MinIO
                model_name: the name of the model config
        """
        self.language_model_file_name = language_model_file_name
        self.model_name = model_name
        self.word2vec = None

    def train_language_model(self, dataset_sources_config: List[tuple]) -> None:
        """
            Trains word2vec model from dataset config,
            which is a list of tuples of dataset and their textual columns.
            After training the model is stored in MinIO for later use.
        """
        model_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config,
                                                              language_model_name=self.language_model_file_name)
        model_language_model_pipeline.execute()
        self.word2vec = model_language_model_pipeline.word2vec

    def filter_language_model_words(self, key_words: List[str]):
        """
            Builds a words filter that keeps the nouns and adjectives of the
            model vocabulary.
            Returns: the words filter, or None when no model has been trained yet
        """
        if not self.word2vec:
            return None
        return LanguageModelWordsFilter(self.word2vec, key_words, pos=['NOUN', 'ADJ'])

    def train_similarity_matrices(self, key_words: List[str]) -> None:
        """
            For each similarity function it creates a similarity matrix from the
            filtered vocabulary words and their vectors.
            The matrices are stored in MinIO for later use, as
            '<model_name>_<similarity function>_matrix.json'.
            Raises: RuntimeError when the language model has not been trained yet
        """
        # Building the filter parses the whole vocabulary, so it is done once and
        # its results are shared by all the similarity functions.
        words_filter = self.filter_language_model_words(key_words=key_words)
        if words_filter is None:
            raise RuntimeError('The language model must be trained before computing similarity matrices.')
        pos_embeddings = words_filter.select_pos_embeddings()
        pos_words = words_filter.extract_pos()

        for similarity_function in ('cosine', 'euclidean', 'hamming'):
            print('Start computing similarity matrix.')
            model_similarity_matrix = build_similarity_matrix(
                pos_embeddings,
                pos_words,
                metric=similarity_function)
            print('Finish computing similarity matrix.')
            print('Save similarity matrix.')
            store_registry.minio_object_store('semantic-similarity-matrices').put_object(
                f'{self.model_name}_{similarity_function}_matrix.json',
                model_similarity_matrix.to_json(orient='columns'))


def plot_graphs(pipeline: LanguageModelExecutionSteps, model_name: str, model_file_name: str, threshold: np.float64,
                word_graph_configs: dict, normalize_func) -> None:
    """
        Loads a similarity matrix from MinIO, normalizes it and generates the d3 graphs
        for every group of key words.
    Args:
        pipeline: Pipeline of language model execution steps, used to select the key
                  words that are present in the model vocabulary
        model_name: the name of the model
        model_file_name: name of the similarity matrix file in MinIO
        threshold: the minimum of similarity number
        word_graph_configs: dictionary of group name -> key words
        normalize_func: function applied to every value of the matrix
    """
    stored_matrix = store_registry.minio_object_store('semantic-similarity-matrices').get_object(model_file_name)
    # The matrix does not depend on the word group: parse and normalize it once.
    normalized_matrix = pd.read_json(stored_matrix).applymap(normalize_func)
    for column_name, key_words in word_graph_configs.items():
        selected_key_words = pipeline.filter_language_model_words(key_words=key_words).select_key_words()
        create_graph_for_language_model_key_words(normalized_matrix,
                                                  selected_key_words,
                                                  model_name=model_name, metric_threshold=threshold,
                                                  column_name=column_name)

## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline

## Experiment Nr#2 language model based on:
- eu-cellar

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar


In [5]:
# Each entry pairs a dataset with the textual columns that are read from it.

# Model 1: PWDB + EU timeline + Ireland timeline.
model1_dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]

# Model 2: EU Cellar only.
model2_dataset_sources_config = [
    (Dataset.EU_CELLAR_ENRICHED, DEFAULT_TEXTUAL_COLUMN),
]

# Model 3: all four datasets.
model3_dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.EU_CELLAR_ENRICHED, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]

In [6]:
# Experiment Nr#1: train and store the model, then compute and store its similarity matrices.
model1_execution_steps = LanguageModelExecutionSteps(language_model_file_name=MODEL1_FILE_NAME, model_name=NR1_MODEL_NAME)
model1_execution_steps.train_language_model(model1_dataset_sources_config)
model1_execution_steps.train_similarity_matrices(CATEGORY)

100% (1288 of 1288) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (171 of 171) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (410 of 410) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


In [7]:
# Experiment Nr#2: train and store the model, then compute and store its similarity matrices.
model2_execution_steps = LanguageModelExecutionSteps(language_model_file_name=MODEL2_FILE_NAME, model_name=NR2_MODEL_NAME)
model2_execution_steps.train_language_model(model2_dataset_sources_config)
model2_execution_steps.train_similarity_matrices(CATEGORY)

100% (2653 of 2653) |####################| Elapsed Time: 0:00:01 Time:  0:00:01


In [9]:
# Experiment Nr#3: train and store the model, then compute and store its similarity matrices.
model3_execution_steps = LanguageModelExecutionSteps(language_model_file_name=MODEL3_FILE_NAME, model_name=NR3_MODEL_NAME)
model3_execution_steps.train_language_model(model3_dataset_sources_config)
model3_execution_steps.train_similarity_matrices(CATEGORY)

100% (2653 of 2653) |####################| Elapsed Time: 0:00:01 Time:  0:00:01


### Generate D3 Graphs

#### Cosine similarity graph

In [7]:
print('starts protting graph')
# Generate the graphs of model 1 from its stored cosine matrix. Every matrix value x
# is mapped to 1 - x before the 0.6 threshold is applied
# (presumably the stored values are distances - confirm).
plot_graphs(pipeline=model1_execution_steps,
            model_name=NR1_MODEL_NAME,
            model_file_name='model1_cosine_matrix.json',
            threshold=0.6,
            word_graph_configs=WORD_GRAPH_CONFIGS,
            normalize_func=lambda x: 1 - x)
print('done')

starts protting graph
[d3graph] >Creating directory [docs/word-similarity-web/model1_graphs/category/]
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/workplace.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/labour.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/recovery.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/economic.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/adaptation.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/businesses.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/essential.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/workers.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/category/services.html
Writing /hom

In [None]:
# Generate the graphs of model 2 from its stored cosine matrix. Every matrix value x
# is mapped to 1 - x before the 0.6 threshold is applied.
plot_graphs(pipeline=model2_execution_steps,
            model_name=NR2_MODEL_NAME,
            model_file_name='model2_cosine_matrix.json',
            threshold=0.6,
            word_graph_configs=WORD_GRAPH_CONFIGS,
            normalize_func=lambda x: 1 - x)

In [None]:
print('starts protting graph')
# Generate the graphs of model 3 from its stored cosine matrix. Every matrix value x
# is mapped to 1 - x before the 0.6 threshold is applied.
plot_graphs(pipeline=model3_execution_steps,
            model_name=NR3_MODEL_NAME,
            model_file_name='model3_cosine_matrix.json',
            threshold=0.6,
            word_graph_configs=WORD_GRAPH_CONFIGS,
            normalize_func=lambda x: 1 - x)
print('done')

starts protting graph


In [None]:
## To normalize a spatial distance into a similarity metric,
# apply the following formula:
# sim_metric = 1/(1+distance)
# where sim_metric is the normalized distance, which represents the similarity metric
# distance - is the distance computed with a given method (e.g. Euclidean, L1 norm, L2 norm, infinity norm, etc.)


