## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract nouns and noun phrases from each dataset's text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [27]:
import sys
sys.path.append("/home/jovyan/work/sem-covid/")
# Drop duplicate entries while keeping the search order intact: going through
# a plain set() would shuffle sys.path and change which module wins on import.
sys.path = list(dict.fromkeys(sys.path))

import os
# Work from the project root: the graph output path used further down
# ('docs/word-similarity-web/...') is relative to the current directory.
os.chdir('/home/jovyan/work/sem-covid/')

import warnings
warnings.filterwarnings("ignore")

import re
import pickle
from typing import List, Tuple

import spacy
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 1500000

import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from d3graph import d3graph


from sem_covid.services.data_registry import Dataset
from sem_covid.services.store_registry import store_registry
from sem_covid.adapters.data_source import IndexTabularDataSource

from sem_covid.entrypoints.notebooks.topic_modeling.topic_modeling_wrangling.token_management import select_pos

from sem_covid.services.sc_wrangling.data_cleaning import (clean_text_from_specific_characters, clean_fix_unicode,
                                                           clean_remove_currency_symbols, clean_remove_emails,
                                                           clean_remove_urls, clean_remove_stopwords)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.similarity_calculus import (
    euclidean_similarity, manhattan_similarity, cosine_similarity, build_similarity_matrix)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.document_handling_tools import(
    document_atomization_noun_phrases, lemmatize_document)

## Define constants

In [36]:
# Text columns that are concatenated into one document per PWDB row.
PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'use_of_measure_description', 'involvement_of_social_partners_description']

# Text column used for the datasets that are configured with a single column.
DEFAULT_TEXTUAL_COLUMN = ['title']

# Word2Vec hyper-parameters handed to the model constructor.
WINDOW = 5
MIN_COUNT = 1
VECTOR_SIZE = 300
# NOTE(review): EPOCHS and the *_TOTAL_EXAMPLES values are not referenced by the
# training code visible in this notebook — confirm whether they are still needed.
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

# Words a similarity graph is generated for (when the trained model knows them).
# Each word is listed once, so no graph is generated twice.
KEY_WORDS = ['work', 'agreement', 'working', 'companies', 'workers',
             'measures', 'temporary', 'social', 'support', 'covid19',
             '2020', 'public', 'national', 'ireland', 'statement', '2021',
             'announce', 'health', 'minister', 'new', 'billion', 'coronavirus',
             'vaccine', 'eur', 'million', 'commission', 'eu']

# Prefixes of the stored similarity matrices and of the graph output folders.
NR1_MODEL_NAME = 'model1'
NR2_MODEL_NAME = 'model2'
NR3_MODEL_NAME = 'model3'

# Object names under which the trained language models are stored.
MODEL1_FILE_NAME = 'model1_language_model.model'
MODEL2_FILE_NAME = 'model2_language_model.model'
MODEL3_FILE_NAME = 'model3_language_model.model'

## Data preprocessing
- data cleanup
- turn the corpus into spaCy documents

In [29]:
def add_space_between_dots_and_commas(text: str):
    """
    Inserts a single space after every dot or comma that is directly
    followed by a non-whitespace character.
    Args:
        text: the text to adjust

    Returns: the text with the missing spaces added
    """
    # Zero-width match: right after '.' or ',' and right before a non-space.
    missing_space = r'(?<=[.,])(?=[^\s])'
    return re.sub(missing_space, r' ', text)

def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    Runs every document of the corpus through the cleaning chain.

    The listed character sequences are handled first; the remaining helpers are
    then applied one after another (judging by their names: unicode fixing, then
    removal of URLs, e-mails, currency symbols and stopwords). The last step
    inserts a space after dots and commas that lack one.
    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    unused_characters = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                         "[", "]", "-", "_", "\r", '®', '..']

    cleaned_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=unused_characters)

    # The remaining steps take no extra arguments, so they are applied in order.
    remaining_steps = (clean_fix_unicode,
                       clean_remove_urls,
                       clean_remove_emails,
                       clean_remove_currency_symbols,
                       clean_remove_stopwords,
                       add_space_between_dots_and_commas)
    for cleaning_step in remaining_steps:
        cleaned_corpus = cleaned_corpus.apply(cleaning_step)

    return cleaned_corpus


def generate_graph(similarity_matrix: pd.DataFrame, graph: "nx.Graph", root_word: str,
                   top_words: int, threshold: np.float64 = 0.8, deep_level: int = 0,
                   max_deep_level: int = 2, deep_map: dict = None, color_map: dict = None) -> "nx.Graph":
    """
        Recursively grows a graph around a word by linking it to its most similar words.
    Args:
        similarity_matrix: Dataframe with word similarity; a column is looked up per word
        graph: graph that receives the edges (it is modified in place)
        root_word: the word whose neighbours are added at this step
        top_words: how many of the most similar words are considered per word
        threshold: minimum similarity a word needs to be linked
        deep_level: recursion level of root_word (0 for the initial key word)
        max_deep_level: words on a deeper level are registered but not expanded
        deep_map: dictionary word -> (lowest level seen, color of that level); filled in place.
                  A fresh dictionary is used when omitted.
        color_map: dictionary level -> color. When omitted the stored color is None.

    Returns: the same graph object that was passed in
    """
    # The defaults are None, so fall back to something usable instead of crashing.
    if deep_map is None:
        deep_map = {}

    # Remember the shallowest level at which each word was reached.
    known_entry = deep_map.get(root_word)
    if known_entry is None or known_entry[0] > deep_level:
        level_color = color_map[deep_level] if color_map is not None else None
        deep_map[root_word] = (deep_level, level_color)

    if deep_level > max_deep_level:
        return graph

    # Sort once and read both the neighbour words and their weights from the result.
    closest_words = similarity_matrix[root_word].sort_values(ascending=False)[:top_words]
    for neighbour, weight in closest_words.items():
        if weight >= threshold:
            graph.add_edge(root_word, neighbour)
            generate_graph(similarity_matrix, graph, neighbour, top_words, threshold,
                           deep_level + 1, max_deep_level,
                           deep_map=deep_map, color_map=color_map)

    return graph


def create_graph_for_language_model_key_words(similarity_matrix: pd.DataFrame, language_model_words: list,
                                              model_name: str, metric_threshold: np.float64) -> None:
    """
    !!! This is not reusable function. It was made for a single thing !!!

    Generates one d3graph per selected language model word, based on the
    similarity matrix created with those words. The graphs are saved under
    'docs/word-similarity-web/<model_name>_graphs/', named after the word.
    Args:
        similarity_matrix: Dataframe with word similarity
        language_model_words: the words a graph is generated for
        model_name: name of the model, used to build the output folder
        metric_threshold: minimum similarity a word needs to be linked

    Returns: nothing, the graphs are only written to disk
    """
    graph_folder_path = f'docs/word-similarity-web/{model_name}_graphs/'
    # One color per recursion level, from dark to light red.
    color_map = {0: '#a70000',
                 1: '#ff0000',
                 2: '#ff7b7b',
                 3: '#ffbaba'}
    for key_word in language_model_words:
        # Filled by generate_graph: word -> (level, color).
        deep_map = {}
        graph = generate_graph(similarity_matrix, nx.Graph(), key_word,
                               top_words=4, threshold=metric_threshold,
                               max_deep_level=2, deep_map=deep_map, color_map=color_map)
        network_adjacency_matrix = pd.DataFrame(data=nx.adjacency_matrix(graph).todense(),
                                                index=graph.nodes(), columns=graph.nodes())
        # The nodes are colored by their level, not by the hex values of color_map.
        node_color_list = [deep_map[node][0] for node in graph.nodes()]
        d3graph(network_adjacency_matrix, savepath=graph_folder_path, savename=key_word,
                node_color=node_color_list,
                width=1920, height=1080, edge_width=5,
                edge_distance=60, directed=True)
        

In [34]:
class LanguageModelPipeline:
    """
        This pipeline executes the steps for word2vec language training.

        The steps can be called one by one or all together through execute().
        The pipeline can be executed again on the same instance, because the
        configured sources are kept apart from the downloaded data.
    """

    def __init__(self, dataset_sources: List[Tuple["IndexTabularDataSource", List[str]]], language_model_name: str):
        """
            :param dataset_sources: represents the source of the datasets, as pairs of
                                    (data source, textual columns to use).
            :param language_model_name: the name under which the trained model is stored.
        """
        self.dataset_sources = dataset_sources
        self.language_model_name = language_model_name
        # Filled by download_datasets with pairs of (textual columns, downloaded dataset).
        self.fetched_datasets = []
        # Explicit dtype: an empty Series without one triggers a pandas warning.
        self.documents_corpus = pd.Series(dtype=object)
        self.word2vec = None

    def download_datasets(self):
        """
            In this step it will download the dataset and detect selected columns.
            It can be downloaded as many datasets as there are in data source.
            The configured sources stay untouched, the result goes to fetched_datasets.
        """
        self.fetched_datasets = [(dataset_columns, dataset_source.fetch())
                                 for dataset_source, dataset_columns in self.dataset_sources]

    def extract_textual_data(self):
        """
            After downloading the datasets, the selected textual columns of each
            dataset are concatenated into one document per row. NaN values are
            replaced with an empty string, the columns are joined with '. ' and
            the index is reset, so all documents form one continuous corpus.
        """
        self.documents_corpus = pd.concat([dataset[columns]
                                          .fillna(value="")
                                          .agg('. '.join, axis=1)
                                          .reset_index(drop=True)
                                           for columns, dataset in self.fetched_datasets
                                           ], ignore_index=True)

    def clean_textual_data(self):
        """
            The next step is data cleaning. In this step the function "apply_cleaning_functions"
            is applied to the whole corpus.
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)

    def transform_to_spacy_doc(self):
        """
            When the document is clean, it is transformed into a spacy document
        """
        self.documents_corpus = self.documents_corpus.apply(nlp)

    def extract_features(self):
        """
            Extends the corpus with two derived versions of every document: the one
            produced by "document_atomization_noun_phrases" and the one produced by
            "lemmatize_document". Afterwards every document is turned into a list of strings.
        """
        self.documents_corpus = pd.concat([self.documents_corpus,
                                           self.documents_corpus.apply(document_atomization_noun_phrases),
                                           self.documents_corpus.apply(lemmatize_document)],
                                          ignore_index=True)

        self.documents_corpus = self.documents_corpus.apply(lambda x: list(map(str, x)))

    def model_training(self):
        """
            When the data is prepared it's stored into Word2Vec model.
        """
        # NOTE(review): the EPOCHS constant of this notebook is not passed here,
        # so the library default applies — confirm this is intended.
        self.word2vec = Word2Vec(sentences=self.documents_corpus, window=WINDOW,
                                 min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

    def save_language_model(self):
        """
            Saves the pickled trained model in MinIO, in the 'mdl-language' store,
            under 'word2vec/<language_model_name>'.
        """
        minio = store_registry.minio_object_store('mdl-language')
        minio.put_object('word2vec/' + self.language_model_name, pickle.dumps(self.word2vec))

    def execute(self):
        """
            The final step is execution, where all the steps are executed in a row
        """
        self.download_datasets()
        self.extract_textual_data()
        self.clean_textual_data()
        self.transform_to_spacy_doc()
        self.extract_features()
        self.model_training()
        self.save_language_model()


class LanguageModelWordsFilter:
    """
        Selects the words of a trained word2vec model that belong to the requested
        parts of speech and gives access to their indexes and embeddings.
    """

    def __init__(self, word2vec_model: "Word2Vec", key_words: List[str], pos: List[str]) -> None:
        """
            :param word2vec_model: the trained word2vec model.
            :param key_words: the words of interest, see select_key_words.
            :param pos: the parts of speech to keep.
        """
        self.word2vec_model = word2vec_model
        self.key_words = key_words
        self.pos = pos
        # The whole vocabulary is joined into one text and parsed, so that
        # the tokens can be filtered with select_pos.
        self.word2vec_document = nlp(' '.join(self.word2vec_model.wv.index_to_key))
        self.word2vec_document = select_pos(self.word2vec_document, self.pos)
        self._extract_pos = list(map(str, self.word2vec_document))

    def extract_pos(self) -> List[str]:
        """
            Returns the selected parts of speech as a list of strings. The list
            is computed once, when the filter is created.
        """
        return self._extract_pos

    def select_key_words(self) -> List[str]:
        """
            Finds each word from the inserted list of key words and returns a
            list with those words if they are present in the list of
            extracted parts of speech. The order of the key words is kept.
        """
        # A set makes each membership test constant time.
        selected_words = set(self._extract_pos)
        return [word for word in self.key_words if word in selected_words]

    def select_pos_index(self) -> List[int]:
        """
            Detects the part of speech indexes and returns them into a list.
            Tokens that are not part of the model vocabulary are skipped.
        """
        # NOTE(review): because of the skipping, the result can be shorter than
        # extract_pos() — confirm that callers pairing the two can cope with it.
        key_to_index = self.word2vec_model.wv.key_to_index
        return [key_to_index[token] for token in self._extract_pos if token in key_to_index]

    def select_pos_embeddings(self) -> List[np.ndarray]:
        """
            Detects part of speech embeddings from their indexes
        """
        vectors = self.word2vec_model.wv.vectors
        return [vectors[index] for index in self.select_pos_index()]


def train_language_model(dataset_sources_config: List[tuple], language_model_name: str):
    """
        1. trains and stores a word2vec model by executing LanguageModelPipeline
        2. wraps the trained model into a filter for nouns and adjectives
     Args:
        dataset_sources_config: the datasets, each with its textual data columns
        language_model_name: the name of the language model, to be saved in MinIO
    Returns: a LanguageModelWordsFilter built on the trained model and KEY_WORDS
    """
    pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config,
                                     language_model_name=language_model_name)
    pipeline.execute()

    trained_model = pipeline.word2vec
    return LanguageModelWordsFilter(trained_model, KEY_WORDS, pos=['NOUN', 'ADJ'])


def train_similarity_matrices(model_name: str, language_model: "LanguageModelWordsFilter"):
    """
        Generates one similarity matrix per metric (cosine, euclidean, manhattan) from
        the extracted parts of speech and their embeddings, and saves each of them as
        '<model_name>_<metric function name>_matrix.json' in the
        'semantic-similarity-matrices' store.
    Args:
        model_name: the name of the model that will be saved on the server
        language_model: the words filter wrapping the trained word2vec model,
                        as returned by train_language_model
    Returns: nothing, the matrices are only stored
    """
    similarity_functions = [cosine_similarity, euclidean_similarity, manhattan_similarity]
    # The words and their embeddings are the same for every metric, so read them once.
    pos_embeddings = language_model.select_pos_embeddings()
    pos_words = language_model.extract_pos()
    for similarity_function in similarity_functions:
        print('Start computing similarity matrix.')
        model_similarity_matrix = build_similarity_matrix(pos_embeddings,
                                                          pos_words,
                                                          metric=similarity_function)
        print('Finish computing similarity matrix.')
        print('Save similarity matrix.')
        store_registry.minio_object_store('semantic-similarity-matrices').put_object(
            f'{model_name}_{similarity_function.__name__}_matrix.json',
            model_similarity_matrix.to_json(orient='columns'))
        # Only the message is printed here, no graph is generated by this function.
        print('Create d3Graphs')

## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline


## Experiment Nr#2 language model based on:
- eu-cellar

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar


In [32]:
%%time

# Every entry pairs a dataset with the textual columns that are read from it.
_pwdb_source = (Dataset.PWDB, PWDB_TEXTUAL_CLASS)
_eu_timeline_source = (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
_ireland_timeline_source = (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
_eu_cellar_source = (Dataset.EU_CELLAR_ENRICHED, DEFAULT_TEXTUAL_COLUMN)

# M1: PWDB + both timelines
model1_dataset_sources_config = [_pwdb_source, _eu_timeline_source, _ireland_timeline_source]

# M2: EU Cellar only
model2_dataset_sources_config = [_eu_cellar_source]

# M3: all four datasets; the order of the entries is the order of the documents
model3_dataset_sources_config = [_pwdb_source, _eu_timeline_source,
                                 _eu_cellar_source, _ireland_timeline_source]

CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 12.9 µs


### Train language model and execute similarity matrices

In [35]:
%%time
# !!! The three training cells below take a long time to execute. If you need to run them,
# plan for it: go and make some food, maybe go for a walk. !!!

# execution time: 1h 23m
# Despite its name, model1_word2vec holds the words filter returned by
# train_language_model; the word2vec model itself is wrapped inside it.
model1_word2vec = train_language_model(model1_dataset_sources_config, MODEL1_FILE_NAME)
# train_similarity_matrices only stores the matrices and returns nothing,
# so model1_similarity_matrix is None.
model1_similarity_matrix = train_similarity_matrices(NR1_MODEL_NAME, model1_word2vec)

100% (1288 of 1288) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (171 of 171) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (410 of 410) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


CPU times: user 3min 9s, sys: 1.4 s, total: 3min 10s
Wall time: 3min 6s


In [67]:
%%time
# execution time: 3m
# Despite its name, model2_word2vec holds the words filter returned by
# train_language_model; the word2vec model itself is wrapped inside it.
model2_word2vec = train_language_model(model2_dataset_sources_config, MODEL2_FILE_NAME)
# train_similarity_matrices only stores the matrices and returns nothing,
# so model2_similarity_matrix is None.
model2_similarity_matrix = train_similarity_matrices(NR2_MODEL_NAME, model2_word2vec)

Start computing similarity matrix.
Finish computing similarity matrix.
Save similarity matrix.
Create d3Graphs
CPU times: user 3min 20s, sys: 787 ms, total: 3min 21s
Wall time: 3min 20s


In [None]:
%%time
# execution time: 1h 33m
# Despite its name, model3_word2vec holds the words filter returned by
# train_language_model; the word2vec model itself is wrapped inside it.
model3_word2vec = train_language_model(model3_dataset_sources_config, MODEL3_FILE_NAME)
# train_similarity_matrices only stores the matrices and returns nothing,
# so model3_similarity_matrix is None.
model3_similarity_matrix = train_similarity_matrices(NR3_MODEL_NAME, model3_word2vec)

### Generate D3 Graphs

In [None]:
# Load the stored cosine similarity matrix of model 1.
model1_cosine_matrix = store_registry\
    .minio_object_store('semantic-similarity-matrices')\
    .get_object('model1_cosine_similarity_matrix.json')


# The graphs of model 1 belong into the model 1 folder, hence NR1_MODEL_NAME.
model1_graphs = create_graph_for_language_model_key_words(pd.read_json(model1_cosine_matrix),
                                                          model1_word2vec.select_key_words(),
                                                          model_name=NR1_MODEL_NAME, metric_threshold=0.4)

In [None]:
# Load the stored cosine similarity matrix of model 2.
model2_cosine_matrix = store_registry\
    .minio_object_store('semantic-similarity-matrices')\
    .get_object('model2_cosine_similarity_matrix.json')


# The graphs of model 2 belong into the model 2 folder, hence NR2_MODEL_NAME.
model2_graphs = create_graph_for_language_model_key_words(pd.read_json(model2_cosine_matrix),
                                                          model2_word2vec.select_key_words(),
                                                          model_name=NR2_MODEL_NAME, metric_threshold=0.4)

In [None]:
# Load the stored cosine similarity matrix of model 3.
model3_cosine_matrix = (
    store_registry
    .minio_object_store('semantic-similarity-matrices')
    .get_object('model3_cosine_similarity_matrix.json')
)

# Generate the graphs of the model 3 key words into the model 3 folder.
model3_graphs = create_graph_for_language_model_key_words(
    similarity_matrix=pd.read_json(model3_cosine_matrix),
    language_model_words=model3_word2vec.select_key_words(),
    model_name=NR3_MODEL_NAME,
    metric_threshold=0.4,
)
