## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract NOUN and NOUN PHRASES from each text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [180]:
import sys

import numpy as np


# Make the project package importable from inside the notebook.
sys.path.append("/home/jovyan/work/sem-covid/")
# De-duplicate with dict.fromkeys instead of set(): a set has no defined order, so rebuilding
# sys.path from it can reshuffle the module search order and change which module gets imported.
sys.path = list(dict.fromkeys(sys.path))

import os
# Run the notebook from the project root.
# NOTE(review): presumably needed for repository-relative paths — confirm.
os.chdir('/home/jovyan/work/sem-covid/')

import warnings
# Installed before the remaining imports so their DeprecationWarnings are silenced as well.
warnings.filterwarnings("ignore", category=DeprecationWarning)

import spacy
# Shared spaCy pipeline, used by LanguageModelPipeline.transform_to_spacy_doc.
nlp = spacy.load('en_core_web_sm')

from sklearn.manifold import TSNE

import plotly.express as px
from sklearn.metrics import pairwise_distances
from typing import List, Tuple
import pandas as pd
from gensim.models import Word2Vec
from sem_covid.adapters.data_source import IndexTabularDataSource
from sem_covid.services.sc_wrangling.data_cleaning import (clean_text_from_specific_characters, clean_fix_unicode,
                                                           clean_remove_currency_symbols, clean_remove_emails,
                                                           clean_remove_urls)
from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.similarity_calculus import (
    euclidean_similarity, manhattan_similarity, cosine_similarity, get_similarity_matrix)
from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.document_handling_tools import \
    document_atomization_noun_phrases
from sem_covid.services.data_registry import Dataset

## Define constants

In [2]:
# Textual columns of the PWDB dataset; the pipeline joins them into one document per row.
PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'use_of_measure_description', 'involvement_of_social_partners_description']

# Textual column used for the datasets that are configured with a single column.
DEFAULT_TEXTUAL_COLUMN = ['title']
# Word2Vec hyper-parameters passed to gensim in LanguageModelPipeline.model_training.
WINDOW = 5
MIN_COUNT = 1
VECTOR_SIZE = 300
# NOTE(review): EPOCHS and the *_TOTAL_EXAMPLES constants are not referenced by any cell visible in
# this notebook; Word2Vec is therefore trained with gensim's default number of epochs — confirm intent.
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

## Data preprocessing
- data cleanup
- turn corpus into spacy document

In [3]:
def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    Run every text-cleaning step over the documents of a corpus.

    The listed characters are handed to clean_text_from_specific_characters first; afterwards
    the unicode, URL, e-mail and currency-symbol cleaners are applied, in that order.

    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    unused_characters = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                         "[", "]", "-", "_", "\r"]
    # element-wise cleaners that take no extra arguments, in execution order
    follow_up_cleaners = (clean_fix_unicode, clean_remove_urls, clean_remove_emails, clean_remove_currency_symbols)

    cleaned_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=unused_characters)
    for cleaner in follow_up_cleaners:
        cleaned_corpus = cleaned_corpus.apply(cleaner)

    return cleaned_corpus

In [4]:
class LanguageModelPipeline:
    """
        Pipeline that trains a word2vec language model on the textual columns of one or more datasets.

        The steps run in a fixed order (see `execute`): download, text extraction, cleaning,
        spaCy parsing, noun-phrase feature extraction and model training.
    """

    def __init__(self, dataset_sources: List[Tuple["IndexTabularDataSource", List[str]]]):
        """
            :param dataset_sources: pairs of (data source, names of the textual columns to read from it).
        """
        self.dataset_sources = dataset_sources
        # Explicit dtype: an empty Series created without one makes pandas 1.x emit a warning and
        # default to float64, although this attribute holds text documents.
        self.documents_corpus = pd.Series(dtype=object)
        # Trained gensim model; stays None until `model_training` has run.
        self.word2vec = None

    def download_datasets(self):
        """
            Fetch every configured data source and replace `self.dataset_sources` with
            (columns, fetched dataset) pairs.

            Pairs that were already downloaded are kept untouched, so this step (and therefore
            `execute`) can be re-run on the same pipeline instance without failing.
        """
        downloaded = []
        for first, second in self.dataset_sources:
            if hasattr(first, "fetch"):
                # still a (data source, columns) pair -> download it now
                downloaded.append((second, first.fetch()))
            else:
                # already a (columns, dataset) pair produced by a previous run
                downloaded.append((first, second))
        self.dataset_sources = downloaded

    def extract_textual_data(self):
        """
            Build one document per dataset row from the downloaded datasets.

            For each dataset the selected columns are taken, NaN values are replaced with an
            empty string and the column values of a row are joined with '. '. The documents of
            all datasets are then concatenated into a single Series with a fresh index.
        """
        self.documents_corpus = pd.concat([dataset[columns]
                                          .fillna(value="")
                                          .agg('. '.join, axis=1)
                                          .reset_index(drop=True)
                                           for columns, dataset in self.dataset_sources
                                           ], ignore_index=True)

    def clean_textual_data(self):
        """
            Clean every document with `apply_cleaning_functions`, which:
                - removes a fixed list of characters
                - applies the unicode fixing step
                - removes URLs, e-mails and currency symbols
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)

    def transform_to_spacy_doc(self):
        """
            Parse every cleaned document with the module-level spaCy pipeline `nlp`.
        """
        self.documents_corpus = self.documents_corpus.apply(nlp)

    def extract_features(self):
        """
            Extend the corpus with its noun-phrase version and turn every document into tokens.

            The result of `document_atomization_noun_phrases` for every document is appended to
            the original documents, after which each document is converted into a list of
            strings (one per item yielded when iterating the document).
        """
        noun_phrase_documents = self.documents_corpus.apply(document_atomization_noun_phrases)
        self.documents_corpus = pd.concat([self.documents_corpus, noun_phrase_documents], ignore_index=True)
        self.documents_corpus = self.documents_corpus.apply(lambda x: list(map(str, x)))

    def model_training(self):
        """
            Train a Word2Vec model on the tokenised documents and store it in `self.word2vec`.
        """
        # NOTE(review): the notebook defines an EPOCHS constant that is not passed here, so gensim's
        # default number of epochs is used — confirm whether that is intended.
        self.word2vec = Word2Vec(sentences=self.documents_corpus, window=WINDOW,
                                 min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

    def execute(self):
        """
            Run all pipeline steps in order, from downloading the datasets to training the model.
        """
        self.download_datasets()
        self.extract_textual_data()
        self.clean_textual_data()
        self.transform_to_spacy_doc()
        self.extract_features()
        self.model_training()

## Experiment No. 1: language model based on:
- PWDB
- eu-timeline
- ireland-timeline

In [5]:
# Model 1 (M1): PWDB (all textual columns) plus the titles of the EU and Ireland action timelines.
# Each entry pairs a data source with the columns to read from it.
# NOTE: `dataset_sources_config` is rebound by the later experiment cells.
dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]
model1_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
# Runs every step, from download to Word2Vec training; the model ends up in `.word2vec`.
model1_language_model_pipeline.execute()

100% (1288 of 1288) |####################| Elapsed Time: 0:00:01 Time:  0:00:01
100% (171 of 171) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (410 of 410) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


## Experiment No. 2: language model based on:
- eu-cellar


In [25]:
# Model 2 (M2): titles of the EU Cellar dataset only.
# NOTE: rebinds `dataset_sources_config` used by the previous experiment cell.
dataset_sources_config = [
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
model2_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
# Runs every step, from download to Word2Vec training; the model ends up in `.word2vec`.
model2_language_model_pipeline.execute()

## Experiment No. 3: language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar

In [26]:
# Model 3 (M3): the data sources of model 1 and model 2 combined.
# NOTE: rebinds `dataset_sources_config` used by the previous experiment cells.
dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
model3_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
# Runs every step, from download to Word2Vec training; the model ends up in `.word2vec`.
model3_language_model_pipeline.execute()

## Similarity matrices
### Euclidean similarity

In [176]:
def get_similarity_matrix_new(vector: np.ndarray, keys: list, metric) -> pd.DataFrame:
    """
        Score every pair of rows of `vector` with `metric` and label the result with `keys`.

        :param vector: 2-D array holding one embedding per row
        :param keys: labels used for both the rows and the columns of the result
        :param metric: metric forwarded unchanged to sklearn's `pairwise_distances`
        :return: square DataFrame with one row and one column per key
    """
    pairwise_scores = pairwise_distances(vector, metric=metric)
    return pd.DataFrame(pairwise_scores, index=keys, columns=keys)


In [183]:
%%time
# Time the pairwise_distances-based variant on the first 2000 vectors / keys of model 1,
# with the project's euclidean_similarity callable as metric. The result is only displayed.
get_similarity_matrix_new(model1_language_model_pipeline.word2vec.wv.vectors[:2000],
                      model1_language_model_pipeline.word2vec.wv.index_to_key[:2000],
                      metric = euclidean_similarity
                      )

CPU times: user 19.5 s, sys: 182 ms, total: 19.7 s
Wall time: 19.9 s


Unnamed: 0,of,the,",",.,to,and,in,for,is,),...,sustainable,neither,post,remote_working,tourists,proportion,its_employees,leading,tasks,concertation
of,1.000000,0.087021,0.076782,0.082264,0.060755,0.075211,0.078992,0.077981,0.064526,0.078702,...,0.091899,0.091928,0.091899,0.091173,0.089621,0.091691,0.090817,0.093242,0.089526,0.090108
the,0.087021,1.000000,0.078414,0.084651,0.060859,0.078640,0.075379,0.078242,0.064396,0.072223,...,0.086244,0.087403,0.087633,0.087663,0.086291,0.087016,0.086951,0.088072,0.085919,0.085878
",",0.076782,0.078414,1.000000,0.112882,0.062765,0.102690,0.088205,0.084247,0.061964,0.102634,...,0.114308,0.114743,0.112278,0.111951,0.113976,0.113630,0.112525,0.116316,0.110696,0.112404
.,0.082264,0.084651,0.112882,1.000000,0.068273,0.089037,0.081809,0.088355,0.068201,0.089310,...,0.109286,0.110937,0.108862,0.110309,0.108357,0.110950,0.108485,0.109691,0.105254,0.109863
to,0.060755,0.060859,0.062765,0.068273,1.000000,0.067350,0.061253,0.072983,0.055050,0.057124,...,0.071899,0.069899,0.071957,0.072490,0.072446,0.072036,0.072103,0.071039,0.071921,0.068768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
proportion,0.091691,0.087016,0.113630,0.110950,0.072036,0.107366,0.085939,0.107855,0.075164,0.116053,...,0.405517,0.352391,0.410547,0.414327,0.430415,1.000000,0.454366,0.395990,0.398638,0.303377
its_employees,0.090817,0.086951,0.112525,0.108485,0.072103,0.109910,0.085418,0.105211,0.075973,0.112413,...,0.521459,0.387706,0.564905,0.574819,0.484130,0.454366,1.000000,0.440326,0.544921,0.324229
leading,0.093242,0.088072,0.116316,0.109691,0.071039,0.114621,0.086932,0.106113,0.073647,0.122232,...,0.475664,0.452822,0.472531,0.433111,0.382320,0.395990,0.440326,1.000000,0.387971,0.340269
tasks,0.089526,0.085919,0.110696,0.105254,0.071921,0.108889,0.084744,0.104860,0.074610,0.111081,...,0.458457,0.373548,0.468036,0.489354,0.505626,0.398638,0.544921,0.387971,1.000000,0.310877


In [2]:
%%time
# Similarity matrix of model 1's word vectors, scored with euclidean_similarity.
# NOTE(review): the recorded output of this cell is a NameError for get_similarity_matrix, i.e. it
# was last run in a kernel where the import cell had not been executed.
model1_euclidean_similarity_matrix = get_similarity_matrix(wv=model1_language_model_pipeline.word2vec.wv,
                                                           similarity_function=euclidean_similarity)
# bare expression: displays the matrix as the cell output
model1_euclidean_similarity_matrix

NameError: name 'get_similarity_matrix' is not defined

In [None]:
%%time
# Similarity matrix of model 2's word vectors, scored with euclidean_similarity.
model2_euclidean_similarity_matrix = get_similarity_matrix(wv=model2_language_model_pipeline.word2vec.wv,
                                                           similarity_function=euclidean_similarity)
# bare expression: displays the matrix as the cell output
model2_euclidean_similarity_matrix

In [None]:
%%time
# Similarity matrix of model 3's word vectors, scored with euclidean_similarity.
model3_euclidean_similarity_matrix = get_similarity_matrix(wv=model3_language_model_pipeline.word2vec.wv,
                                                           similarity_function=euclidean_similarity)
# bare expression: displays the matrix as the cell output
model3_euclidean_similarity_matrix

### Cosine similarity

In [None]:
%%time
# Similarity matrix of model 1's word vectors, scored with cosine_similarity.
model1_cosine_similarity_matrix = get_similarity_matrix(wv=model1_language_model_pipeline.word2vec.wv,
                                                           similarity_function=cosine_similarity)
# bare expression: displays the matrix as the cell output
model1_cosine_similarity_matrix

In [None]:
%%time
# Similarity matrix of model 2's word vectors, scored with cosine_similarity.
model2_cosine_similarity_matrix = get_similarity_matrix(wv=model2_language_model_pipeline.word2vec.wv,
                                                           similarity_function=cosine_similarity)
# bare expression: displays the matrix as the cell output
model2_cosine_similarity_matrix

In [None]:
%%time
# Similarity matrix of model 3's word vectors, scored with cosine_similarity.
model3_cosine_similarity_matrix = get_similarity_matrix(wv=model3_language_model_pipeline.word2vec.wv,
                                                           similarity_function=cosine_similarity)
# bare expression: displays the matrix as the cell output
model3_cosine_similarity_matrix

### Manhattan similarity

In [None]:
%%time
# Similarity matrix of model 1's word vectors, scored with manhattan_similarity.
model1_manhattan_similarity_matrix = get_similarity_matrix(wv=model1_language_model_pipeline.word2vec.wv,
                                                           similarity_function=manhattan_similarity)
# bare expression: displays the matrix as the cell output
model1_manhattan_similarity_matrix

In [None]:
%%time
# Similarity matrix of model 2's word vectors, scored with manhattan_similarity.
model2_manhattan_similarity_matrix = get_similarity_matrix(wv=model2_language_model_pipeline.word2vec.wv,
                                                           similarity_function=manhattan_similarity)
# bare expression: displays the matrix as the cell output
model2_manhattan_similarity_matrix

In [None]:
%%time
# Similarity matrix of model 3's word vectors, scored with manhattan_similarity.
model3_manhattan_similarity_matrix = get_similarity_matrix(wv=model3_language_model_pipeline.word2vec.wv,
                                                           similarity_function=manhattan_similarity)
# bare expression: displays the matrix as the cell output
model3_manhattan_similarity_matrix

In [77]:
# Key words whose nearest neighbours in model 1 are collected below; `key_words` is also
# read by later cells of this notebook.
key_words = ['president', 'covid', 'economic',
             'country', 'workers', 'health',
             'crisis', 'tax', 'law',
             'costs', 'legal', 'companys',
             'manufacturing', 'property']

# vectors of each word selected
embedding_clusters = []
# words selected, based on key words
word_clusters = []

# for every key word we find most similar
# words and set them into lists
# NOTE: the same logic is wrapped in select_words_and_embedding_clusters in a later cell.
for word in key_words:
    embedding = []
    words = []

    # the second element of each most_similar pair (named word_vector here) is not used
    for similar_word, word_vector in model1_language_model_pipeline.word2vec.wv.most_similar(word, topn=30):
        words.append(similar_word)
        embedding.append(model1_language_model_pipeline.word2vec.wv[similar_word])

    embedding_clusters.append(embedding)
    word_clusters.append(words)

In [96]:
from gensim.models import KeyedVectors

def select_words_and_embedding_clusters(word2vec_model: "KeyedVectors", key_words: List[str],
                                        topn: int = 30) -> Tuple[list, list]:
    """
        Collect, for every key word, its most similar words and their embedding vectors.

        :param word2vec_model: word vectors offering `most_similar` and lookup by word
        :param key_words: words whose nearest neighbours are looked up
        :param topn: number of most similar words requested per key word (30 by default)
        :return: tuple (embedding_clusters, word_clusters); both hold one inner list per key
                 word, in the order of `key_words`: the vectors of the similar words and the
                 similar words themselves.
    """
    # vectors of each word selected
    embedding_clusters = []
    # words selected, based on key words
    word_clusters = []

    for word in key_words:
        # most_similar returns (word, similarity score) pairs; only the word is needed here
        similar_words = [similar_word for similar_word, _ in word2vec_model.most_similar(word, topn=topn)]

        word_clusters.append(similar_words)
        embedding_clusters.append([word2vec_model[similar_word] for similar_word in similar_words])

    return embedding_clusters, word_clusters




In [97]:
# (embedding_clusters, word_clusters) for the key words, taken from model 1's word vectors.
embeddings = select_words_and_embedding_clusters(model1_language_model_pipeline.word2vec.wv, key_words)

In [106]:

# Transform clusters into NumPy Array
word_embedding_clusters = np.array(embeddings[0])
# Declare every axis of the embedding cluster: (key words, similar words per key word, vector size).
# The unpacking requires every key word to have the same number of similar words.
axis_0, axis_1, axis_2 = word_embedding_clusters.shape
# TSNE parameters used below:
#   perplexity:   related to the number of nearest neighbors used in other manifold learning algorithms
#   n_components: dimension of the embedded space
#   init:         initialization of the embedding
#   n_iter:       maximum number of iterations for the optimization (at least 250)
#   random_state: determines the random number generator
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` — confirm against the
# installed version before upgrading.
tsne_model_specific_words = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
# Flatten the clusters to one row per similar word before projecting to 2-D. This must use the
# NumPy array built above: the module-level `embedding_clusters` is a plain list without `reshape`.
fit_specific_words = tsne_model_specific_words.fit_transform(
    word_embedding_clusters.reshape(axis_0 * axis_1, axis_2))

In [159]:
def create_word_clusters_matrix(key_words: List[str], word_clusters: list,
                                tsne_words_model: np.ndarray) -> pd.DataFrame:
    """
        Build the table used to plot the word clusters: one row per similar word together with
        its key word and its 2-D coordinates.

        :param key_words: key words, one per cluster
        :param word_clusters: one list of similar words per key word, in the order of `key_words`
        :param tsne_words_model: array with two columns (X, Y) and one row per similar word, in
                                 the order obtained by flattening `word_clusters`
        :return: DataFrame with the columns 'word_cluster', 'word', 'X' and 'Y'
    """
    words_per_cluster = pd.DataFrame({'word_cluster': key_words, 'word': word_clusters},
                                     columns=['word_cluster', 'word'])
    # one row per (key word, similar word) pair, re-indexed so it lines up with the coordinates
    words_long = words_per_cluster.explode('word').reset_index(drop=True)
    coordinates = pd.DataFrame(tsne_words_model, columns=['X', 'Y'])

    return pd.concat([words_long, coordinates], axis=1)

In [160]:
# Combine the key words, their similar words (embeddings[1]) and the t-SNE coordinates.
df = create_word_clusters_matrix(key_words, embeddings[1], fit_specific_words)
# bare expression: displays the table as the cell output
df

Unnamed: 0,word_cluster,word,X,Y
0,president,the_president,-16.681551,40.985405
1,president,2/2015,-50.330994,29.013401
2,president,legislative,-6.457219,49.867496
3,president,the_republic,1.649387,52.978394
4,president,the_general_tax_law,-62.624191,20.209379
...,...,...,...,...
415,property,sustaining,-25.640423,-15.535745
416,property,the_regeneration,-77.978127,8.393215
417,property,primary,19.320084,-13.802938
418,property,building,26.817053,-4.491646


In [162]:
# Scatter plot of specific words clusters:
# one point per similar word at its (X, Y) coordinates, coloured by the key word it belongs to;
# the 'word' column is added to the hover data.
fig = px.scatter(df, x='X', y='Y',
                 color=df.word_cluster,
                 labels={'color': 'word'},
                 hover_data=["word"])
# bare expression: renders the figure as the cell output
fig