## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract nouns and noun phrases from each dataset's text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [160]:
import sys

import numpy as np

sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))
import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import warnings

from spacy.tokens.doc import Doc

warnings.filterwarnings("ignore", category=DeprecationWarning)

import spacy

nlp = spacy.load('en_core_web_sm')
from typing import List, Tuple
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors
from sem_covid.adapters.data_source import IndexTabularDataSource
from sem_covid.services.sc_wrangling.data_cleaning import (clean_text_from_specific_characters, clean_fix_unicode,
                                                           clean_remove_currency_symbols, clean_remove_emails,
                                                           clean_remove_urls)

from sem_covid.entrypoints.notebooks.topic_modeling.topic_modeling_wrangling.token_management import (select_pos,
                                                                                                      filter_stop_words_on_a_span_list)

from sem_covid.services.data_registry import Dataset

## Define constants

In [61]:
# Textual columns that are joined into one document per row for the PWDB dataset.
PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'use_of_measure_description', 'involvement_of_social_partners_description']

# Textual column used for every other dataset in the experiment cells.
DEFAULT_TEXTUAL_COLUMN = ['title']
# Word2Vec hyper-parameters passed in LanguageModelPipeline.model_training.
WINDOW = 5
MIN_COUNT = 1
VECTOR_SIZE = 300
# NOTE(review): EPOCHS and the *_TOTAL_EXAMPLES values below are not referenced by the code
# visible in this notebook — presumably meant for Word2Vec training calls; confirm or remove.
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

## Data preprocessing
- data cleanup
- turn corpus into spacy document

In [3]:
def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    Run every document of the corpus through the cleaning helpers, in a fixed order:
    specific-character removal, unicode fixing, URL removal, e-mail removal and
    currency-symbol removal.

    Args:
        document_corpus: dataset document corpus

    Returns: the series produced by the last cleaning step
    """
    characters_to_strip = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                           "[", "]", "-", "_", "\r"]

    # The first step is the only one that needs an extra argument, so it is applied on its own.
    cleaned_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=characters_to_strip)

    # The remaining steps take just the text and run one after another.
    for cleaning_step in (clean_fix_unicode, clean_remove_urls, clean_remove_emails, clean_remove_currency_symbols):
        cleaned_corpus = cleaned_corpus.apply(cleaning_step)

    return cleaned_corpus

In [91]:
class LanguageModelPipeline:
    """
        This pipeline executes the steps for word2vec language training.

        Each step is a separate method; execute() runs them in order:
        download, text extraction, cleaning, spaCy parsing, feature extraction, training.
    """

    def __init__(self, dataset_sources: List[Tuple["IndexTabularDataSource", List[str]]]):
        """
            :param dataset_sources: (data source, textual column names) pairs. Each data source
                                    must offer a fetch() method whose result can be indexed by
                                    the given column names like a pandas DataFrame.
        """
        self.dataset_sources = dataset_sources
        # Fetched (dataset, columns) pairs; filled by download_datasets(). Kept apart from
        # dataset_sources so the configuration survives and the pipeline can be run again.
        self.datasets = []
        # Explicit dtype: an empty Series without one triggers a pandas deprecation warning.
        self.documents_corpus = pd.Series(dtype=object)
        self.word2vec = None

    def download_datasets(self):
        """
            Fetch every configured data source and remember it together with its
            selected columns. As many datasets are fetched as there are configured sources.
        """
        self.datasets = [(dataset_source.fetch(), dataset_columns)
                         for dataset_source, dataset_columns in self.dataset_sources]

    def extract_textual_data(self):
        """
            After downloading the datasets, the selected textual columns of every dataset are
            merged into one document per row: missing values are replaced with an empty string
            and the column values are joined with '. '. The documents of all datasets are then
            concatenated into a single series with a fresh index.
        """
        self.documents_corpus = pd.concat([dataset[columns]
                                          .fillna(value="")
                                          .agg('. '.join, axis=1)
                                          .reset_index(drop=True)
                                           for dataset, columns in self.datasets
                                           ], ignore_index=True)

    def clean_textual_data(self):
        """
            The next step is data cleaning. In this step the function "apply_cleaning_functions"
            applies the following actions:
                - clean the document from specific characters
                - fix unicode
                - remove URLs, emails and currency symbols
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)

    def transform_to_spacy_doc(self):
        """
            When the documents are clean, each of them is transformed into a spaCy document.
        """
        self.documents_corpus = self.documents_corpus.apply(nlp)

    def extract_features(self):
        """
            Build the training sentences: the corpus is extended with a second copy of every
            document in which noun phrases are merged into single tokens, then every document
            is turned into the list of its token strings.
        """

        def doc_atomization_noun_phrases(doc: Doc):
            """
                Rewrite the document text so that each noun chunk becomes one underscore-joined
                token, then parse the rewritten text again.

            :param doc: parsed spaCy document
            :return: spaCy document parsed from the rewritten text
            """
            sentence = str(doc)
            for noun_phrase in doc.noun_chunks:
                seq = str(noun_phrase)
                # str.replace rewrites every occurrence of the chunk text, not only this one.
                sentence = sentence.replace(seq, seq.replace(' ', '_'))
            return nlp(sentence)

        # Keep the original documents and append their noun-phrase-merged variants.
        self.documents_corpus = pd.concat([self.documents_corpus,
                                           self.documents_corpus.apply(doc_atomization_noun_phrases)]
                                          , ignore_index=True)
        self.documents_corpus = self.documents_corpus.apply(lambda x: list(map(str, x)))

    def model_training(self):
        """
            When the data is prepared, a Word2Vec model is trained on it and stored in self.word2vec.
        """
        # NOTE(review): the EPOCHS constant defined next to WINDOW is not passed here, so the
        # number of epochs is gensim's default — confirm whether that is intended.
        self.word2vec = Word2Vec(sentences=self.documents_corpus, window=WINDOW,
                                 min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

    def execute(self):
        """
            The final step is execution: the six steps below are run one after another.
            Only these methods are called; other methods added by subclasses are not invoked.
        """
        self.download_datasets()
        self.extract_textual_data()
        self.clean_textual_data()
        self.transform_to_spacy_doc()
        self.extract_features()
        self.model_training()


In [5]:
class LanguageModelPipelineNouns(LanguageModelPipeline):
    """
        LanguageModelPipeline variant that adds an extract_pos step,
        which reduces every document to the lemmas of its NOUN selection.
    """

    def extract_pos(self):
        """
            Apply select_pos with pos="NOUN" to every document, then replace each
            selected item with its lemma.
            NOTE(review): the execute() of LanguageModelPipeline in this file does not call
            extract_pos — confirm how this step is meant to be run.
        """

        def to_lemmas(selected_items):
            return [item.lemma_ for item in selected_items]

        self.documents_corpus = self.documents_corpus.apply(select_pos, pos="NOUN")
        self.documents_corpus = self.documents_corpus.apply(to_lemmas)


class LanguageModelPipelineNounPhrases(LanguageModelPipeline):
    """
        LanguageModelPipeline variant that adds an extract_pos step,
        which reduces every document to its NOUN PHRASES.
    """

    def extract_pos(self):
        """
            Replace every document with its noun chunks, then pass them through
            filter_stop_words_on_a_span_list.
            NOTE(review): the execute() of LanguageModelPipeline in this file does not call
            extract_pos — confirm how this step is meant to be run.
        """

        def noun_chunks_of(document):
            return document.noun_chunks

        self.documents_corpus = self.documents_corpus.apply(noun_chunks_of)
        self.documents_corpus = self.documents_corpus.apply(filter_stop_words_on_a_span_list)



## Experiment No. 1: language model based on:
- PWDB
- eu-timeline
- ireland-timeline

In [162]:
# Sources for this run, as (data source, textual columns) pairs. Only the Ireland timeline
# is active; the PWDB and EU timeline entries are currently commented out.
dataset_sources_config = [
    #(Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    #(Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]
# Run the base pipeline end to end; the trained model is stored on the pipeline as `.word2vec`.
noun_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
noun_language_model_pipeline.execute()

In [194]:
def euclidean_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """
    Turn the Euclidean distance between two vectors into a similarity score.

    The score is 1 / (1 + distance): 1.0 for identical vectors, approaching 0 as the
    vectors move apart.

    :param v1: first vector
    :param v2: second vector, same shape as v1
    :return: similarity coefficient in the interval (0, 1]
    """
    # The annotations use np.ndarray / float: the np.float alias no longer exists in
    # current NumPy and np.array is a factory function, not a type.
    similarity_coefficient = 1 / (1 + np.linalg.norm(v1 - v2))
    return similarity_coefficient

def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """
    Compute the cosine of the angle between two vectors.

    :param v1: first vector
    :param v2: second vector, same length as v1
    :return: dot(v1, v2) divided by the product of the two Euclidean norms; the division
             is not guarded, so a zero-length vector does not yield a meaningful score
    """
    # The annotations use np.ndarray / float: the np.float alias no longer exists in
    # current NumPy and np.array is a factory function, not a type.
    similarity_coefficient = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    return similarity_coefficient


def get_similarity_matrix(wv: "KeyedVectors", similarity_function) -> pd.DataFrame:
    """
    Compute the pairwise similarity of every key in a set of word vectors.

    :param wv: keyed vectors; must expose index_to_key and vector lookup by key
    :param similarity_function: callable taking two vectors and returning their similarity score
    :return: square data frame whose index and columns are both the keys of wv, with
             similarity_function(wv[row], wv[column]) in each cell
    """
    keys = list(wv.index_to_key)
    # Look every vector up once instead of once per cell.
    vectors = [wv[key] for key in keys]
    # Collect plain rows and build the frame in one go: DataFrame.append copied the whole
    # frame on every call and is no longer available in pandas 2.x.
    rows = [[similarity_function(row_vector, column_vector) for column_vector in vectors]
            for row_vector in vectors]
    return pd.DataFrame(rows, index=keys, columns=keys)

In [202]:
# Variant of get_similarity_matrix that builds the whole frame with a single
# DataFrame constructor call.
def get_similarity_matrix_v1(wv: "KeyedVectors", similarity_function) -> pd.DataFrame:
    """
    Compute the pairwise similarity of every key in a set of word vectors.

    :param wv: keyed vectors; must expose index_to_key and vector lookup by key
    :param similarity_function: callable taking two vectors and returning their similarity score
    :return: square data frame whose index and columns are both the keys of wv, with
             similarity_function(wv[row], wv[column]) in each cell
    """
    keys = list(wv.index_to_key)
    rows = [{column_key: similarity_function(wv[row_key], wv[column_key])
             for column_key in keys
             }
            for row_key in keys
            ]
    # Pass the keys themselves as the index: wrapping them in another list makes pandas
    # build a one-level MultiIndex of 1-tuples instead of a flat index.
    return pd.DataFrame(rows, columns=keys, index=keys)


In [192]:
%%time
# Time get_similarity_matrix_v1 with the Euclidean-distance-based similarity.
df = get_similarity_matrix_v1(wv=noun_language_model_pipeline.word2vec.wv, similarity_function=euclidean_similarity)

CPU times: user 38.5 s, sys: 1.2 s, total: 39.7 s
Wall time: 38.4 s


In [200]:
%%time
# Time get_similarity_matrix_v1 with the cosine similarity.
df = get_similarity_matrix_v1(wv=noun_language_model_pipeline.word2vec.wv, similarity_function=cosine_similarity)

CPU times: user 115 ms, sys: 4.06 ms, total: 119 ms
Wall time: 113 ms


In [188]:
%%time
# Time get_similarity_matrix with the Euclidean-distance-based similarity.
df = get_similarity_matrix(wv=noun_language_model_pipeline.word2vec.wv, similarity_function=euclidean_similarity)


CPU times: user 40.1 s, sys: 19.3 ms, total: 40.1 s
Wall time: 40.1 s


In [193]:
df

Unnamed: 0,for,to,of,and,the,minister,on,in,",",announces,...,social_farming,dogs,the_measurement,broader_living_standards,initiatives,euled,next_step,marine_protected_areas,ireland's_maritime_area,all_risk_groups
for,1.000000,0.951911,0.947571,0.956258,0.908373,0.870570,0.915606,0.926763,0.921064,0.875706,...,0.787285,0.787541,0.787130,0.785744,0.788661,0.786558,0.787796,0.789761,0.786751,0.787543
to,0.951911,1.000000,0.951528,0.949225,0.899460,0.861631,0.906145,0.917454,0.912570,0.865935,...,0.780127,0.780287,0.779763,0.778489,0.781207,0.778600,0.780350,0.782111,0.779499,0.780182
of,0.947571,0.951528,1.000000,0.944124,0.885983,0.849024,0.894797,0.905088,0.899220,0.853596,...,0.769048,0.769471,0.768915,0.767766,0.770158,0.768126,0.769827,0.771583,0.768503,0.769329
and,0.956258,0.949225,0.944124,1.000000,0.913641,0.875805,0.922270,0.931306,0.926641,0.880517,...,0.791921,0.792300,0.791907,0.790418,0.793042,0.790537,0.792169,0.794219,0.791248,0.792035
the,0.908373,0.899460,0.885983,0.913641,1.000000,0.933557,0.952978,0.949767,0.949722,0.937587,...,0.844992,0.844810,0.844702,0.843001,0.846351,0.843371,0.845059,0.847532,0.844026,0.844608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
euled,0.786558,0.778600,0.768126,0.790537,0.843371,0.880878,0.837063,0.826050,0.831560,0.875723,...,0.955181,0.954625,0.954493,0.954521,0.954689,1.000000,0.952700,0.956104,0.954511,0.955890
next_step,0.787796,0.780350,0.769827,0.792169,0.845059,0.881369,0.838564,0.828269,0.832797,0.877183,...,0.953788,0.952865,0.954011,0.954686,0.956742,0.952700,1.000000,0.952608,0.954621,0.956171
marine_protected_areas,0.789761,0.782111,0.771583,0.794219,0.847532,0.884338,0.841262,0.829949,0.835792,0.879971,...,0.956592,0.955095,0.955567,0.958140,0.955732,0.956104,0.952608,1.000000,0.954582,0.955316
ireland's_maritime_area,0.786751,0.779499,0.768503,0.791248,0.844026,0.880768,0.837900,0.826724,0.832032,0.876315,...,0.955867,0.953616,0.953385,0.955880,0.957022,0.954511,0.954621,0.954582,1.000000,0.957535


## Experiment No. 2: language model based on:
- eu-cellar


In [7]:
# Sources for this run, as (data source, textual columns) pairs: EU Cellar titles only.
dataset_sources_config = [
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
# NOTE(review): execute() as defined on LanguageModelPipeline does not call the extract_pos
# method these two subclasses add, so both runs go through the same steps — confirm intent.
noun_language_model_pipeline = LanguageModelPipelineNouns(dataset_sources=dataset_sources_config)
noun_language_model_pipeline.execute()

noun_phrases_language_model_pipeline = LanguageModelPipelineNounPhrases(dataset_sources=dataset_sources_config)
noun_phrases_language_model_pipeline.execute()

100% (2653 of 2653) |####################| Elapsed Time: 0:00:03 Time:  0:00:03


## Experiment No. 3: language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar

In [8]:
# Sources for this run, as (data source, textual columns) pairs: all four datasets.
# PWDB contributes several description columns, the others only their title.
dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
# NOTE(review): execute() as defined on LanguageModelPipeline does not call the extract_pos
# method these two subclasses add, so both runs go through the same steps — confirm intent.
noun_language_model_pipeline = LanguageModelPipelineNouns(dataset_sources=dataset_sources_config)
noun_language_model_pipeline.execute()

noun_phrases_language_model_pipeline = LanguageModelPipelineNounPhrases(dataset_sources=dataset_sources_config)
noun_phrases_language_model_pipeline.execute()


