## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract NOUN and NOUN PHRASES from each text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [4]:
import sys
sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))
import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

import spacy

nlp = spacy.load('en_core_web_sm')
from typing import List, Tuple
import pandas as pd
from gensim.models import Word2Vec
from sem_covid.adapters.data_source import IndexTabularDataSource
from sem_covid.services.sc_wrangling.data_cleaning import (clean_text_from_specific_characters, clean_fix_unicode
, clean_remove_currency_symbols, clean_remove_emails, clean_remove_urls)

from sem_covid.entrypoints.notebooks.topic_modeling.topic_modeling_wrangling.token_management import (filter_stop_words,
                                                                                                      select_pos,
                                                                                                      filter_stop_words_on_a_span_list)

from sem_covid.services.data_registry import Dataset

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


## Define constants

In [5]:
PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'use_of_measure_description', 'involvement_of_social_partners_description']

DEFAULT_TEXTUAL_COLUMN = ['title']
WINDOW = 5
MIN_COUNT = 10
VECTOR_SIZE = 300
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

## Data preprocessing
- data cleanup
- turn corpus into spacy document

In [6]:
def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    This function receives the document and leads through cleaning steps
    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    unused_characters = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                         "[", "]", "-", "_", "\r"]

    new_document_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=unused_characters)
    new_document_corpus = new_document_corpus.apply(clean_fix_unicode)
    new_document_corpus = new_document_corpus.apply(clean_remove_urls)
    new_document_corpus = new_document_corpus.apply(clean_remove_emails)
    new_document_corpus = new_document_corpus.apply(clean_remove_currency_symbols)

    return new_document_corpus

In [9]:
class LanguageModelPipeline:
    """

    """

    def __init__(self, dataset_sources: List[Tuple[IndexTabularDataSource, List[str]]]):
        """

        :param dataset_sources:
        """
        self.dataset_sources = dataset_sources
        self.documents_corpus = pd.Series()
        self.word2vec = None

    def download_datasets(self):
        """

        :return:
        """
        self.dataset_sources = [(dataset_columns, dataset_source.fetch())
                                for dataset_source, dataset_columns in self.dataset_sources]

    def extract_textual_data(self):
        """

        :return:
        """
        self.documents_corpus = pd.concat([dataset[columns]
                                          .fillna(value="")
                                          .agg('. '.join, axis=1)
                                          .reset_index(drop=True)
                                           for columns, dataset in self.dataset_sources
                                           ], ignore_index=True)

    def clean_textual_data(self):
        """

        :return:
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)

    def transform_to_spacy_doc(self):
        """

        :return:
        """
        self.documents_corpus = self.documents_corpus.apply(nlp)

    def extract_pos(self):
        """

        :return:
        """
        self.documents_corpus = self.documents_corpus.apply(select_pos, pos="NOUN")
        self.documents_corpus = self.documents_corpus.apply(lambda x: list(map(lambda docs: docs.lemma_, x)))

    def model_training(self):
        """

        :return:
        """
        self.word2vec = Word2Vec(sentences=self.documents_corpus, window=WINDOW,
                                 min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

    def execute(self):
        """

        :return:
        """
        self.download_datasets()
        self.extract_textual_data()
        self.clean_textual_data()
        self.transform_to_spacy_doc()
        self.extract_pos()
        self.model_training()

## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline

In [None]:
dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]
language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
language_model_pipeline.execute()

## Experiment Nr#2 language model based on:
- eu-cellar


In [None]:
dataset_sources_config = [
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
language_model_pipeline.execute()

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar

In [None]:
dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
language_model_pipeline.execute()




### Filter document tokens and select only NOUN and NOUN PHRASES

In [39]:
pwdb_noun_corpus = pwdb_spacy_corpus.apply(select_pos, pos="NOUN")
pwdb_noun_corpus = pwdb_noun_corpus.apply(lambda x: list(map(lambda docs: docs.lemma_, x)))

eu_timeline_noun_corpus = eu_timeline_spacy_corpus.apply(select_pos, pos="NOUN")
eu_timeline_noun_corpus = eu_timeline_noun_corpus.apply(lambda x: list(map(lambda docs: docs.lemma_, x)))

ireland_timeline_noun_corpus = ireland_timeline_spacy_corpus.apply(select_pos, pos="NOUN")
ireland_timeline_noun_corpus = ireland_timeline_noun_corpus.apply(lambda x: list(map(lambda docs: docs.lemma_, x)))

eu_cellar_noun_corpus = eu_cellar_spacy_corpus.apply(select_pos, pos="NOUN")
eu_cellar_noun_corpus = eu_cellar_noun_corpus.apply(lambda x: list(map(lambda docs: docs.lemma_, x)))

In [50]:
pwdb_noun_phrase_corpus = pwdb_spacy_corpus.apply(lambda x: x.noun_chunks)
pwdb_noun_phrase_corpus = pwdb_noun_phrase_corpus.apply(filter_stop_words_on_a_span_list)

eu_timeline_noun_phrase_corpus = eu_timeline_spacy_corpus.apply(lambda x: x.noun_chunks)
eu_timeline_noun_phrase_corpus = eu_timeline_noun_phrase_corpus.apply(filter_stop_words_on_a_span_list)

ireland_timeline_noun_phrase_corpus = ireland_timeline_spacy_corpus.apply(lambda x: x.noun_chunks)
ireland_timeline_noun_phrase_corpus = ireland_timeline_noun_phrase_corpus.apply(filter_stop_words_on_a_span_list)

eu_cellar_noun_phrase_corpus = eu_cellar_spacy_corpus.apply(lambda x: x.noun_chunks)
eu_cellar_noun_phrase_corpus = eu_cellar_noun_phrase_corpus.apply(filter_stop_words_on_a_span_list)

### Train Word2vec model based on extracted NOUNS and NOUN PHRASES

In [57]:
WINDOW = 5
MIN_COUNT = 10
VECTOR_SIZE = 300
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

m1_noun_word2vec = Word2Vec(sentences=pwdb_noun_corpus, window=WINDOW,
                            min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m1_noun_word2vec.train(eu_timeline_noun_corpus, total_examples=EU_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m1_noun_word2vec.train(ireland_timeline_noun_corpus, total_examples=IRELAND_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)

m1_noun_phrases_word2vec = Word2Vec(sentences=pwdb_noun_phrase_corpus, window=WINDOW,
                                    min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m1_noun_phrases_word2vec.train(eu_timeline_noun_phrase_corpus,
                               total_examples=EU_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m1_noun_phrases_word2vec.train(ireland_timeline_noun_phrase_corpus,
                               total_examples=IRELAND_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)

(23096, 72200)

In [76]:
m2_noun_word2vec = Word2Vec(sentences=eu_cellar_noun_corpus, window=WINDOW,
                            min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m2_noun_phrases_word2vec = Word2Vec(sentences=eu_cellar_noun_phrase_corpus,
                                    window=WINDOW, min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

In [74]:
m3_noun_word2vec = Word2Vec(sentences=pwdb_noun_corpus, window=WINDOW,
                            min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m3_noun_word2vec.train(eu_timeline_noun_corpus, total_examples=EU_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m3_noun_word2vec.train(ireland_timeline_noun_corpus, total_examples=IRELAND_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m3_noun_word2vec.train(eu_cellar_noun_corpus, total_examples=EU_CELLAR_TOTAL_EXAMPLES, epochs=EPOCHS)

m3_noun_phrases_word2vec = Word2Vec(sentences=pwdb_noun_phrase_corpus, window=WINDOW,
                                    min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m3_noun_phrases_word2vec.train(eu_timeline_noun_phrase_corpus, total_examples=EU_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m3_noun_phrases_word2vec.train(ireland_timeline_noun_phrase_corpus,
                               total_examples=IRELAND_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m3_noun_phrases_word2vec.train(eu_cellar_noun_phrase_corpus, total_examples=EU_CELLAR_TOTAL_EXAMPLES, epochs=EPOCHS)

(532854, 1293000)