## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract NOUN and NOUN PHRASES from each text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [2]:
# import sys
# sys.path.append("/home/jovyan/work/sem-covid/")
# sys.path = list(set(sys.path))

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import spacy
nlp = spacy.load('en_core_web_sm')

import pandas as pd
from gensim.models import Word2Vec

from sem_covid.services.sc_wrangling.data_cleaning import clean_text_from_specific_characters, clean_fix_unicode, \
    clean_remove_currency_symbols, clean_remove_emails, clean_remove_urls

from sem_covid.entrypoints.notebooks.topic_modeling.topic_modeling_wrangling.token_management import filter_stop_words, \
    select_pos, filter_stop_words_on_a_span_list

from sem_covid.services.data_registry import Dataset

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


### Download datasets and extract textual data
### Fetching the data

In [77]:
pwdb = Dataset.PWDB.fetch()
eu_timeline = Dataset.EU_ACTION_TIMELINE_ENRICHED.fetch()
ireland_timeline = Dataset.IRELAND_ACTION_TIMELINE_ENRICHED.fetch()
eu_cellar = Dataset.EU_CELLAR_ENRICHED.fetch()

pwdb.fillna(value="", inplace=True)
eu_timeline.fillna(value="", inplace=True)
ireland_timeline.fillna(value="", inplace=True)
eu_cellar.fillna(value="", inplace=True)

pwdb_document_corpus = pwdb['title'] + '. ' + \
                       pwdb['background_info_description'] + '. ' + \
                       pwdb['content_of_measure_description'] + '. ' + \
                       pwdb['use_of_measure_description'] + '. ' + \
                       pwdb['involvement_of_social_partners_description']

eu_timeline_document_corpus = eu_timeline['title']

ireland_timeline_document_corpus = ireland_timeline['title']

eu_cellar_document_corpus = eu_cellar['title']

## Data preprocessing
- data cleanup
- turn corpus into spacy document

In [21]:
def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    This function receives the document and leads through cleaning steps
    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    unused_characters = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                     "[", "]", "-", "_", "\r"]

    new_document_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=unused_characters)
    new_document_corpus = new_document_corpus.apply(clean_fix_unicode)
    new_document_corpus = new_document_corpus.apply(clean_remove_urls)
    new_document_corpus = new_document_corpus.apply(clean_remove_emails)
    new_document_corpus = new_document_corpus.apply(clean_remove_currency_symbols)

    return new_document_corpus

In [24]:
clean_pwdb_document = apply_cleaning_functions(pwdb_document_corpus)
clean_eu_timeline_document = apply_cleaning_functions(eu_timeline_document_corpus)
clean_ireland_timeline_document = apply_cleaning_functions(ireland_timeline_document_corpus)
clean_eu_cellar_document = apply_cleaning_functions(eu_cellar_document_corpus)

pwdb_spacy_corpus = clean_pwdb_document.apply(nlp)
eu_timeline_spacy_corpus = clean_eu_timeline_document.apply(nlp)
ireland_timeline_spacy_corpus = clean_ireland_timeline_document.apply(nlp)
eu_cellar_spacy_corpus = clean_eu_cellar_document.apply(nlp)

### Filter document tokens and select only NOUN and NOUN PHRASES

In [39]:
pwdb_noun_corpus = pwdb_spacy_corpus.apply(select_pos, pos="NOUN")
pwdb_noun_corpus = pwdb_noun_corpus.apply(lambda x: list(map(lambda docs: docs.lemma_, x)))

eu_timeline_noun_corpus = eu_timeline_spacy_corpus.apply(select_pos, pos="NOUN")
eu_timeline_noun_corpus = eu_timeline_noun_corpus.apply(lambda x: list(map(lambda docs: docs.lemma_, x)))

ireland_timeline_noun_corpus = ireland_timeline_spacy_corpus.apply(select_pos, pos="NOUN")
ireland_timeline_noun_corpus = ireland_timeline_noun_corpus.apply(lambda x: list(map(lambda docs: docs.lemma_, x)))

eu_cellar_noun_corpus = eu_cellar_spacy_corpus.apply(select_pos, pos="NOUN")
eu_cellar_noun_corpus = eu_cellar_noun_corpus.apply(lambda x: list(map(lambda docs: docs.lemma_, x)))

In [50]:
pwdb_noun_phrase_corpus = pwdb_spacy_corpus.apply(lambda x: x.noun_chunks)
pwdb_noun_phrase_corpus = pwdb_noun_phrase_corpus.apply(filter_stop_words_on_a_span_list)

eu_timeline_noun_phrase_corpus = eu_timeline_spacy_corpus.apply(lambda x: x.noun_chunks)
eu_timeline_noun_phrase_corpus = eu_timeline_noun_phrase_corpus.apply(filter_stop_words_on_a_span_list)

ireland_timeline_noun_phrase_corpus = ireland_timeline_spacy_corpus.apply(lambda x: x.noun_chunks)
ireland_timeline_noun_phrase_corpus = ireland_timeline_noun_phrase_corpus.apply(filter_stop_words_on_a_span_list)

eu_cellar_noun_phrase_corpus = eu_cellar_spacy_corpus.apply(lambda x: x.noun_chunks)
eu_cellar_noun_phrase_corpus = eu_cellar_noun_phrase_corpus.apply(filter_stop_words_on_a_span_list)

### Train Word2vec model based on extracted NOUNS and NOUN PHRASES

In [57]:
WINDOW = 5
MIN_COUNT = 10
VECTOR_SIZE = 300
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

m1_noun_word2vec = Word2Vec(sentences=pwdb_noun_corpus, window=WINDOW,
                            min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m1_noun_word2vec.train(eu_timeline_noun_corpus, total_examples=EU_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m1_noun_word2vec.train(ireland_timeline_noun_corpus, total_examples=IRELAND_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)

m1_noun_phrases_word2vec = Word2Vec(sentences=pwdb_noun_phrase_corpus, window=WINDOW,
                                    min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m1_noun_phrases_word2vec.train(eu_timeline_noun_phrase_corpus,
                               total_examples=EU_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m1_noun_phrases_word2vec.train(ireland_timeline_noun_phrase_corpus,
                               total_examples=IRELAND_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)

(23096, 72200)

In [76]:
m2_noun_word2vec = Word2Vec(sentences=eu_cellar_noun_corpus, window=WINDOW,
                            min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m2_noun_phrases_word2vec = Word2Vec(sentences=eu_cellar_noun_phrase_corpus,
                                    window=WINDOW, min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

In [74]:
m3_noun_word2vec = Word2Vec(sentences=pwdb_noun_corpus, window=WINDOW,
                            min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m3_noun_word2vec.train(eu_timeline_noun_corpus, total_examples=EU_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m3_noun_word2vec.train(ireland_timeline_noun_corpus, total_examples=IRELAND_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m3_noun_word2vec.train(eu_cellar_noun_corpus, total_examples=EU_CELLAR_TOTAL_EXAMPLES, epochs=EPOCHS)


m3_noun_phrases_word2vec = Word2Vec(sentences=pwdb_noun_phrase_corpus, window=WINDOW,
                                    min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
m3_noun_phrases_word2vec.train(eu_timeline_noun_phrase_corpus, total_examples=EU_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m3_noun_phrases_word2vec.train(ireland_timeline_noun_phrase_corpus,
                               total_examples=IRELAND_TIMELINE_TOTAL_EXAMPLES, epochs=EPOCHS)
m3_noun_phrases_word2vec.train(eu_cellar_noun_phrase_corpus, total_examples=EU_CELLAR_TOTAL_EXAMPLES, epochs=EPOCHS)

(532854, 1293000)