## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract nouns and noun phrases from each dataset's text
#### Train a word2vec model on each dataset's textual data

### Import libraries

In [20]:
import sys

# Make the project package importable from the notebook.
sys.path.append("/home/jovyan/work/sem-covid/")
# De-duplicate the search path while keeping its order: the order of sys.path
# decides which module wins when a name exists in more than one location, so
# it must not be shuffled (which is what a round-trip through set() does).
sys.path = list(dict.fromkeys(sys.path))

import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import warnings
warnings.filterwarnings("ignore")

import re
import time
import pickle
from typing import List, Tuple

import nltk
from nltk.corpus import words
import spacy
import enchant
import numpy as np
import pandas as pd
# from spacy.tokens.doc import Doc
from gensim.models import Word2Vec
# from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


from sem_covid.services.store_registry import store_registry
from sem_covid.adapters.embedding_models import BasicSentenceSplitterModel
from sem_covid.services.language_model_pipelines.language_model_execution_steps import LanguageModelExecutionSteps
from sem_covid.services.language_model_pipelines.language_model_pipeline import LanguageModelPipeline
from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.document_handling_tools import (
    document_atomization_noun_phrases, clean_text)
from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.similarity_calculus import (
    build_similarity_matrix)
from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.graph_handling import (
    create_graph_for_language_model_key_words)



# en_words = set(words.words())
# d = enchant.Dict("en_US")
# nltk.download('words')
# Word2Vec hyper-parameters (the commented-out in-notebook pipeline below
# passes them as window / min_count / vector_size).
VECTOR_SIZE = 300
WINDOW = 5
MIN_COUNT = 1

# Object-store location of the trained language models (bucket, then key prefix).
LANGUAGE_MODEL_BUCKET_NAME = 'mdl-language'
LANGUAGE_MODEL_MINIO_FOLDER = 'word2vec/'

# Shared spaCy English pipeline; max_length is raised to 5 million so that
# long texts are accepted.
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 5_000_000


# economic
## Define constants

In [4]:
# Names of the text-bearing columns, one list per dataset layout.
# DEFAULT_TEXTUAL_CLASS is the one used for the unified dataset in this notebook.
DEFAULT_TEXTUAL_CLASS = [
    'Title',
    'Content',
]

PWDB_TEXTUAL_CLASS = [
    'title',
    'background_info_description',
    'content_of_measure_description',
    'country',
    'category',
    'subcategory',
    'target_groups',
]

EU_CELLAR_TEXTUAL_CLASS = [
    'title',
    'content',
    'eurovoc_concept_labels',
    'subject_matter_labels',
    'directory_codes_labels',
]

IRELAND_ACTION_TIMELINE_CLASS = [
    'title',
    'content',
    'keyword',
]

EU_ACTION_TIMELINE_CLASS = [
    'abstract',
    'title',
    'topics',
    'detail_content',
]

# Generic key words, grouped under the single 'keywords' entry of WORDS_PACK2.
KEY_WORDS_FOR_ALL_MODELS = [
    "eu", "national", "work", "aid", "coronavirus", "covid19", "measures",
    "vaccine", "minister", "government", "organisations", "agreement",
    "unemployment", "insurance", "reorientation", "economy", "economic",
    "innovation", "research", "development", "risk", "transport",
]

# Thematic word lists; each one becomes an entry of WORDS_PACK1.
COUNTRIES = [
    'austria', 'belgium', 'bulgaria', 'croatia', 'cyprus', 'czechia',
    'denmark', 'estonia', 'european_union', 'finland', 'france', 'germany',
    'greece', 'hungary', 'ireland', 'italy', 'latvia', 'lithuania',
    'luxembourg', 'malta', 'netherlands', 'norway', 'poland', 'portugal',
    'romania', 'slovakia', 'slovenia', 'spain', 'sweden', 'united_kingdom',
]

CATEGORY = [
    'retention', 'workplace', 'labour', 'recovery', 'adaptation', 'protection',
    'essential', 'business_continuity', 'services', 'social', 'market',
]

SUBCATEGORY = [
    'safety', 'arrangements', 'health', 'spending', 'working', 'support',
    'occupational', 'stimulus_packages', 'access', 'time', 'finance', 'remote',
    'flexibility', 'essential_services', 'remuneration',
]

TARGET_GROUPS_L1 = ['businesses', 'workers', 'citizens']

TARGET_GROUPS_L2 = [
    'company', 'older', 'people', 'female', 'aged', 'corporations', 'single',
    'person', 'forms', 'smes', 'ups', 'single_parents', 'citizens',
    'professions', 'parents', 'groups', 'youth', 'sector', 'women',
    'unemployed', 'care', 'facilities', 'standard', 'specific', 'contractors',
    'children', 'border', 'refugees', 'minors', 'platform', 'employment',
    'seasonal', 'disabled', 'migrants', 'risk_group', 'commuters',
]

FUNDING = [
    'companies', 'national_funds', 'employer', 'funds', 'european_funds',
    'no_special_funding_required', 'regional_funds', 'local_funds',
    'employers_organization', 'employees',
]

# Word packs map a group name to its list of key words.
WORDS_PACK1 = dict(
    category=CATEGORY,
    subcategory=SUBCATEGORY,
    countries=COUNTRIES,
    target_groups_l1=TARGET_GROUPS_L1,
    target_groups_l2=TARGET_GROUPS_L2,
    funding=FUNDING,
)

WORDS_PACK2 = dict(keywords=KEY_WORDS_FOR_ALL_MODELS)

# One word pack per model; presumably positionally aligned with MODEL_NAMES.
# NOTE(review): the first two entries are the same WORDS_PACK1 object - confirm this is intended.
MODEL_WORDS_PACKS = (WORDS_PACK1, WORDS_PACK1, WORDS_PACK2)

# Identifiers of the three language models trained in this notebook.
MODEL_NAMES = ('model1', 'model2', 'model3')

# File name of each model's serialized language model, in MODEL_NAMES order.
FILE_NAMES = tuple(f'{model_name}_language_model.model' for model_name in MODEL_NAMES)

# Bucket in which the similarity matrices are stored.
SIMILARITY_MATRIX_BUCKET_NAME = 'semantic-similarity-matrices'

# File name of each model's cosine similarity matrix, in MODEL_NAMES order.
COSINE_SIMILARITY_MATRICES = tuple(f'{model_name}_cosine_matrix.pkl' for model_name in MODEL_NAMES)

## Data preprocessing
- data cleanup
- turn corpus into spacy document


## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline

## Experiment Nr#2 language model based on:
- eu-cellar

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar


In [5]:
# Load the unified dataset index from the Elasticsearch-backed store as a DataFrame.
ds_unified = (
    store_registry
    .es_index_store()
    .get_dataframe('ds_unified_datasets')
)

100% (4126 of 4126) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [6]:
# Model 1: every document that does not come from eu_cellar.
model1_df = ds_unified.loc[ds_unified['Document_source'] != 'eu_cellar']
# Model 2: eu_cellar documents only.
model2_df = ds_unified.loc[ds_unified['Document_source'] == 'eu_cellar']
# Model 3: the complete unified dataset (same object as ds_unified, not a copy).
model3_df = ds_unified


In [7]:
# One (dataframe, textual columns) pair per model, in model1..model3 order;
# all of them read the same DEFAULT_TEXTUAL_CLASS columns.
MODEL_DATASET_SOURCES_CONFIGS = [
    (model_df, DEFAULT_TEXTUAL_CLASS)
    for model_df in (model1_df, model2_df, model3_df)
]

In [None]:
# def clean_text(text: str) -> str:
#     tmp_word_list = re.sub(' +', ' ', re.sub(r'[^a-zA-Z_ ]', ' ', text)).lower().split(' ')
#     result_words = [word
#                     for word in tmp_word_list
#                     if len(word) > 3 and (word in en_words or d.check(word))]
#     if len(result_words) > 3:
#         return ' '.join(result_words)
#     else:
#         return ''

In [None]:
# def apply_cleaning_functions(document_corpus: list) -> list:
#     """
#     This function receives the document and leads through cleaning steps
#     Args:
#         document_corpus: dataset document corpus
#
#     Returns: clean document corpus
#     """
#     splitter = BasicSentenceSplitterModel()
#     textual_data = '. '.join(document_corpus)
#     splitted_text = splitter.split(textual_data)
#     splitted_long_text = [sent for sent in splitted_text if len(sent) > 10]
#     cleaned_text = list(set([sent for sent in list(map(clean_text, splitted_long_text)) if sent]))
#     return cleaned_text


In [None]:
# def document_atomization_noun_phrases(document: Doc) -> str:
#     """
#         Detects each noun phrase from inserted spacy document and transforms it
#         into integrate single token
#         :document: spacy document
#         :return: The same document, but with atomized noun phrases
#     """
#     sentence = ' '.join([word.lemma_ for word in document])
#     for noun_phrase in document.noun_chunks:
#         noun_phrase_lemma = [x.lemma_ for x in noun_phrase]
#         sequence = " ".join(
#             [token for token in noun_phrase_lemma if token != "" and token != " "])
#         sequence_without_stopwords = remove_stopwords(sequence)
#         sentence = sentence.replace(sequence, sequence_without_stopwords.replace(' ', '_'))
#
#     return remove_stopwords(sentence)

In [113]:
# class LanguageModelPipeline:
#     """
#         This pipeline executes the steps for word2vec language training.
#     """
#
#     def __init__(self, dataset_source: pd.DataFrame, textual_columns: List[str], language_model_name: str):
#         """
#             :param dataset_sources: represents the source of the datasets.
#         """
#         self.dataset_source = dataset_source
#         self.textual_columns = textual_columns
#         self.language_model_name = language_model_name
#         self.documents_corpus = pd.Series()
#         self.word2vec = None
#         self.steps = [self.extract_textual_data,
#                       self.clean_textual_data,
#                       self.transform_to_spacy_doc,
#                       self.extract_features,
#                       self.model_training,
#                       self.save_language_model
#                       ]
#
#     def extract_textual_data(self):
#         """
#             After downloading the datasets, the textual columns are selected and concatenated
#             row by row: NaN values are replaced with an empty string and the column values
#             of each row are joined with '. ' into a single text per document.
#         """
#         self.documents_corpus = (self.dataset_source[self.textual_columns]
#                                  .fillna(value="")
#                                  .agg('. '.join, axis=1).values
#                                  )
#
#     def clean_textual_data(self):
#         """
#             The next step is data cleaning. In this step the function "apply_cleaning_functions"
#             applies the following actions:
#                 - clean the document from specific characters
#                 - delete unicode
#                 - removes emails and URLs and currency symbols
#         """
#         self.documents_corpus = apply_cleaning_functions(self.documents_corpus)
#
#     def transform_to_spacy_doc(self):
#         """
#             When the document is clean, is going to be transform into spacy document
#         """
#         self.documents_corpus = list(nlp.pipe(self.documents_corpus))
#
#     def extract_features(self):
#         """
#             To extract the parts of speech, below it was defined classes for each token is necessary.
#         """
#         tmp_documents_corpus = self.documents_corpus.copy()
#         tmp_documents_corpus = [
#             ' '.join([word.lemma_ for word in document])
#             for document in tmp_documents_corpus
#         ]
#         self.documents_corpus = list(map(document_atomization_noun_phrases, self.documents_corpus))
#         self.documents_corpus = self.documents_corpus + tmp_documents_corpus
#
#     def model_training(self):
#         """
#             When the data is prepared it's stored into Word2Vec model.
#         """
#         self.word2vec = Word2Vec(sentences=list(map(lambda x: x.split(' '), self.documents_corpus)),
#                                  window=WINDOW,
#                                  min_count=MIN_COUNT, vector_size=VECTOR_SIZE)
#
#     def save_language_model(self):
#         """
#             Saves trained model in MinIO
#         """
#         minio = store_registry.minio_object_store(LANGUAGE_MODEL_BUCKET_NAME)
#         minio.put_object(LANGUAGE_MODEL_MINIO_FOLDER + self.language_model_name, pickle.dumps(self.word2vec))
#
#     def execute(self):
#         """
#             The final step is execution, where are stored each step and it will be executed in a row
#         """
#         for step in self.steps:
#             start_time = time.time()
#             print(f'Start: {step.__name__}')
#             step()
#             end_time = time.time()
#             print(f'Finish: {step.__name__}')
#             print(f'Time elapsed: {round(end_time - start_time, 4)} seconds')


In [214]:
# Run the language model pipeline on the eu_cellar subset (model2).
lang_model_pipeline = LanguageModelPipeline(
    dataset_source=model2_df,
    textual_columns=DEFAULT_TEXTUAL_CLASS,
    language_model_name=MODEL_NAMES[1],
)
lang_model_pipeline.execute()

# Keep a direct handle on the trained model for the cells below.
word2vec = lang_model_pipeline.word2vec


Start: extract_textual_data
Finish: extract_textual_data
Time elapsed: 0.0321 seconds
Start: clean_textual_data
Finish: clean_textual_data
Time elapsed: 9.9328 seconds
Start: transform_to_spacy_doc
Finish: transform_to_spacy_doc
Time elapsed: 184.0714 seconds
Start: extract_features
Finish: extract_features
Time elapsed: 6.3946 seconds
Start: model_training
Finish: model_training
Time elapsed: 23.3179 seconds
Start: save_language_model
Finish: save_language_model
Time elapsed: 2.9704 seconds


In [202]:
def get_filtered_words(corpus):
    """
        Selects noun and adjective tokens among the corpus terms, after discarding
        the terms with the highest IDF weights.
    Args:
        corpus: iterable of raw text documents, as accepted by TfidfVectorizer

    Returns: list with the text of every token tagged NOUN or ADJ by the module-level
        spaCy pipeline ``nlp``, in the order the terms were processed
    """
    tfidf_vec = TfidfVectorizer(use_idf=True)
    # Only the fitted vocabulary and IDF weights are used below, so the TF-IDF
    # matrix itself is never built.
    tfidf_vec.fit(raw_documents=corpus)
    idf = tfidf_vec.idf_
    weighted_words = [(term, idf[column]) for term, column in tfidf_vec.vocabulary_.items()]
    tmp_df = pd.DataFrame(weighted_words, columns=['word', 'idf'])
    # groupby sorts its keys, so tail(2).index[0] is the second-highest distinct IDF
    # value (or the only one, when all terms share the same IDF).
    filter_limit = tmp_df.groupby(by='idf').agg(['count']).tail(2).index[0]
    # Strict comparison: terms at the two highest IDF levels are dropped.
    candidate_words = tmp_df[tmp_df.idf < filter_limit].word.values.tolist()
    kept_tags = ('NOUN', 'ADJ')
    return [str(token)
            for document in nlp.pipe(candidate_words)
            for token in document
            if token.pos_ in kept_tags]

In [215]:
# spaCy re-tokenizes the TF-IDF terms, so a resulting token is not guaranteed to be
# a key of the word2vec vocabulary; such tokens are skipped instead of raising
# KeyError, which keeps filtered_words and vectors aligned one-to-one.
filtered_words = [
    word
    for word in get_filtered_words(lang_model_pipeline.documents_corpus)
    if word in word2vec.wv.key_to_index
]

vectors = [word2vec.wv.vectors[word2vec.wv.key_to_index[word]] for word in filtered_words]

In [None]:
# Build one similarity matrix per metric for the model2 word vectors and store
# each of them in the similarity-matrix bucket.
similarity_functions = ['cosine', 'euclidean']
for similarity_function in similarity_functions:
    print('Start computing similarity matrix.')
    model_similarity_matrix = build_similarity_matrix(
        np.array(vectors),
        filtered_words,
        metric=similarity_function)
    print('Finish computing similarity matrix.')
    print('Save similarity matrix.')
    store_registry.minio_feature_store(SIMILARITY_MATRIX_BUCKET_NAME).put_features(
        features_name=f'model2_{similarity_function}_matrix.pkl',
        content=model_similarity_matrix
    )
    # Drop the reference before the next metric's matrix is computed.
    del model_similarity_matrix

### Generate D3 Graphs

#### Cosine similarity graph

In [6]:
def plot_graphs(pipeline: LanguageModelExecutionSteps, model_name: str, model_file_name: str,
                threshold: np.float64, word_graph_configs: dict, normalize_func) -> None:
    """
        Loads a stored similarity matrix, normalizes it element by element and generates
        one key-word graph per entry of the word graph configuration.
    Args:
        pipeline: language model execution steps, used to filter and select the key words
        model_name: the name of the model
        model_file_name: name of the similarity matrix features to load from the
            similarity-matrix bucket
        threshold: value forwarded to the graph builder as its metric threshold
        word_graph_configs: dictionary mapping a column name to its list of key words
        normalize_func: function applied to every cell of the similarity matrix
    """
    feature_store = store_registry.minio_feature_store(SIMILARITY_MATRIX_BUCKET_NAME)
    normalized_matrix = feature_store.get_features(model_file_name).applymap(normalize_func)
    for column_name, key_words in word_graph_configs.items():
        selected_key_words = pipeline.filter_language_model_words().select_key_words(
            key_words=key_words)
        create_graph_for_language_model_key_words(normalized_matrix,
                                                  selected_key_words,
                                                  model_name=model_name,
                                                  metric_threshold=threshold,
                                                  column_name=column_name)

In [7]:
def execute_language_model_pipeline(model_file_name: str,
                                    model_name: str,
                                    model_dataset_sources_config: List[tuple],
                                    #model_words_pack: dict,
                                    #cosine_similarity_matrix: str
                                    ):
    """
        Trains a language model and then its similarity matrices.
    Args:
        model_file_name: file name handed to the execution steps as the language model file name
        model_name: the name of the model
        model_dataset_sources_config: list of tuples forwarded to the language model training;
            presumably (dataframe, textual columns) pairs - confirm against the caller
    """
    execution_steps = LanguageModelExecutionSteps(model_name=model_name,
                                                  language_model_file_name=model_file_name)
    execution_steps.train_language_model(model_dataset_sources_config)
    execution_steps.train_similarity_matrices()
    # Graph generation is currently disabled:
    # plot_graphs(pipeline=execution_steps,
    #             model_name=model_name,
    #             model_file_name=cosine_similarity_matrix,
    #             threshold=0.6,
    #             word_graph_configs=model_words_pack,
    #             normalize_func=lambda x: 1 - x)
    # Drop the local reference to the execution steps once training is done.
    del execution_steps