## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract NOUN and NOUN PHRASES from each text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [1]:
import sys

sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))

import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import warnings

warnings.filterwarnings("ignore")

import numpy as np

from sem_covid.services.store_registry import store_registry
from sem_covid.services.language_model_execution_steps import LanguageModelExecutionSteps
from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.graph_handling import (
    create_graph_for_language_model_key_words)
from typing import List
import time

In [106]:

import pickle
import re
from typing import List, Tuple

import pandas as pd
import spacy
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import remove_stopwords

from sem_covid.services.store_registry import store_registry

from sem_covid.adapters.embedding_models import BasicSentenceSplitterModel

from nltk.corpus import words
import enchant

en_words = set(words.words())
d = enchant.Dict("en_US")

nlp = spacy.load('en_core_web_sm')
nlp.max_length = 5000000
WINDOW = 5
MIN_COUNT = 1
VECTOR_SIZE = 300
LANGUAGE_MODEL_MINIO_FOLDER = 'word2vec/'
LANGUAGE_MODEL_BUCKET_NAME = 'mdl-language'





def apply_cleaning_functions(document_corpus: list) -> list:
    """
    This function receives the document and leads through cleaning steps
    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    splitter = BasicSentenceSplitterModel()
    textual_data = '. '.join(document_corpus)
    splitted_text = splitter.split(textual_data)
    splitted_long_text = [sent for sent in splitted_text if len(sent) > 10]
    cleaned_text = list(set([sent for sent in list(map(clean_text, splitted_long_text)) if sent]))
    return cleaned_text


In [184]:
def clean_text(text: str) -> str:
    tmp_text = re.sub(' +', ' ', re.sub(r',', ' , ', re.sub(r'[^a-zA-Z_ ]', ' ', text))).lower()
    tmp_words = tmp_text.split(' ')
    result_words = []
    for word in tmp_words:
        if word==",":
            result_words.append(word)
        elif len(word)>3 and (word in en_words or d.check(word)):
            result_words.append(word)
    if len(result_words) > 3:
        return ' '.join(result_words)
    else:
        return ''

# economic
## Define constants

In [2]:
DEFAULT_TEXTUAL_CLASS = ['Title', 'Content']

PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'country', 'category', 'subcategory', 'target_groups']

EU_CELLAR_TEXTUAL_CLASS = ['title', 'content', 'eurovoc_concept_labels', 'subject_matter_labels',
                           'directory_codes_labels']

IRELAND_ACTION_TIMELINE_CLASS = ['title', 'content', 'keyword']

EU_ACTION_TIMELINE_CLASS = ['abstract', 'title', 'topics', 'detail_content']

KEY_WORDS_FOR_ALL_MODELS = ["eu", "national", "work", "aid", "coronavirus", "covid19", "measures",
                            "vaccine", "minister", "government", "organisations",
                            "agreement", "unemployment", "insurance", "reorientation", "economy",
                            "economic", "innovation", "research", "development", "risk", "transport"]

COUNTRIES = ['austria', 'belgium', 'bulgaria', 'croatia', 'cyprus', 'czechia', 'denmark', 'estonia',
             'european_union', 'finland', 'france', 'germany', 'greece', 'hungary', 'ireland', 'italy',
             'latvia', 'lithuania', 'luxembourg', 'malta', 'netherlands', 'norway', 'poland', 'portugal',
             'romania', 'slovakia', 'slovenia', 'spain', 'sweden', 'united_kingdom']

CATEGORY = ['retention', 'workplace', 'labour', 'recovery', 'adaptation',
            'protection', 'essential', 'business_continuity',
            'services', 'social', 'market']

SUBCATEGORY = ['safety', 'arrangements', 'health', 'spending', 'working', 'support', 'occupational',
               'stimulus_packages', 'access', 'time', 'finance', 'remote', 'flexibility',
               'essential_services', 'remuneration']

TARGET_GROUPS_L1 = ['businesses', 'workers', 'citizens']

TARGET_GROUPS_L2 = ['company', 'older', 'people', 'female', 'aged', 'corporations',
                    'single', 'person', 'forms', 'smes', 'ups', 'single_parents',
                    'citizens', 'professions', 'parents', 'groups', 'youth',
                    'sector', 'women', 'unemployed', 'care', 'facilities', 'standard',
                    'specific', 'contractors', 'children', 'border', 'refugees',
                    'minors', 'platform', 'employment', 'seasonal', 'disabled', 'migrants',
                    'risk_group', 'commuters']

FUNDING = ['companies', 'national_funds', 'employer', 'funds', 'european_funds', 'no_special_funding_required',
           'regional_funds', 'local_funds', 'employers_organization', 'employees']

WORDS_PACK1 = {'category': CATEGORY,
               'subcategory': SUBCATEGORY,
               'countries': COUNTRIES,
               'target_groups_l1': TARGET_GROUPS_L1,
               'target_groups_l2': TARGET_GROUPS_L2,
               'funding': FUNDING}

WORDS_PACK2 = {'keywords': KEY_WORDS_FOR_ALL_MODELS}

MODEL_WORDS_PACKS = (WORDS_PACK1, WORDS_PACK1, WORDS_PACK2)

MODEL_NAMES = ('model1', 'model2', 'model3')

FILE_NAMES = ('model1_language_model.model',
              'model2_language_model.model',
              'model3_language_model.model'
              )

SIMILARITY_MATRIX_BUCKET_NAME = 'semantic-similarity-matrices'

COSINE_SIMILARITY_MATRICES = ('model1_cosine_matrix.pkl',
                              'model2_cosine_matrix.pkl',
                              'model3_cosine_matrix.pkl'
                              )

## Data preprocessing
- data cleanup
- turn corpus into spacy document


## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline

## Experiment Nr#2 language model based on:
- eu-cellar

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar


In [3]:
ds_unified = store_registry.es_index_store().get_dataframe('ds_unified_datasets')

100% (4126 of 4126) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [4]:
model1_df = ds_unified.query('Document_source != "eu_cellar"')
model2_df = ds_unified.query('Document_source == "eu_cellar"')
model3_df = ds_unified


In [5]:
MODEL_DATASET_SOURCES_CONFIGS = (
    [
        (model1_df, DEFAULT_TEXTUAL_CLASS),
    ],
    [
        (model2_df, DEFAULT_TEXTUAL_CLASS),
    ],
    [
        (model3_df, DEFAULT_TEXTUAL_CLASS),
    ]
)


In [225]:



from spacy.tokens.doc import Doc


def document_atomization_noun_phrases(document: Doc) -> str:
    """
        Detects each noun phrase from inserted spacy document and transforms it
        into integrate single token
        :document: spacy document
        :return: The same document, but with atomized noun phrases
    """
    sentence = ' '.join([word.lemma_ for word in document])
    for noun_phrase in document.noun_chunks:
        noun_phrase_lemma = [x.lemma_ for x in noun_phrase]
        sequence = " ".join(
            [token for token in noun_phrase_lemma if token != "" and token != " "])
        sequence_without_stopwords = remove_stopwords(sequence)
        sentence = sentence.replace(sequence, sequence_without_stopwords.replace(' ', '_'))

    return sentence

In [223]:
doc = nlp('I am the president of unitated states')


In [230]:
for noun_phrase in doc.noun_chunks:
    print(noun_phrase.start_char)
    print(noun_phrase.end_char)


0
1
5
18
22
37


In [226]:
document_atomization_noun_phrases(doc)

'I be president of unitated_state'

In [175]:
class LanguageModelPipeline:
    """
        This pipeline executes the steps for word2vec language training.
    """

    def __init__(self, dataset_source: pd.DataFrame, textual_columns: List[str], language_model_name: str):
        """
            :param dataset_sources: represents the source of the datasets.
        """
        self.dataset_source = dataset_source
        self.textual_columns = textual_columns
        self.language_model_name = language_model_name
        self.documents_corpus = pd.Series()
        self.word2vec = None
        self.steps = [self.extract_textual_data,
                      self.clean_textual_data,
                      self.transform_to_spacy_doc,
                      self.extract_features,
                      self.model_training,
                      self.save_language_model
                      ]

    def extract_textual_data(self):
        """
            After downloading the datasets, the textual data will be found and and concatenated
            with executing of several steps as well. It will fill the NaN values with empty space,
            add a dot at the end of each concatenated column and reset the index.
        """
        self.documents_corpus = (self.dataset_source[self.textual_columns]
                                 .fillna(value="")
                                 .agg('. '.join, axis=1).values
                                 )

    def clean_textual_data(self):
        """
            The next step is data cleaning. In this step the function "apply_cleaning_functions"
            applies the following actions:
                - clean the document from specific characters
                - delete unicode
                - removes emails and URLs and currency symbols
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)


    def transform_to_spacy_doc(self):
        """
            When the document is clean, is going to be transform into spacy document
        """
        self.documents_corpus = list(nlp.pipe(self.documents_corpus))

    def extract_features(self):
        """
            To extract the parts of speech, below it was defined classes for each token is necessary.
        """
        self.documents_corpus = list(map(document_atomization_noun_phrases, self.documents_corpus))
        self.documents_corpus = list(map(remove_stopwords,self.documents_corpus))



    def model_training(self):
        """
            When the data is prepared it's stored into Word2Vec model.
        """
        self.word2vec = Word2Vec(sentences=list(map(lambda x: x.split(' '), self.documents_corpus)),
                                 window=WINDOW,
                                 min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

    def save_language_model(self):
        """
            Saves trained model in MinIO
        """
        minio = store_registry.minio_object_store(LANGUAGE_MODEL_BUCKET_NAME)
        minio.put_object(LANGUAGE_MODEL_MINIO_FOLDER + self.language_model_name, pickle.dumps(self.word2vec))

    def execute(self):
        """
            The final step is execution, where are stored each step and it will be executed in a row
        """
        for step in self.steps:
            start_time = time.time()
            print(f'Start: {step.__name__}')
            step()
            end_time = time.time()
            print(f'Finish: {step.__name__}')
            print(f'Time elapsed: {round(end_time - start_time, 4)} seconds')


In [185]:
lang_model_pipeline = LanguageModelPipeline(dataset_source=model1_df,
                                            textual_columns=DEFAULT_TEXTUAL_CLASS,
                                            language_model_name=MODEL_NAMES[0])

lang_model_pipeline.execute()

Start: extract_textual_data
Finish: extract_textual_data
Time elapsed: 0.1087 seconds
Start: clean_textual_data
Finish: clean_textual_data
Time elapsed: 3.6553 seconds
Start: transform_to_spacy_doc
Finish: transform_to_spacy_doc
Time elapsed: 58.8227 seconds
Start: extract_features
Finish: extract_features
Time elapsed: 1.7391 seconds
Start: model_training
Finish: model_training
Time elapsed: 4.0717 seconds
Start: save_language_model
Finish: save_language_model
Time elapsed: 1.0615 seconds


In [151]:
minio = store_registry.minio_object_store(LANGUAGE_MODEL_BUCKET_NAME)

In [186]:
keyed_vector = pickle.loads(minio.get_object(LANGUAGE_MODEL_MINIO_FOLDER + 'model1'))

In [187]:
len(keyed_vector.wv.index_to_key)

118320

In [188]:
words = keyed_vector.wv.index_to_key

In [189]:
words.sort()
words

['',
 'abandon',
 'abandon_loyalty',
 'abate',
 'abattoir_blood_sample',
 'abbey',
 'abbey_arts_council_screen',
 'abbey_convenience_retail_consulting_green_cross_pharmacy_limerick_university_pharmacy_blackberry',
 'abbreviate',
 'abbreviate_special_follow_initial_regulation',
 'abdicate',
 'abdominal_pain_nausea_fatigue',
 'abhorrent',
 'abide',
 'abide_government_guideline',
 'abide_health_safety_precaution',
 'abide_medical_advice_self_isolate',
 'abide_public_health_advice',
 'abigail',
 'ability',
 'ability_analyse_knowledge',
 'ability_company',
 'ability_condense_idea',
 'ability_county_county',
 'ability_enterprise',
 'ability_institution',
 'ability_mount_immune_response',
 'ability_nation',
 'ability_point',
 'ability_provider',
 'ability_purchase_grocery',
 'ability_recover',
 'ability_transform_bathroom',
 'ability_virus_cause',
 'ability_west',
 'ability_work',
 'able',
 'able_access',
 'able_access_care',
 'able_access_subject_percentage_mark',
 'able_access_support',
 'a

In [190]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()

In [191]:
transformed = tfidf_vec.fit_transform(raw_documents=lang_model_pipeline.documents_corpus)

In [203]:
tmp_words = list(tfidf_vec.vocabulary_.items())

In [199]:
list(tfidf_vec.vocabulary_.items())

[('covid_specific_support', 19746),
 ('assume', 6422),
 ('unwound', 111571),
 ('only_exception', 65504),
 ('this_provision_seasonal_worker', 106346),
 ('temporary_contract', 98484),
 ('minister', 58485),
 ('very_glad_government', 112757),
 ('agree', 3056),
 ('phase_reopening_construction_sector', 71677),
 ('similarly_labour_market_number_person', 89117),
 ('pandemic_unemployment_payment', 68813),
 ('fall', 33552),
 ('cent', 11703),
 ('peak', 70734),
 ('early', 26643),
 ('large_decline', 50766),
 ('early_august', 26657),
 ('come', 14435),
 ('know', 50311),
 ('hard', 41846),
 ('continue', 17328),
 ('adhere_public_health_measure', 2325),
 ('individual', 46854),
 ('effort', 27866),
 ('need', 62832),
 ('risk_delta', 84087),
 ('variant_highly_transmissible_strain', 112339),
 ('pose', 72725),
 ('significant_threat', 88912),
 ('unvaccinate', 111553),
 ('partially', 69505),
 ('vaccinate', 112030),
 ('introduction', 49191),
 ('phase_june_level_payment_structure_link_pandemic_unemployment_payment

In [207]:
tmp_words

[('covid_specific_support', 19746),
 ('assume', 6422),
 ('unwound', 111571),
 ('only_exception', 65504),
 ('this_provision_seasonal_worker', 106346),
 ('temporary_contract', 98484),
 ('minister', 58485),
 ('very_glad_government', 112757),
 ('agree', 3056),
 ('phase_reopening_construction_sector', 71677),
 ('similarly_labour_market_number_person', 89117),
 ('pandemic_unemployment_payment', 68813),
 ('fall', 33552),
 ('cent', 11703),
 ('peak', 70734),
 ('early', 26643),
 ('large_decline', 50766),
 ('early_august', 26657),
 ('come', 14435),
 ('know', 50311),
 ('hard', 41846),
 ('continue', 17328),
 ('adhere_public_health_measure', 2325),
 ('individual', 46854),
 ('effort', 27866),
 ('need', 62832),
 ('risk_delta', 84087),
 ('variant_highly_transmissible_strain', 112339),
 ('pose', 72725),
 ('significant_threat', 88912),
 ('unvaccinate', 111553),
 ('partially', 69505),
 ('vaccinate', 112030),
 ('introduction', 49191),
 ('phase_june_level_payment_structure_link_pandemic_unemployment_payment

In [194]:
def custom_cmp(item1, item2):
    return item1[1] - item2[1]

In [195]:
from functools import cmp_to_key

In [206]:
sorted(tmp_words, key=lambda x: x[1], reverse=True)

[('zoom_video_call', 118317),
 ('zoom_provision', 118316),
 ('zoom_available_people', 118315),
 ('zoom', 118314),
 ('zoo_animal_park_summer_camp', 118313),
 ('zoo', 118312),
 ('zone', 118311),
 ('zinc_mine_output', 118310),
 ('zinc_lead_copper_gold_bearing_quartz_vein', 118309),
 ('zimbabwe', 118308),
 ('zest', 118307),
 ('zero_three_year', 118306),
 ('zero_risk_option_reopen_school', 118305),
 ('zero_reduced_rate_sale_period_product_number_jurisdiction', 118304),
 ('zero_rate_website', 118303),
 ('zero_rate_personal_protective_equipment', 118302),
 ('zero_project_annual_conference', 118301),
 ('zero_project', 118300),
 ('zero_policy', 118299),
 ('zero_percent_repayment_period', 118298),
 ('zero_interest_rate', 118297),
 ('zero_interest_phase', 118296),
 ('zero_hour_contract_call_contract', 118295),
 ('zero_hour_contract', 118294),
 ('zero_hour', 118293),
 ('zero_greenhouse_emission', 118292),
 ('zero_enhance_biodiversity', 118291),
 ('zero_employee', 118290),
 ('zero_emission_governme

### Generate D3 Graphs

#### Cosine similarity graph

In [6]:
def plot_graphs(pipeline: LanguageModelExecutionSteps, model_name: str, model_file_name: str,
                threshold: np.float64, word_graph_configs: dict, normalize_func) -> None:
    """
        steps of generating d3 graph, calling the similarity matrix from minio and normalizing it.
    Args:
        pipeline: Pipeline of language model execution stems
        model_name: the name of the model
        model_file_name: word2vec file name from MinIO
        threshold: the minimum of similarity number
        word_graph_configs: dictionary of key words
        normalize_func: function of similarity normalization
    """
    model_cosine_matrix = store_registry.minio_feature_store(SIMILARITY_MATRIX_BUCKET_NAME).get_features(
        model_file_name)
    model_cosine_matrix = model_cosine_matrix.applymap(normalize_func)
    for key in word_graph_configs.keys():
        create_graph_for_language_model_key_words(model_cosine_matrix,
                                                  pipeline.filter_language_model_words().select_key_words(
                                                      key_words=word_graph_configs[key]),
                                                  model_name=model_name,
                                                  metric_threshold=threshold, column_name=key)

In [7]:
def execute_language_model_pipeline(model_file_name: str,
                                    model_name: str,
                                    model_dataset_sources_config: List[tuple],
                                    #model_words_pack: dict,
                                    #cosine_similarity_matrix: str
                                    ):
    start = time.time()
    print(f'Start execution for {model_name}:')
    model_execution_steps = LanguageModelExecutionSteps(language_model_file_name=model_file_name,
                                                        model_name=model_name)
    model_execution_steps.train_language_model(model_dataset_sources_config)
    model_execution_steps.train_similarity_matrices()
    # plot_graphs(pipeline=model_execution_steps,
    #             model_name=model_name,
    #             model_file_name=cosine_similarity_matrix,
    #             threshold=0.6,
    #             word_graph_configs=model_words_pack,
    #             normalize_func=lambda x: 1 - x)
    del model_execution_steps
    end = time.time()
    print(f'Execution finish for {model_name} in:')
    print(round((end - start), 4), 'seconds')



In [8]:
start = 0
end = 1

In [9]:
for model_file_name, model_name, model_dataset_sources_config in zip(
        FILE_NAMES[start:end], MODEL_NAMES[start:end],
        MODEL_DATASET_SOURCES_CONFIGS[start:end],
        #MODEL_WORDS_PACKS,
        #COSINE_SIMILARITY_MATRICES
):
    execute_language_model_pipeline(model_file_name=model_file_name,
                                    model_name=model_name,
                                    model_dataset_sources_config=model_dataset_sources_config,
                                    #model_words_pack=model_words_pack,
                                    #cosine_similarity_matrix=cosine_similarity_matrix
                                    )



Start execution for model1:


KeyboardInterrupt: 

In [217]:
p = nlp('I am the president of united states')

In [221]:
remove_stopwords(str(list(p.noun_chunks)[1]))

'president'