## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract nouns and noun phrases from each dataset's textual data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [181]:
import sys
# Make the project root importable from inside the notebook.
sys.path.append("/home/jovyan/work/sem-covid/")
# Drop duplicate entries while keeping the search order intact: going through
# set() would shuffle sys.path and could change which module copy gets imported.
sys.path = list(dict.fromkeys(sys.path))

import os
os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from typing import List, Tuple

import spacy
nlp = spacy.load('en_core_web_sm')

import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from d3graph import d3graph
import plotly.express as px

from sem_covid.services.data_registry import Dataset
from sem_covid.adapters.data_source import IndexTabularDataSource

from sem_covid.entrypoints.notebooks.topic_modeling.topic_modeling_wrangling.token_management import select_pos

from sem_covid.services.sc_wrangling.data_cleaning import (clean_text_from_specific_characters, clean_fix_unicode,
                                                           clean_remove_currency_symbols, clean_remove_emails,
                                                           clean_remove_urls, clean_remove_stopwords)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.similarity_calculus import (
    euclidean_similarity, manhattan_similarity, cosine_similarity, get_similarity_matrix)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.document_handling_tools import(
    document_atomization_noun_phrases, lemmatize_document)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.word_embeddings_handler import (
    select_words_and_embedding_clusters, create_tsne_model, create_word_clusters_matrix)

## Define constants

In [13]:
# Textual PWDB columns; the pipeline joins them into a single document per row.
PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'use_of_measure_description', 'involvement_of_social_partners_description']

# Textual column used for the datasets that are configured with their title only.
DEFAULT_TEXTUAL_COLUMN = ['title']
# Word2Vec hyper-parameters passed in LanguageModelPipeline.model_training.
WINDOW = 5
MIN_COUNT = 1
VECTOR_SIZE = 300
# NOTE(review): EPOCHS and the *_TOTAL_EXAMPLES constants are not referenced in the
# visible part of this notebook. The totals match the record counts shown in the
# download progress output; confirm whether they are still needed.
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

## Data preprocessing
- data cleanup
- turn corpus into spacy document

In [182]:
def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    Run every document of the corpus through the cleaning steps, in a fixed order:
    specific characters first, then unicode fixing, URLs, e-mails, currency symbols
    and finally stopwords.
    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    unused_characters = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                         "[", "]", "-", "_", "\r"]

    # The character removal is the only step that needs an extra argument.
    cleaned_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=unused_characters)

    # The remaining steps take just the text, so they are applied one after another.
    for cleaning_step in (clean_fix_unicode,
                          clean_remove_urls,
                          clean_remove_emails,
                          clean_remove_currency_symbols,
                          clean_remove_stopwords):
        cleaned_corpus = cleaned_corpus.apply(cleaning_step)

    return cleaned_corpus


def generate_graph(similarity_matrix: pd.DataFrame, graph: nx.Graph, root_word: str, top_words: int,
                   threshold: np.float64 = 0.8, deep_level: int = 0, max_deep_level: int = 2) -> nx.Graph:
    """
    Recursively grow a graph of the words most similar to a root word.
    Args:
        similarity_matrix: word-by-word similarity matrix, indexed and labelled by word
        graph: graph that receives the new edges (modified in place)
        root_word: word whose column of the similarity matrix is inspected
        top_words: number of highest ranked words considered per root word
        threshold: minimum similarity a word needs in order to be linked to the root word
        deep_level: current recursion depth
        max_deep_level: deepest recursion level that is still expanded

    Returns: the same graph object, with the edges added
    """
    if deep_level > max_deep_level:
        return graph

    # Rank the root word's similarities once; labels and weights come from the same ranking.
    # NOTE(review): when the diagonal of the matrix holds the self-similarity (as in the matrices
    # displayed in this notebook) the root word ranks first and is linked to itself - confirm
    # that this self-loop is intended.
    closest_words = similarity_matrix[root_word].sort_values(ascending=False)[:top_words]
    for word, weight in closest_words.items():
        if weight >= threshold:
            graph.add_edge(root_word, word)
            generate_graph(similarity_matrix, graph, word, top_words, threshold, deep_level + 1, max_deep_level)

    return graph


def create_graph_for_language_model_key_words(similarity_matrix: pd.DataFrame, language_model_words: list,
                                              model_number: int) -> None:
    """
    !!! This is not a reusable function. It was made for a single purpose !!!

    For every selected word it builds a similarity graph with generate_graph and hands the
    graph's adjacency matrix to d3graph, using the word as the save name.
    Args:
        similarity_matrix: similarity matrix created with the language model words
        language_model_words: words for which a graph is generated
        model_number: number of the model, used in the output folder name

    Returns: nothing; d3graph is called with the save path and save name of each graph
    """
    graph_folder_path = f'docs/word-similarity-web/model{model_number}_graphs/'
    for word in language_model_words:
        graph = generate_graph(similarity_matrix, nx.Graph(), word,
                               top_words=4, threshold=0.5, max_deep_level=2)
        network_adjacency_matrix = pd.DataFrame(data=nx.adjacency_matrix(graph).todense(),
                                                index=graph.nodes(), columns=graph.nodes())
        d3graph(network_adjacency_matrix, savepath=graph_folder_path, savename=word,
                node_color=network_adjacency_matrix.columns.values, width=1920, height=1080, edge_width=5,
                edge_distance=60, directed=True)

In [183]:
class LanguageModelPipeline:
    """
        This pipeline executes the steps for word2vec language training:
        download, text extraction, cleaning, spaCy parsing, feature extraction and training.
    """

    def __init__(self, dataset_sources: List[Tuple[IndexTabularDataSource, List[str]]]):
        """
            :param dataset_sources: pairs of (data source, names of its textual columns).
        """
        self.dataset_sources = dataset_sources
        # (textual columns, fetched dataset) pairs; filled by download_datasets().
        self.datasets = []
        # An explicit dtype avoids the pandas warning about the default dtype of an empty Series.
        self.documents_corpus = pd.Series(dtype=object)
        self.word2vec = None

    def download_datasets(self):
        """
            Fetches every configured dataset and keeps it next to its textual columns.
            The result is stored in `self.datasets`; `self.dataset_sources` is left untouched,
            so this step (and `execute`) can be run more than once on the same pipeline.
        """
        self.datasets = [(dataset_columns, dataset_source.fetch())
                         for dataset_source, dataset_columns in self.dataset_sources]

    def extract_textual_data(self):
        """
            After downloading the datasets, the textual columns of each dataset are selected,
            their NaN values are replaced with empty strings and the columns of every row are
            joined with '. ' into one text. The texts of all datasets are concatenated into a
            single corpus with a fresh index.
        """
        self.documents_corpus = pd.concat([dataset[columns]
                                          .fillna(value="")
                                          .agg('. '.join, axis=1)
                                          .reset_index(drop=True)
                                           for columns, dataset in self.datasets
                                           ], ignore_index=True)

    def clean_textual_data(self):
        """
            The next step is data cleaning. In this step the function "apply_cleaning_functions"
            applies the following actions:
                - cleans the document from specific characters
                - applies the unicode fixing step
                - removes URLs, emails and currency symbols
                - removes stopwords
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)

    def transform_to_spacy_doc(self):
        """
            When the documents are clean, each of them is transformed into a spaCy document.
        """
        self.documents_corpus = self.documents_corpus.apply(nlp)

    def extract_features(self):
        """
            Extends the corpus with two derived versions of every document (the result of
            "document_atomization_noun_phrases" and of "lemmatize_document") and then turns
            every entry into a list of strings, the input format used for training.
        """
        self.documents_corpus = pd.concat([self.documents_corpus,
                                           self.documents_corpus.apply(document_atomization_noun_phrases),
                                           self.documents_corpus.apply(lemmatize_document)],
                                          ignore_index=True)

        self.documents_corpus = self.documents_corpus.apply(lambda x: list(map(str, x)))

    def model_training(self):
        """
            When the data is prepared, the Word2Vec model is trained on it.
        """
        # NOTE(review): the module-level EPOCHS constant is not passed here, so the gensim
        # default number of epochs applies - confirm that this is intended.
        self.word2vec = Word2Vec(sentences=self.documents_corpus, window=WINDOW,
                                 min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

    def execute(self):
        """
            Runs all the steps of the pipeline one after another.
        """
        self.download_datasets()
        self.extract_textual_data()
        self.clean_textual_data()
        self.transform_to_spacy_doc()
        self.extract_features()
        self.model_training()


class LanguageModelWordsFilter:
    """
        Selects words of a trained word2vec model by part of speech and gives access
        to their vocabulary indexes and embeddings.
    """

    def __init__(self, word2vec_model: Word2Vec, key_words: List[str], pos: List[str]) -> None:
        """
            :param word2vec_model: trained word2vec model whose vocabulary is filtered
            :param key_words: words of interest that are looked up among the extracted words
            :param pos: part of speech tags handed to "select_pos"
        """
        self.word2vec_model = word2vec_model
        self.key_words = key_words
        self.pos = pos
        self.word2vec_document = None

    def extract_pos(self) -> List[str]:
        """
            Transforms the word2vec vocabulary into a spaCy document and selects the
            configured parts of speech from it. The selected tokens are returned as strings.
        """
        # NOTE(review): the vocabulary is passed as the string representation of a list, so the
        # brackets, quotes and commas are part of the parsed text as well.
        self.word2vec_document = nlp(str(self.word2vec_model.wv.index_to_key))
        self.word2vec_document = select_pos(self.word2vec_document, self.pos)

        return list(map(str, self.word2vec_document))

    def select_key_words(self) -> List[str]:
        """
            Returns the key words that are present among the extracted parts of speech,
            in the order of the key word list.
        """
        # Run the extraction once for all key words instead of once per key word.
        extracted_words = set(self.extract_pos())
        return [word for word in self.key_words if word in extracted_words]

    def select_pos_index(self) -> list:
        """
            Returns the vocabulary index of every extracted token that is part of the
            model vocabulary; tokens that are not in the vocabulary are skipped.
        """
        # Dictionary lookups replace the linear list searches for every token.
        key_to_index = self.word2vec_model.wv.key_to_index
        return [key_to_index[token] for token in self.extract_pos() if token in key_to_index]

    def select_pos_embeddings(self) -> list:
        """
            Returns the embeddings that belong to the selected vocabulary indexes.
        """
        vectors = self.word2vec_model.wv.vectors
        return [vectors[index] for index in self.select_pos_index()]

## Experiment No. 1: language model based on:
- PWDB
- eu-timeline
- ireland-timeline

In [44]:
# Sources of experiment 1: every entry pairs a dataset with the textual columns read from it.
dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
]
# Build the pipeline for model 1 and run all of its steps.
model1_language_model_pipeline = LanguageModelPipeline(dataset_sources_config)
model1_language_model_pipeline.execute()

100% (1288 of 1288) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (171 of 171) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (410 of 410) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


## Experiment No. 2: language model based on:
- eu-cellar


In [16]:
# Sources of experiment 2: the EU Cellar dataset with its default textual column.
dataset_sources_config = [
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN),
]
# Build the pipeline for model 2 and run all of its steps.
model2_language_model_pipeline = LanguageModelPipeline(dataset_sources_config)
model2_language_model_pipeline.execute()

100% (2653 of 2653) |####################| Elapsed Time: 0:00:01 Time:  0:00:01


## Experiment No. 3: language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar

In [145]:
# Sources of experiment 3: the datasets of experiment 1 and experiment 2 together.
dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN),
]
# Build the pipeline for model 3 and run all of its steps.
model3_language_model_pipeline = LanguageModelPipeline(dataset_sources_config)
model3_language_model_pipeline.execute()

In [None]:
# Key words handed to the word filters; every word is listed exactly once
# ('support' used to appear twice).
key_words = ['work', 'agreement', 'working', 'companies', 'workers',
             'measures', 'temporary', 'social', 'support', 'covid19',
             '2020', 'public', 'national', 'ireland', 'statement', '2021',
             'announce', 'health', 'minister', 'new', 'billion', 'coronavirus',
             'vaccine', 'eur', 'million', 'commission', 'eu']

# One word filter per trained model, each configured with the NOUN and ADJ tags.
model1_language_model_filter = LanguageModelWordsFilter(
    word2vec_model=model1_language_model_pipeline.word2vec,
    key_words=key_words,
    pos=['NOUN', 'ADJ'],
)

model2_language_model_filter = LanguageModelWordsFilter(
    word2vec_model=model2_language_model_pipeline.word2vec,
    key_words=key_words,
    pos=['NOUN', 'ADJ'],
)

model3_language_model_filter = LanguageModelWordsFilter(
    word2vec_model=model3_language_model_pipeline.word2vec,
    key_words=key_words,
    pos=['NOUN', 'ADJ'],
)

## Similarity matrices
### Euclidean similarity

In [149]:
# Euclidean similarity matrix of the filtered words of model 1.
model1_euclidean_similarity_matrix = get_similarity_matrix(
    model1_language_model_filter.select_pos_embeddings(),
    model1_language_model_filter.extract_pos(),
    metric=euclidean_similarity,
)
model1_euclidean_similarity_matrix

Unnamed: 0,social,measure,covid19,support,government,company,employees,measures,period,companies,...,outlook,revisit,skille,consutle,pedagogue,appearance,walloniebruxelle,invest.brussel,deadlines.flander,colive
social,1.000000,0.045414,0.036553,0.040688,0.038418,0.038948,0.041439,0.042446,0.041354,0.041731,...,0.049398,0.049399,0.049402,0.049401,0.049373,0.049397,0.049346,0.049368,0.049364,0.049391
measure,0.045414,1.000000,0.040072,0.052514,0.045136,0.046630,0.046503,0.097676,0.053653,0.053452,...,0.060099,0.060079,0.060139,0.060107,0.060043,0.060178,0.060070,0.060048,0.060149,0.060187
covid19,0.036553,0.040072,1.000000,0.040927,0.038312,0.044664,0.043341,0.043267,0.045235,0.046020,...,0.050691,0.050640,0.050615,0.050556,0.050566,0.050684,0.050557,0.050609,0.050561,0.050696
support,0.040688,0.052514,0.040927,1.000000,0.044290,0.045513,0.048725,0.048246,0.048973,0.053119,...,0.062091,0.062080,0.062247,0.062099,0.062089,0.062111,0.062157,0.062105,0.062092,0.062148
government,0.038418,0.045136,0.038312,0.044290,1.000000,0.041164,0.041489,0.045369,0.043063,0.043451,...,0.050292,0.050257,0.050279,0.050318,0.050251,0.050361,0.050291,0.050259,0.050297,0.050333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
appearance,0.049397,0.060178,0.050684,0.062111,0.050361,0.068271,0.066202,0.057942,0.073440,0.073691,...,0.924269,0.882282,0.881245,0.881813,0.895364,1.000000,0.875456,0.874494,0.873430,0.920749
walloniebruxelle,0.049346,0.060070,0.050557,0.062157,0.050291,0.068264,0.066135,0.057774,0.073270,0.073689,...,0.871843,0.866184,0.878068,0.867170,0.895470,0.875456,1.000000,0.876839,0.846993,0.875342
invest.brussel,0.049368,0.060048,0.050609,0.062105,0.050259,0.068202,0.066131,0.057793,0.073115,0.073582,...,0.884120,0.878707,0.885699,0.878762,0.896787,0.874494,0.876839,1.000000,0.878381,0.890829
deadlines.flander,0.049364,0.060149,0.050561,0.062092,0.050297,0.068185,0.066235,0.057867,0.073352,0.073497,...,0.880254,0.878745,0.889340,0.879620,0.896835,0.873430,0.846993,0.878381,1.000000,0.890266


In [141]:
# Euclidean similarity matrix of the filtered words of model 2.
model2_euclidean_similarity_matrix = get_similarity_matrix(
    model2_language_model_filter.select_pos_embeddings(),
    model2_language_model_filter.extract_pos(),
    metric=euclidean_similarity,
)
model2_euclidean_similarity_matrix

Unnamed: 0,european,council,regulation,commission,parliament,decision,committee,c,case,document,...,cheese,donation,supplier,torvald,salamandrivoran,salamander,subsidise,affair,recomendation,boto
european,1.000000,0.093699,0.068245,0.078582,0.086148,0.066943,0.080274,0.044170,0.039734,0.065066,...,0.071885,0.072017,0.072025,0.072039,0.072028,0.072078,0.072084,0.072049,0.071993,0.071968
council,0.093699,1.000000,0.081587,0.090028,0.081496,0.071549,0.070198,0.045082,0.038972,0.067715,...,0.071001,0.071216,0.071209,0.071137,0.071118,0.071174,0.071169,0.071119,0.071109,0.071046
regulation,0.068245,0.081587,1.000000,0.066500,0.079297,0.082912,0.053828,0.043661,0.036714,0.056090,...,0.063278,0.063404,0.063314,0.063299,0.063280,0.063349,0.063462,0.063170,0.063431,0.063254
commission,0.078582,0.090028,0.066500,1.000000,0.079339,0.075965,0.065807,0.048335,0.044915,0.088855,...,0.086595,0.086783,0.087016,0.086822,0.086993,0.086641,0.086885,0.086618,0.086582,0.086663
parliament,0.086148,0.081496,0.079297,0.079339,1.000000,0.069556,0.066409,0.042552,0.037238,0.063679,...,0.064697,0.064816,0.064794,0.064789,0.064769,0.064794,0.064865,0.064751,0.064762,0.064709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
salamander,0.072078,0.071174,0.063349,0.086641,0.064794,0.077444,0.066074,0.058359,0.055883,0.065174,...,0.870812,0.872041,0.821891,0.883616,0.879646,1.000000,0.873806,0.891242,0.860991,0.906744
subsidise,0.072084,0.071169,0.063462,0.086885,0.064865,0.077704,0.066059,0.058471,0.056081,0.065220,...,0.868185,0.863674,0.843144,0.893469,0.861726,0.873806,1.000000,0.832433,0.898354,0.885257
affair,0.072049,0.071119,0.063170,0.086618,0.064751,0.077368,0.066264,0.058305,0.055915,0.065191,...,0.847245,0.840011,0.810927,0.868540,0.857422,0.891242,0.832433,1.000000,0.828108,0.884529
recomendation,0.071993,0.071109,0.063431,0.086582,0.064762,0.077764,0.066060,0.058568,0.056114,0.065169,...,0.865549,0.881332,0.849117,0.895851,0.855870,0.860991,0.898354,0.828108,1.000000,0.880963


In [152]:
%%time
# Euclidean similarity matrix of the filtered words of model 3.
model3_euclidean_similarity_matrix = get_similarity_matrix(
    model3_language_model_filter.select_pos_embeddings(),
    model3_language_model_filter.extract_pos(),
    metric=euclidean_similarity,
)
model3_euclidean_similarity_matrix

CPU times: user 20min 7s, sys: 21.2 s, total: 20min 28s
Wall time: 20min 8s


Unnamed: 0,social,covid19,measure,european,support,council,government,regulation,company,commission,...,approvedamounte,munition,everrise,advertiser,cutoff,automatically.brussel,questionsanswer,handler,airplane,boto
social,1.000000,0.034061,0.041251,0.032243,0.036040,0.031143,0.035644,0.031709,0.036170,0.033705,...,0.044528,0.044570,0.044544,0.044537,0.044542,0.044557,0.044599,0.044513,0.044465,0.044513
covid19,0.034061,1.000000,0.039526,0.035435,0.038814,0.032450,0.037448,0.035612,0.042400,0.037817,...,0.048558,0.048575,0.048616,0.048617,0.048554,0.048546,0.048576,0.048555,0.048522,0.048621
measure,0.041251,0.039526,1.000000,0.034286,0.047579,0.033643,0.043368,0.036559,0.044555,0.035780,...,0.053996,0.053951,0.054048,0.053953,0.053963,0.053975,0.054033,0.053869,0.053871,0.054020
european,0.032243,0.035435,0.034286,1.000000,0.038121,0.041426,0.033996,0.037270,0.036386,0.038194,...,0.046347,0.046391,0.046358,0.046339,0.046385,0.046360,0.046395,0.046423,0.046347,0.046383
support,0.036040,0.038814,0.047579,0.038121,1.000000,0.034895,0.040708,0.036478,0.042896,0.038022,...,0.054903,0.054815,0.054878,0.054766,0.054860,0.054936,0.054828,0.054767,0.054805,0.054913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
automatically.brussel,0.044557,0.048546,0.053975,0.046360,0.054936,0.041089,0.047473,0.044783,0.062558,0.052059,...,0.829451,0.831950,0.837391,0.822218,0.885484,1.000000,0.811928,0.822979,0.835793,0.851533
questionsanswer,0.044599,0.048576,0.054033,0.046395,0.054828,0.041110,0.047512,0.044868,0.062526,0.052139,...,0.823674,0.911257,0.862432,0.871493,0.837361,0.811928,1.000000,0.852018,0.828552,0.859508
handler,0.044513,0.048555,0.053869,0.046423,0.054767,0.041069,0.047483,0.044796,0.062559,0.052060,...,0.841297,0.870104,0.866427,0.876984,0.839930,0.822979,0.852018,1.000000,0.873703,0.858525
airplane,0.044465,0.048522,0.053871,0.046347,0.054805,0.041046,0.047435,0.044819,0.062522,0.052022,...,0.851443,0.857082,0.863481,0.874141,0.842735,0.835793,0.828552,0.873703,1.000000,0.840136


### Cosine similarity

In [153]:
%%time
# Cosine similarity matrix of the filtered words of model 1.
model1_cosine_similarity_matrix = get_similarity_matrix(
    model1_language_model_filter.select_pos_embeddings(),
    model1_language_model_filter.extract_pos(),
    metric=cosine_similarity,
)
model1_cosine_similarity_matrix

CPU times: user 18min 18s, sys: 2.3 s, total: 18min 21s
Wall time: 18min 20s


Unnamed: 0,social,measure,covid19,support,government,company,employees,measures,period,companies,...,outlook,revisit,skille,consutle,pedagogue,appearance,walloniebruxelle,invest.brussel,deadlines.flander,colive
social,1.000000,0.288134,0.039221,0.074017,0.138933,-0.097899,0.063995,0.203206,-0.014286,0.004020,...,0.128852,0.140055,0.126438,0.140507,0.017404,0.110663,-0.069746,-0.003046,-0.019748,0.092014
measure,0.288134,1.000000,0.039240,0.311611,0.260343,0.030873,0.052726,0.833483,0.235665,0.227267,...,0.007057,-0.054723,0.109887,0.032083,-0.170082,0.213768,-0.052065,-0.138582,0.145609,0.258185
covid19,0.039221,0.039240,1.000000,0.055240,0.111384,0.158719,0.121158,0.210883,0.140071,0.171517,...,0.462688,0.258696,0.117556,-0.111088,-0.075086,0.375900,-0.072236,0.107480,-0.082800,0.455562
support,0.074017,0.311611,0.055240,1.000000,0.209045,-0.060666,0.108145,0.212650,0.028025,0.181148,...,0.003051,-0.030678,0.373923,0.024031,-0.002734,0.052478,0.132464,0.041190,0.004834,0.155049
government,0.138933,0.260343,0.111384,0.209045,1.000000,0.001508,0.041477,0.291842,0.047306,0.064329,...,0.047498,-0.114009,-0.009228,0.156411,-0.139077,0.299020,0.030185,-0.093501,0.062012,0.210918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
appearance,0.110663,0.213768,0.375900,0.052478,0.299020,0.123938,0.046329,0.401192,0.178841,0.181617,...,0.663420,0.043696,0.207283,0.095323,0.274622,1.000000,0.312793,-0.002725,0.019845,0.643453
walloniebruxelle,-0.069746,-0.052065,-0.072236,0.132464,0.030185,0.087726,-0.077218,-0.056308,-0.092892,0.141905,...,0.190948,0.063648,0.357350,0.124941,0.508592,0.312793,1.000000,0.291434,-0.165060,0.272772
invest.brussel,-0.003046,-0.138582,0.107480,0.041190,-0.093501,-0.018812,-0.118479,-0.023556,-0.430829,-0.010435,...,0.031418,-0.136019,0.211813,-0.055908,0.216506,-0.002725,0.291434,1.000000,0.022530,0.202819
deadlines.flander,-0.019748,0.145609,-0.082800,0.004834,0.062012,-0.053273,0.121225,0.197773,0.025927,-0.163920,...,0.006035,-0.078665,0.296176,0.009576,0.258038,0.019845,-0.165060,0.022530,1.000000,0.229816


In [154]:
%%time
# Cosine similarity matrix of the filtered words of model 2.
model2_cosine_similarity_matrix = get_similarity_matrix(
    model2_language_model_filter.select_pos_embeddings(),
    model2_language_model_filter.extract_pos(),
    metric=cosine_similarity,
)
model2_cosine_similarity_matrix

CPU times: user 2min 23s, sys: 273 ms, total: 2min 23s
Wall time: 2min 23s


Unnamed: 0,european,council,regulation,commission,parliament,decision,committee,c,case,document,...,cheese,donation,supplier,torvald,salamandrivoran,salamander,subsidise,affair,recomendation,boto
european,1.000000,0.721793,0.519890,0.513305,0.703763,0.368377,0.644128,-0.101847,-0.307602,0.446838,...,-0.262311,-0.027714,-0.007310,0.007956,-0.009048,0.096228,0.085406,0.022897,-0.068913,-0.170900
council,0.721793,1.000000,0.679218,0.651171,0.667754,0.461146,0.527958,-0.043628,-0.348359,0.497900,...,-0.318868,0.064782,0.039717,-0.080088,-0.094919,-0.010855,-0.015348,-0.089455,-0.119494,-0.347730
regulation,0.519890,0.679218,1.000000,0.424612,0.684206,0.675476,0.261896,-0.002797,-0.372161,0.332257,...,-0.217946,0.064109,-0.085868,-0.185503,-0.188472,-0.075636,0.193103,-0.395140,0.122972,-0.397313
commission,0.513305,0.651171,0.424612,1.000000,0.606831,0.415231,0.366933,-0.049948,-0.160739,0.699789,...,-0.249152,-0.018638,0.174843,0.026984,0.207064,-0.249446,0.100355,-0.186217,-0.255388,-0.244516
parliament,0.703763,0.667754,0.684206,0.606831,1.000000,0.496947,0.516035,-0.081712,-0.361064,0.477553,...,-0.240908,0.018232,-0.016774,-0.045310,-0.073097,-0.040038,0.120931,-0.102943,-0.095874,-0.312954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
salamander,0.096228,-0.010855,-0.075636,-0.249446,-0.040038,-0.418216,0.112679,-0.112797,-0.329545,-0.053768,...,-0.133096,-0.008122,-0.402202,0.017078,0.187428,1.000000,-0.039335,0.412278,-0.289802,0.198891
subsidise,0.085406,-0.015348,0.193103,0.100355,0.120931,0.065917,0.056236,0.207329,0.311794,0.055868,...,0.070483,0.066019,0.161404,0.382829,0.081333,-0.039335,1.000000,-0.384266,0.501774,0.098403
affair,0.022897,-0.089455,-0.395140,-0.186217,-0.102943,-0.374214,0.412072,-0.194662,-0.137489,-0.002264,...,-0.132596,-0.185482,-0.225167,0.152667,0.134940,0.412278,-0.384266,1.000000,-0.457239,0.274568
recomendation,-0.068913,-0.119494,0.122972,-0.255388,-0.095874,0.152697,0.057575,0.452812,0.399243,-0.049620,...,0.038348,0.327888,0.246783,0.421507,-0.001800,-0.289802,0.501774,-0.457239,1.000000,0.030047


In [155]:
%%time
# Cosine similarity matrix of the filtered words of model 3.
model3_cosine_similarity_matrix = get_similarity_matrix(
    model3_language_model_filter.select_pos_embeddings(),
    model3_language_model_filter.extract_pos(),
    metric=cosine_similarity,
)
model3_cosine_similarity_matrix

CPU times: user 25min 3s, sys: 3.14 s, total: 25min 6s
Wall time: 25min 5s


Unnamed: 0,social,covid19,measure,european,support,council,government,regulation,company,commission,...,approvedamounte,munition,everrise,advertiser,cutoff,automatically.brussel,questionsanswer,handler,airplane,boto
social,1.000000,0.047261,0.302269,-0.019376,0.056585,0.038042,0.151979,-0.019496,-0.038914,-0.037715,...,-0.039634,0.127300,0.026371,-0.012672,0.011492,0.057163,0.188196,-0.100938,-0.369806,-0.082933
covid19,0.047261,1.000000,0.146516,0.082327,0.100074,0.044251,0.159929,0.125610,0.167732,0.096154,...,-0.024665,0.030856,0.209850,0.213827,-0.030014,-0.051312,0.026658,-0.036327,-0.189783,0.151625
measure,0.302269,0.146516,1.000000,-0.086456,0.337076,0.034257,0.317543,0.090096,0.137636,-0.135527,...,0.087248,-0.028011,0.297404,-0.030079,0.006785,0.031503,0.153136,-0.245271,-0.308768,0.133775
european,-0.019376,0.082327,-0.086456,1.000000,0.118375,0.451246,0.022791,0.240230,-0.085735,0.162220,...,0.038040,0.212847,0.105400,0.015265,0.146220,0.071790,0.171603,0.315906,0.050668,0.144458
support,0.056585,0.100074,0.337076,0.118375,1.000000,0.096740,0.208751,0.073296,0.046556,-0.017247,...,0.157114,-0.062734,0.132723,-0.244210,0.048411,0.204179,-0.018281,-0.182164,-0.110551,0.163857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
automatically.brussel,0.057163,-0.051312,0.031503,0.071790,0.204179,0.154683,0.013049,-0.065895,0.068018,0.122448,...,0.050595,-0.010092,-0.069824,-0.354710,0.673157,1.000000,-0.029381,-0.096897,-0.090031,0.381919
questionsanswer,0.188196,0.026658,0.153136,0.171603,-0.018281,0.229237,0.118948,0.193595,0.017860,0.300632,...,-0.006560,0.803923,0.340550,0.454249,0.277353,-0.029381,1.000000,0.311355,-0.185365,0.468453
handler,-0.100938,-0.036327,-0.245271,0.315906,-0.182164,0.099855,0.047742,-0.031953,0.083851,0.152472,...,0.002418,0.294676,0.111859,0.270040,0.145424,-0.096897,0.311355,1.000000,0.229185,0.331484
airplane,-0.369806,-0.189783,-0.308768,0.050668,-0.110551,-0.005845,-0.150000,0.068081,0.012641,0.055260,...,-0.048040,-0.113657,-0.244392,-0.033106,0.029493,-0.090031,-0.185365,0.229185,1.000000,-0.078955


### Manhattan similarity

In [156]:
%%time
# Manhattan similarity matrix of the filtered words of model 1.
model1_manhattan_similarity_matrix = get_similarity_matrix(
    model1_language_model_filter.select_pos_embeddings(),
    model1_language_model_filter.extract_pos(),
    metric=manhattan_similarity,
)
model1_manhattan_similarity_matrix

CPU times: user 14min 59s, sys: 732 ms, total: 14min 59s
Wall time: 15min


Unnamed: 0,social,measure,covid19,support,government,company,employees,measures,period,companies,...,outlook,revisit,skille,consutle,pedagogue,appearance,walloniebruxelle,invest.brussel,deadlines.flander,colive
social,1.000000,0.003328,0.002649,0.003028,0.002814,0.002918,0.003114,0.003108,0.003132,0.003125,...,0.003723,0.003724,0.003724,0.003725,0.003722,0.003723,0.003720,0.003721,0.003720,0.003722
measure,0.003328,1.000000,0.002925,0.003912,0.003491,0.003518,0.003548,0.007710,0.004097,0.004145,...,0.004635,0.004632,0.004639,0.004638,0.004631,0.004643,0.004632,0.004632,0.004641,0.004643
covid19,0.002649,0.002925,1.000000,0.003028,0.002879,0.003302,0.003164,0.003261,0.003319,0.003421,...,0.003673,0.003668,0.003666,0.003662,0.003662,0.003672,0.003661,0.003666,0.003661,0.003673
support,0.003028,0.003912,0.003028,1.000000,0.003279,0.003469,0.003636,0.003560,0.003634,0.003983,...,0.004703,0.004703,0.004717,0.004703,0.004704,0.004705,0.004709,0.004705,0.004704,0.004707
government,0.002814,0.003491,0.002879,0.003279,1.000000,0.003163,0.003127,0.003514,0.003157,0.003260,...,0.003775,0.003772,0.003773,0.003776,0.003772,0.003779,0.003774,0.003772,0.003776,0.003777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
appearance,0.003723,0.004643,0.003672,0.004705,0.003779,0.005341,0.005090,0.004358,0.005764,0.005830,...,0.467610,0.351231,0.351070,0.354458,0.382433,1.000000,0.328692,0.338967,0.335299,0.464067
walloniebruxelle,0.003720,0.004632,0.003661,0.004709,0.003774,0.005340,0.005086,0.004348,0.005747,0.005830,...,0.325016,0.318744,0.340388,0.314917,0.381068,0.328692,1.000000,0.340315,0.280700,0.337550
invest.brussel,0.003721,0.004632,0.003666,0.004705,0.003772,0.005336,0.005089,0.004346,0.005733,0.005826,...,0.356213,0.343195,0.356779,0.342668,0.381393,0.338967,0.340315,1.000000,0.343618,0.377392
deadlines.flander,0.003720,0.004641,0.003661,0.004704,0.003776,0.005332,0.005096,0.004354,0.005756,0.005818,...,0.352887,0.346610,0.364487,0.347214,0.382428,0.335299,0.280700,0.343618,1.000000,0.374029


In [157]:
# Manhattan similarity matrix of the filtered words of model 2.
model2_manhattan_similarity_matrix = get_similarity_matrix(
    model2_language_model_filter.select_pos_embeddings(),
    model2_language_model_filter.extract_pos(),
    metric=manhattan_similarity,
)
model2_manhattan_similarity_matrix

Unnamed: 0,european,council,regulation,commission,parliament,decision,committee,c,case,document,...,cheese,donation,supplier,torvald,salamandrivoran,salamander,subsidise,affair,recomendation,boto
european,1.000000,0.007463,0.005203,0.006098,0.006612,0.005198,0.006044,0.003325,0.002928,0.004877,...,0.005410,0.005421,0.005425,0.005423,0.005422,0.005425,0.005425,0.005422,0.005420,0.005416
council,0.007463,1.000000,0.006318,0.007261,0.006287,0.005532,0.005414,0.003377,0.002867,0.005150,...,0.005398,0.005415,0.005417,0.005406,0.005405,0.005409,0.005407,0.005405,0.005404,0.005396
regulation,0.005203,0.006318,1.000000,0.005080,0.006090,0.006350,0.004019,0.003243,0.002770,0.004306,...,0.004924,0.004933,0.004929,0.004927,0.004923,0.004928,0.004939,0.004914,0.004939,0.004923
commission,0.006098,0.007261,0.005080,1.000000,0.006079,0.005842,0.005005,0.003653,0.003305,0.007021,...,0.006697,0.006706,0.006734,0.006711,0.006723,0.006697,0.006716,0.006699,0.006688,0.006698
parliament,0.006612,0.006287,0.006090,0.006079,1.000000,0.005253,0.005085,0.003172,0.002775,0.005008,...,0.004910,0.004919,0.004921,0.004919,0.004914,0.004915,0.004926,0.004912,0.004917,0.004911
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
salamander,0.005425,0.005409,0.004928,0.006697,0.004915,0.005950,0.005151,0.004396,0.004337,0.005087,...,0.323153,0.329601,0.251994,0.355563,0.345875,1.000000,0.339462,0.370658,0.310972,0.416754
subsidise,0.005425,0.005407,0.004939,0.006716,0.004926,0.005968,0.005151,0.004404,0.004353,0.005091,...,0.325394,0.317107,0.278637,0.376983,0.312028,0.339462,1.000000,0.262862,0.394103,0.355865
affair,0.005422,0.005405,0.004914,0.006699,0.004912,0.005944,0.005166,0.004392,0.004338,0.005085,...,0.281145,0.274125,0.232987,0.319620,0.308289,0.370658,0.262862,1.000000,0.256272,0.354954
recomendation,0.005420,0.005404,0.004939,0.006688,0.004917,0.005974,0.005149,0.004414,0.004357,0.005087,...,0.316515,0.344127,0.287004,0.378579,0.299337,0.310972,0.394103,0.256272,1.000000,0.351290


In [158]:
%%time
model3_manhattan_similarity_matrix = get_similarity_matrix(model3_language_model_filter.select_pos_embeddings(),
                                                           model3_language_model_filter.extract_pos(),
                                                           metric=manhattan_similarity)
model3_manhattan_similarity_matrix

CPU times: user 20min 7s, sys: 2 s, total: 20min 9s
Wall time: 20min 10s


Unnamed: 0,social,covid19,measure,european,support,council,government,regulation,company,commission,...,approvedamounte,munition,everrise,advertiser,cutoff,automatically.brussel,questionsanswer,handler,airplane,boto
social,1.000000,0.002455,0.003110,0.002329,0.002700,0.002292,0.002687,0.002324,0.002654,0.002498,...,0.003254,0.003257,0.003256,0.003255,0.003256,0.003256,0.003259,0.003252,0.003249,0.003252
covid19,0.002455,1.000000,0.002985,0.002661,0.002889,0.002369,0.002833,0.002553,0.003190,0.002777,...,0.003633,0.003633,0.003637,0.003637,0.003632,0.003632,0.003632,0.003633,0.003630,0.003636
measure,0.003110,0.002985,1.000000,0.002423,0.003456,0.002488,0.003232,0.002720,0.003481,0.002686,...,0.004162,0.004159,0.004167,0.004159,0.004158,0.004158,0.004165,0.004152,0.004150,0.004163
european,0.002329,0.002661,0.002423,1.000000,0.002813,0.003102,0.002542,0.002835,0.002676,0.002904,...,0.003464,0.003466,0.003465,0.003463,0.003467,0.003465,0.003466,0.003469,0.003464,0.003466
support,0.002700,0.002889,0.003456,0.002813,1.000000,0.002611,0.003047,0.002721,0.003246,0.002821,...,0.004172,0.004166,0.004171,0.004162,0.004170,0.004176,0.004167,0.004165,0.004166,0.004175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
automatically.brussel,0.003256,0.003632,0.004158,0.003465,0.004176,0.002995,0.003694,0.003342,0.004971,0.003877,...,0.257617,0.269354,0.268863,0.249115,0.356571,1.000000,0.240243,0.255405,0.264911,0.302309
questionsanswer,0.003259,0.003632,0.004165,0.003466,0.004167,0.002999,0.003694,0.003347,0.004967,0.003885,...,0.253718,0.427712,0.311689,0.334277,0.269159,0.240243,1.000000,0.296294,0.257009,0.301854
handler,0.003252,0.003633,0.004152,0.003469,0.004165,0.002993,0.003693,0.003342,0.004973,0.003878,...,0.278211,0.326556,0.316071,0.338031,0.275093,0.255405,0.296294,1.000000,0.329701,0.304744
airplane,0.003249,0.003630,0.004150,0.003464,0.004166,0.002992,0.003690,0.003345,0.004968,0.003875,...,0.296467,0.297905,0.312211,0.330538,0.274980,0.264911,0.257009,0.329701,1.000000,0.271918


### Select key words as clusters to visualize the graph similarity between these words

In [186]:
# Key-word similarity graphs for the first model, built from its cosine
# similarity matrix. Per the cell output, HTML files are written under
# docs/word-similarity-web/model1_graphs/ (named after the key words).
model1_d3graph = create_graph_for_language_model_key_words(
    model1_cosine_similarity_matrix,
    model1_language_model_filter.select_key_words(),
    model_number=1,
)


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/agreement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/companies.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/workers.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/measures.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/temporary.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/social.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/covid19.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/public.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/national.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/statement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/health.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/minister.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/new.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/coronavirus.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/vaccine.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/commission.html


In [184]:
# Key-word similarity graphs for the second model, built from its cosine
# similarity matrix. Per the cell output, HTML files are written under
# docs/word-similarity-web/model2_graphs/ (named after the key words).
model2_d3graph = create_graph_for_language_model_key_words(
    model2_cosine_similarity_matrix,
    model2_language_model_filter.select_key_words(),
    model_number=2,
)


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/agreement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/companies.html




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/workers.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/measures.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/temporary.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/social.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/covid19.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/public.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/national.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/statement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/health.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/minister.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/new.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/coronavirus.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/vaccine.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/support.html




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/commission.html


In [185]:
# Key-word similarity graphs for the third model, built from its cosine
# similarity matrix. Per the cell output, HTML files are written under
# docs/word-similarity-web/model3_graphs/ (named after the key words).
model3_d3graph = create_graph_for_language_model_key_words(
    model3_cosine_similarity_matrix,
    model3_language_model_filter.select_key_words(),
    model_number=3,
)

[d3graph] >Creating directory [docs/word-similarity-web/model3_graphs/]
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/agreement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/companies.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is tryin

Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/workers.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/measures.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/temporary.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/social.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/covid19.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/public.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/national.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/statement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/health.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/minister.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/new.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/coronavirus.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/vaccine.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/commission.html


### Steps for word embedding visualization:
* detect and extract key words and most relevant words
* train a t-SNE model
* create a dataframe with the clusters, their words and their placement on the graph

In [187]:
# Step 1: for each model, select the key words together with their embedding
# clusters from the trained word2vec vectors.
# NOTE(review): element [0] of each result is fed to t-SNE and element [1] to
# the cluster dataframe below -- presumably (embeddings, words); confirm
# against select_words_and_embedding_clusters.
model1_word_embeddings = select_words_and_embedding_clusters(
    model1_language_model_pipeline.word2vec.wv,
    model1_language_model_filter.select_key_words(),
)
model2_word_embeddings = select_words_and_embedding_clusters(
    model2_language_model_pipeline.word2vec.wv,
    model2_language_model_filter.select_key_words(),
)
model3_word_embeddings = select_words_and_embedding_clusters(
    model3_language_model_pipeline.word2vec.wv,
    model3_language_model_filter.select_key_words(),
)

# Step 2: train one t-SNE model per language model on the first element of
# the selection result.
model1_tsne_model = create_tsne_model(model1_word_embeddings[0])
model2_tsne_model = create_tsne_model(model2_word_embeddings[0])
model3_tsne_model = create_tsne_model(model3_word_embeddings[0])

# Step 3: build, per model, the dataframe with the clusters, their words and
# their placement on the graph (it is plotted in the cells that follow).
model1_word_embeddings_dataframe = create_word_clusters_matrix(
    model1_language_model_filter.select_key_words(),
    model1_word_embeddings[1],
    model1_tsne_model,
)
model2_word_embeddings_dataframe = create_word_clusters_matrix(
    model2_language_model_filter.select_key_words(),
    model2_word_embeddings[1],
    model2_tsne_model,
)
model3_word_embeddings_dataframe = create_word_clusters_matrix(
    model3_language_model_filter.select_key_words(),
    model3_word_embeddings[1],
    model3_tsne_model,
)

### Graph visualization for the first model

In [188]:
# Scatter plot of the first model's word clusters: points are placed by the
# 'X'/'Y' columns, coloured by the word_cluster column, and show the "word"
# column on hover.
model1_word_cluster_plot = px.scatter(
    model1_word_embeddings_dataframe,
    x='X',
    y='Y',
    color=model1_word_embeddings_dataframe['word_cluster'],
    labels={'color': 'word'},
    hover_data=["word"],
)
# Trailing expression so the notebook renders the figure.
model1_word_cluster_plot


### Graph visualization for the second model

In [189]:
# Scatter plot of the second model's word clusters: points are placed by the
# 'X'/'Y' columns, coloured by the word_cluster column, and show the "word"
# column on hover.
model2_word_cluster_plot = px.scatter(
    model2_word_embeddings_dataframe,
    x='X',
    y='Y',
    color=model2_word_embeddings_dataframe['word_cluster'],
    labels={'color': 'word'},
    hover_data=["word"],
)
# Trailing expression so the notebook renders the figure.
model2_word_cluster_plot


### Graph visualization for the third model

In [190]:
# Scatter plot of the third model's word clusters: points are placed by the
# 'X'/'Y' columns, coloured by the word_cluster column, and show the "word"
# column on hover.
model3_word_cluster_plot = px.scatter(
    model3_word_embeddings_dataframe,
    x='X',
    y='Y',
    color=model3_word_embeddings_dataframe['word_cluster'],
    labels={'color': 'word'},
    hover_data=["word"],
)
# Trailing expression so the notebook renders the figure.
model3_word_cluster_plot