## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract NOUN and NOUN PHRASES from each text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [96]:
import sys
sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))

import os
os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from typing import List, Tuple

import spacy
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 1500000

import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from d3graph import d3graph
import plotly.express as px

from sem_covid.services.data_registry import Dataset
from sem_covid.adapters.data_source import IndexTabularDataSource

from sem_covid.entrypoints.notebooks.topic_modeling.topic_modeling_wrangling.token_management import select_pos

from sem_covid.services.sc_wrangling.data_cleaning import (clean_text_from_specific_characters, clean_fix_unicode,
                                                           clean_remove_currency_symbols, clean_remove_emails,
                                                           clean_remove_urls, clean_remove_stopwords)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.similarity_calculus import (
    euclidean_similarity, manhattan_similarity, cosine_similarity, get_similarity_matrix)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.document_handling_tools import(
    document_atomization_noun_phrases, lemmatize_document)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.word_embeddings_handler import (
    select_words_and_embedding_clusters, create_tsne_model, create_word_clusters_matrix)

## Define constants

In [97]:
PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'use_of_measure_description', 'involvement_of_social_partners_description']

DEFAULT_TEXTUAL_COLUMN = ['title']
WINDOW = 5
MIN_COUNT = 1
VECTOR_SIZE = 300
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

KEY_WORDS = ['work', 'agreement', 'working', 'companies', 'workers',
             'measures', 'temporary', 'social', 'support', 'covid19',
             '2020', 'public', 'national', 'ireland', 'statement', '2021',
             'announce', 'health', 'minister', 'new', 'billion', 'coronavirus',
             'vaccine', 'eur', 'support', 'million', 'commission', 'eu']

## Data preprocessing
- data cleanup
- turn corpus into spacy document

In [98]:
def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    This function receives the document and leads through cleaning steps
    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    unused_characters = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                         "[", "]", "-", "_", "\r"]

    new_document_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=unused_characters)
    new_document_corpus = new_document_corpus.apply(clean_fix_unicode)
    new_document_corpus = new_document_corpus.apply(clean_remove_urls)
    new_document_corpus = new_document_corpus.apply(clean_remove_emails)
    new_document_corpus = new_document_corpus.apply(clean_remove_currency_symbols)
    new_document_corpus = new_document_corpus.apply(clean_remove_stopwords)

    return new_document_corpus


def generate_graph(similarity_matrix: pd.DataFrame, graph: nx.Graph, root_word: str,
                   top_words: int, threshold:np.float64 = 0.8, deep_level: int = 0,
                   max_deep_level: int = 2, deep_map: dict = None, color_map: dict = None) -> nx.Graph:
    if root_word not in deep_map.keys():
        deep_map[root_word] = (deep_level, color_map[deep_level])
    elif deep_map[root_word][0] > deep_level:
        deep_map[root_word] = (deep_level, color_map[deep_level])
    if deep_level > max_deep_level:
        return graph
    new_nodes = similarity_matrix[root_word].sort_values(ascending=False)[:top_words].index.to_list()
    new_nodes_weight = list(similarity_matrix[root_word].sort_values(ascending=False)[:top_words].values)
    for index in range(0, len(new_nodes)):
        if new_nodes_weight[index] >= threshold:
            graph.add_edge(root_word, new_nodes[index])
            generate_graph(similarity_matrix, graph, new_nodes[index], top_words, threshold, deep_level+1, max_deep_level,
                           deep_map=deep_map, color_map=color_map)

    return graph


def create_graph_for_language_model_key_words(similarity_matrix: pd.DataFrame, language_model_words: list,
                                              model_name: str) -> d3graph:
    """
    !!! This is not reusable function. It was made for a single thing !!!

    It generates d3graph based on language model selected words and and the similarity
    matrix created with those words.
    """
    graph_folder_path = f'docs/word-similarity-web/{model_name}_graphs/'
    color_map = {0: '#377eb8',
                 1: '#ffffff',
                 2: '#000000',
                 3: '#000333'}
    for index in range(0, len(language_model_words)):
        deep_map = {}
        graph = generate_graph(similarity_matrix, nx.Graph(), language_model_words[index],
                               top_words=4, threshold=0.3
                               , max_deep_level=2, deep_map=deep_map, color_map=color_map)
        network_adjacency_matrix = pd.DataFrame(data=nx.adjacency_matrix(graph).todense(),
                                                index=graph.nodes(), columns=graph.nodes())
        node_color_list = [deep_map[node][0] for node in graph.nodes()]
        d3graph(network_adjacency_matrix, savepath=graph_folder_path, savename=language_model_words[index],
                node_color=node_color_list,
                width=1920, height=1080, edge_width=5,
                edge_distance=60, directed=True)
        

In [104]:
class LanguageModelPipeline:
    """
        This pipeline executes the steps for word2vec language training.
    """

    def __init__(self, dataset_sources: List[Tuple[IndexTabularDataSource, List[str]]]):
        """
            :param dataset_sources: represents the source of the datasets.
        """
        self.dataset_sources = dataset_sources
        self.documents_corpus = pd.Series()
        self.word2vec = None

    def download_datasets(self):
        """
            In this step it will download the dataset and detect selected columns.
            It can be downloaded as many datasets as there are in data source.
        """
        self.dataset_sources = [(dataset_columns, dataset_source.fetch())
                                for dataset_source, dataset_columns in self.dataset_sources]

    def extract_textual_data(self):
        """
            After downloading the datasets, the textual data will be found and and concatenated
            with executing of several steps as well. It will fill the NaN values with empty space,
            add a dot at the end of each concatenated column and reset the index.
        """
        self.documents_corpus = pd.concat([dataset[columns]
                                          .fillna(value="")
                                          .agg('. '.join, axis=1)
                                          .reset_index(drop=True)
                                           for columns, dataset in self.dataset_sources
                                           ], ignore_index=True)

    def clean_textual_data(self):
        """
            The next step is data cleaning. In this step the function "apply_cleaning_functions"
            applies the following actions:
                - clean the document from specific characters
                - delete unicode
                - removes emails and URLs and currency symbols
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)

    def transform_to_spacy_doc(self):
        """
            When the document is clean, is going to be transform into spacy document
        """
        self.documents_corpus = self.documents_corpus.apply(nlp)

    def extract_features(self):
        """
            To extract the parts of speech, below it was defined classes for each token is necessary.
        """
        self.documents_corpus = pd.concat([self.documents_corpus,
                                           self.documents_corpus.apply(document_atomization_noun_phrases),
                                           self.documents_corpus.apply(lemmatize_document)]
                                           ,ignore_index=True)

        self.documents_corpus = self.documents_corpus.apply(lambda x: list(map(str, x)))

    def model_training(self):
        """
            When the data is prepared it's stored into Word2Vec model.
        """
        self.word2vec = Word2Vec(sentences=self.documents_corpus, window=WINDOW,
                                 min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

    def execute(self):
        """
            The final step is execution, where are stored each step and it will be executed in a row
        """
        self.download_datasets()
        self.extract_textual_data()
        self.clean_textual_data()
        self.transform_to_spacy_doc()
        self.extract_features()
        self.model_training()


class LanguageModelWordsFilter:
    def __init__(self, word2vec_model: Word2Vec, key_words: List[str], pos: List[str]) -> None:
        self.word2vec_model = word2vec_model
        self.key_words = key_words
        self.pos = pos
        self.word2vec_document = None
        self.word2vec_document = nlp(' '.join(self.word2vec_model.wv.index_to_key))
        self.word2vec_document = select_pos(self.word2vec_document, self.pos)
        self._extract_pos = list(map(str, self.word2vec_document))

    def extract_pos(self) -> List[str]:
        """
            transforms a word2vec indexes into spacy document and selects parts of
            speech. After that it puts into a list and converts those parts of speech
            into strings.
        """
        return self._extract_pos

    def select_key_words(self) -> List[str]:
        """
            Finds each word form inserted list of key words and returns a
            list with those words if there are presented in the list of
            extracted parts of speech.
        """
        return [word for word in self.key_words if word in self.extract_pos()]

    def select_pos_index(self) -> List[int]:
        """
            Detects the part of speech indexes and returns them into a list
        """
        return [self.word2vec_model.wv.index_to_key.index(token) for token in self.extract_pos()
                if token in self.word2vec_model.wv.index_to_key]

    def select_pos_embeddings(self) -> List[np.ndarray]:
        """
            Detects part of speech embeddings from their indexes
        """
        return [self.word2vec_model.wv.vectors[index] for index in self.select_pos_index()]

## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline

In [75]:
%%time

dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]
model1_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
model1_language_model_pipeline.execute()

CPU times: user 4min 19s, sys: 332 ms, total: 4min 19s
Wall time: 6min 38s


## Experiment Nr#2 language model based on:
- eu-cellar

In [64]:
%%time

dataset_sources_config = [
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
model2_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
model2_language_model_pipeline.execute()

CPU times: user 1min 23s, sys: 119 ms, total: 1min 23s
Wall time: 2min 4s


In [184]:
from spacy.tokens import Doc

a = model1_language_model_pipeline.word2vec.wv.index_to_key
d = Doc(nlp.vocab, words=a)
d1 = nlp(' '.join(a))
word2vec_document = select_pos(d, pos=['NOUN', 'ADJ'])

In [190]:
type(d1)

spacy.tokens.doc.Doc

In [191]:
type(d)

spacy.tokens.doc.Doc

In [180]:
extract_pos = list(map(str, word2vec_document))

c = [a.index(token) for token in extract_pos if token in a]
#
# [self.word2vec_model.wv.index_to_key.index(token) for token in self.extract_pos()
#                 if token in self.word2vec_model.wv.index_to_key]

In [182]:
len(extract_pos)

22814

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar

In [80]:
%%time

dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
model3_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
model3_language_model_pipeline.execute()

CPU times: user 6min 34s, sys: 1.03 s, total: 6min 35s
Wall time: 11min 32s


In [101]:
def execute_model_steps(model_name: str, dataset_sources_config: List[tuple]):
    model_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
    model_language_model_pipeline.execute()
    model_language_model_filter = LanguageModelWordsFilter(model_language_model_pipeline.word2vec,
                                                           KEY_WORDS, pos=['NOUN', 'ADJ'])

    similarity_functions = [cosine_similarity]
    for similarity_function in similarity_functions:
        print('Start computing similarity matrix.')
        model_similarity_matrix = get_similarity_matrix(model_language_model_filter.select_pos_embeddings(),
                                                        model_language_model_filter.extract_pos(),
                                                        metric=similarity_function)
        print('Finish computing similarity matrix.')
        print('Save similarity matrix.')
        model_similarity_matrix.to_csv(f'docs/similarity_matrices/{model_name}_{similarity_function.__name__}_matrix.csv', index=False)
        print('Create d3Graphs')
        create_graph_for_language_model_key_words(model_similarity_matrix,
                                                  model_language_model_filter.select_key_words(),
                                                  model_name=model_name)

In [199]:
%%time

model1_dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    #(Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    #(Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.06 µs


In [None]:
model1 = execute_model_steps('model1', model1_dataset_sources_config)


In [200]:
dataset_sources_config = model1_dataset_sources_config
model_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
model_language_model_pipeline.execute()

In [201]:
model_language_model_filter = LanguageModelWordsFilter(model_language_model_pipeline.word2vec,
                                                       KEY_WORDS, pos=['NOUN', 'ADJ'])

In [None]:
 def __init__(self, word2vec_model: Word2Vec, key_words: List[str], pos: List[str]) -> None:
        self.word2vec_model = word2vec_model
        self.key_words = key_words
        self.pos = pos
        self.word2vec_document = None
        self.word2vec_document = nlp(' '.join(self.word2vec_model.wv.index_to_key))
        self.word2vec_document = select_pos(self.word2vec_document, self.pos)
        self._extract_pos = list(map(str, self.word2vec_document))

    def extract_pos(self) -> List[str]:
        """
            transforms a word2vec indexes into spacy document and selects parts of
            speech. After that it puts into a list and converts those parts of speech
            into strings.
        """
        return self._extract_pos

    def select_key_words(self) -> List[str]:
        """
            Finds each word form inserted list of key words and returns a
            list with those words if there are presented in the list of
            extracted parts of speech.
        """
        return [word for word in self.key_words if word in self.extract_pos()]

    def select_pos_index(self) -> List[int]:
        """
            Detects the part of speech indexes and returns them into a list
        """
        return [self.word2vec_model.wv.index_to_key.index(token) for token in self.extract_pos()
                if token in self.word2vec_model.wv.index_to_key]

    def select_pos_embeddings(self) -> List[np.ndarray]:
        """
            Detects part of speech embeddings from their indexes
        """
        return [self.word2vec_model.wv.vectors[index] for index in self.select_pos_index()]

In [208]:
len(model_language_model_filter._extract_pos)

22597

In [210]:
word2vec_document = nlp(' '.join(model_language_model_pipeline.word2vec.wv.index_to_key))

In [211]:
word2vec_document



In [212]:
word2vec_document = select_pos(word2vec_document, ['NOUN', 'ADJ'])

203

In [215]:
[ token for token in word2vec_document
  if str(token) not in model_language_model_pipeline.word2vec.wv.index_to_key
  ]

[personal_protection_equipment,
 ministry_economic_affairs_digital_transformation_grant_guarantees,
 tanaiste_minister_enterprise,
 pure_social_measure,
 degree_involvement,
 epidemic_crisis,
 plan_main_focus_workers,
 suitable_skin_surfaces,
 fund_initial_endowment,
 purpose_measure_finance_operating_costs]

In [None]:
list(map(str, self.word2vec_document))

In [209]:
len(model_language_model_filter.select_pos_index())

22587

In [202]:
len(model_language_model_filter.extract_pos())

22597

In [203]:
len(model_language_model_filter.select_pos_embeddings())

22587

In [None]:
similarity_functions = [cosine_similarity]
for similarity_function in similarity_functions:
    print('Start computing similarity matrix.')
    model_similarity_matrix = get_similarity_matrix(model_language_model_filter.select_pos_embeddings(),
                                                    model_language_model_filter.extract_pos(),
                                                    metric=similarity_function)
    print('Finish computing similarity matrix.')
    print('Save similarity matrix.')
    model_similarity_matrix.to_csv(f'docs/similarity_matrices/{model_name}_{similarity_function.__name__}_matrix.csv', index=False)
    print('Create d3Graphs')
    create_graph_for_language_model_key_words(model_similarity_matrix,
                                              model_language_model_filter.select_key_words(),
                                              model_name=model_name)

In [102]:
%%time

model2_dataset_sources_config = [
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]

model2 = execute_model_steps('model2', model2_dataset_sources_config)

Start computing similarity matrix.
Finish computing similarity matrix.
Save similarity matrix.
Create d3Graphs
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/agreement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/companies.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a sl

Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/workers.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/measures.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/temporary.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/social.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/support.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/covid19.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/public.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/statement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/health.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/minister.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/new.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/coronavirus.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/vaccine.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/commission.html
CPU times: user 4min 15s, sys: 621 ms, total: 4min 16s
Wall time: 4min 14s


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [55]:
%%time

model3_dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]

model3 = execute_model_steps('model3', model3_dataset_sources_config)

Start computing similarity matrix.


KeyboardInterrupt: 

In [90]:
%%time

# model1_language_model_filter = LanguageModelWordsFilter(model1_language_model_pipeline.word2vec,
#                                                 KEY_WORDS, pos=['NOUN', 'ADJ'])

model2_language_model_filter = LanguageModelWordsFilter(model2_language_model_pipeline.word2vec,
                                                        KEY_WORDS, pos=['NOUN', 'ADJ'])

# model3_language_model_filter = LanguageModelWordsFilter(model3_language_model_pipeline.word2vec,
#                                                 KEY_WORDS, pos=['NOUN', 'ADJ'])

CPU times: user 2.34 s, sys: 3.69 ms, total: 2.34 s
Wall time: 4.85 s


## Similarity matrices
### Euclidean similarity

In [27]:
# %%time
#
# model1_euclidean_similarity_matrix = get_similarity_matrix(model1_language_model_filter.select_pos_embeddings(),
#                                                            model1_language_model_filter.extract_pos(),
#                                                            metric=euclidean_similarity)
# model1_euclidean_similarity_matrix

KeyboardInterrupt: 

In [28]:
# %%time
#
# model1_euclidean_similarity_matrix.to_csv('docs/similarity_matrices/model1_euclidean_similarity_matrix.csv', index=False)
#
#

NameError: name 'model1_euclidean_similarity_matrix' is not defined

In [34]:
# %%time
#
# model2_euclidean_similarity_matrix = get_similarity_matrix(model2_language_model_filter.select_pos_embeddings(),
#                                                            model2_language_model_filter.extract_pos(),
#                                                            metric=euclidean_similarity)
# model2_euclidean_similarity_matrix

CPU times: user 2min 37s, sys: 2.54 s, total: 2min 39s
Wall time: 2min 37s


Unnamed: 0,commission,parliament,decision,committee,case,document,text,covid19,certain,relevance,...,communication_commission_approval_content_draft_commission_regulation,transparency_confidentiality_requirements,euipo_heretat,sum01_eu_trade_mark_opposition_proceedings,mark_sum011,prior_national_word,transport&logistics,specific_limit_values,case_c744/19:_judgment_court_ninth_chamber,boto
commission,1.000000,0.082800,0.075305,0.064007,0.045613,0.084016,0.058155,0.050062,0.055557,0.054982,...,0.082736,0.082633,0.083345,0.083448,0.083295,0.083205,0.082859,0.082788,0.083021,0.082768
parliament,0.082800,1.000000,0.070180,0.069163,0.039239,0.062551,0.055359,0.050582,0.052394,0.053493,...,0.063684,0.063534,0.063912,0.063998,0.063933,0.063826,0.063742,0.063685,0.063740,0.063652
decision,0.075305,0.070180,1.000000,0.054997,0.044865,0.058793,0.056815,0.051697,0.054655,0.056378,...,0.075993,0.075986,0.076302,0.076406,0.076282,0.076204,0.076045,0.075974,0.075949,0.075834
committee,0.064007,0.069163,0.054997,1.000000,0.043116,0.060642,0.051762,0.050852,0.052483,0.051999,...,0.060487,0.060441,0.060803,0.060950,0.060891,0.060889,0.060648,0.060558,0.060576,0.060580
case,0.045613,0.039239,0.044865,0.043116,1.000000,0.040346,0.046568,0.044179,0.048377,0.047620,...,0.056998,0.056992,0.057450,0.057594,0.057465,0.057553,0.057316,0.057095,0.057171,0.057211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
prior_national_word,0.083205,0.063826,0.076204,0.060889,0.057553,0.062363,0.070270,0.059047,0.070519,0.064096,...,0.788857,0.784391,0.910535,0.917123,0.933999,1.000000,0.846005,0.824710,0.829553,0.829118
transport&logistics,0.082859,0.063742,0.076045,0.060648,0.057316,0.061997,0.070264,0.058963,0.070101,0.064039,...,0.871099,0.845438,0.875161,0.826948,0.863864,0.846005,1.000000,0.897038,0.909750,0.904828
specific_limit_values,0.082788,0.063685,0.075974,0.060558,0.057095,0.062083,0.070295,0.058957,0.070126,0.064021,...,0.884574,0.873756,0.850773,0.804746,0.841960,0.824710,0.897038,1.000000,0.908934,0.896672
case_c744/19:_judgment_court_ninth_chamber,0.083021,0.063740,0.075949,0.060576,0.057171,0.062046,0.070066,0.058889,0.070040,0.063841,...,0.900336,0.873365,0.861692,0.808956,0.849012,0.829553,0.909750,0.908934,1.000000,0.925948


In [30]:
# %%time
#
# model2_euclidean_similarity_matrix.to_csv('docs/similarity_matrices/model2_euclidean_similarity_matrix.csv', index=False)
#
#

NameError: name 'model2_euclidean_similarity_matrix' is not defined

In [31]:
# %%time
#
# model3_euclidean_similarity_matrix = get_similarity_matrix(model3_language_model_filter.select_pos_embeddings(),
#                                                            model3_language_model_filter.extract_pos(),
#                                                            metric=euclidean_similarity)
# model3_euclidean_similarity_matrix
#

KeyboardInterrupt: 

In [32]:
# %%time
#
# model3_euclidean_similarity_matrix.to_csv('docs/similarity_matrices/model3_euclidean_similarity_matrix.csv', index=False)


NameError: name 'model3_euclidean_similarity_matrix' is not defined

### Cosine similarity

In [33]:
# %%time
#
# model1_cosine_similarity_matrix = get_similarity_matrix(model1_language_model_filter.select_pos_embeddings(),
#                                                         model1_language_model_filter.extract_pos(),
#                                                         metric=cosine_similarity)
# model1_cosine_similarity_matrix
#
#

KeyboardInterrupt: 

In [34]:
# %%time
#
# model1_cosine_similarity_matrix.to_csv('docs/similarity_matrices/model1_cosine_similarity_matrix.csv', index=False)
#

NameError: name 'model1_cosine_similarity_matrix' is not defined

In [91]:
%%time

model2_cosine_similarity_matrix = get_similarity_matrix(model2_language_model_filter.select_pos_embeddings(),
                                                        model2_language_model_filter.extract_pos(),
                                                        metric=cosine_similarity)
model2_cosine_similarity_matrix

CPU times: user 4min 53s, sys: 704 ms, total: 4min 53s
Wall time: 11min 49s


Unnamed: 0,commission,parliament,decision,committee,case,document,text,covid19,certain,relevance,...,communication_commission_approval_content_draft_commission_regulation,transparency_confidentiality_requirements,euipo_heretat,sum01_eu_trade_mark_opposition_proceedings,mark_sum011,prior_national_word,transport&logistics,specific_limit_values,case_c744/19:_judgment_court_ninth_chamber,boto
commission,1.000000,0.670427,0.414224,0.423243,-0.145643,0.686729,0.183406,0.053929,0.064672,0.191782,...,0.180886,0.329421,0.347208,0.310690,0.339355,0.348500,0.193206,0.487790,0.494167,-0.112157
parliament,0.670427,1.000000,0.522375,0.562525,-0.262322,0.517516,0.252316,0.232249,0.161022,0.250920,...,0.219128,0.240692,0.212929,0.189861,0.253888,0.269060,0.059297,0.647529,0.426200,-0.109268
decision,0.414224,0.522375,1.000000,0.262408,-0.067434,0.303380,0.181880,0.170522,0.019721,0.217507,...,0.564989,0.298231,0.247045,0.258547,0.343549,0.366274,0.130816,0.315021,0.444230,-0.194096
committee,0.423243,0.562525,0.262408,1.000000,0.040751,0.524238,0.212713,0.264622,0.261182,0.272301,...,0.035211,-0.129559,0.382092,0.435149,0.448923,0.423003,0.034829,0.594126,0.409555,0.139304
case,-0.145643,-0.262322,-0.067434,0.040751,1.000000,-0.151771,0.086390,0.038609,0.174968,0.200303,...,-0.080052,-0.563047,0.636366,0.654276,0.610527,0.596396,0.569846,-0.136172,0.371383,0.252114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
prior_national_word,0.348500,0.269060,0.366274,0.423003,0.596396,0.418330,0.412129,0.272794,0.522457,0.490261,...,0.072649,-0.341930,0.945480,0.957843,0.973236,1.000000,0.378898,0.430758,0.783506,-0.203918
transport&logistics,0.193206,0.059297,0.130816,0.034829,0.569846,-0.173157,-0.023046,-0.340299,-0.098120,0.086016,...,0.336724,-0.168342,0.369911,0.379737,0.368428,0.378898,1.000000,-0.192969,0.305551,0.198106
specific_limit_values,0.487790,0.647529,0.315021,0.594126,-0.136172,0.681655,0.230301,0.236811,0.287000,0.217400,...,-0.209568,0.077483,0.381948,0.408830,0.450076,0.430758,-0.192969,1.000000,0.367981,-0.140975
case_c744/19:_judgment_court_ninth_chamber,0.494167,0.426200,0.444230,0.409555,0.371383,0.364526,0.509949,0.230867,0.391981,0.552217,...,0.182003,-0.284774,0.746701,0.732091,0.739655,0.783506,0.305551,0.367981,1.000000,-0.279124


In [36]:
# %%time
#
# model2_cosine_similarity_matrix.to_csv('docs/similarity_matrices/model2_cosine_similarity_matrix.csv', index=False)
#
#

KeyboardInterrupt: 

In [37]:
# %%time
# model3_cosine_similarity_matrix = get_similarity_matrix(model3_language_model_filter.select_pos_embeddings(),
#                                                         model3_language_model_filter.extract_pos(),
#                                                         metric=cosine_similarity)
# model3_cosine_similarity_matrix
#

KeyboardInterrupt: 

In [38]:
# model3_cosine_similarity_matrix.to_csv('docs/similarity_matrices/model3_cosine_similarity_matrix.csv', index=False)
#

NameError: name 'model3_cosine_similarity_matrix' is not defined

### Manhattan similarity

In [None]:
# %%time
# model1_manhattan_similarity_matrix = get_similarity_matrix(model1_language_model_filter.select_pos_embeddings(),
#                                                            model1_language_model_filter.extract_pos(),
#                                                            metric=manhattan_similarity)
# model1_manhattan_similarity_matrix
#

In [None]:
# model1_manhattan_similarity_matrix.to_csv('docs/similarity_matrices/model1_manhattan_similarity_matrix.csv', index=False)
#

In [None]:
# %%time
#
# model2_manhattan_similarity_matrix = get_similarity_matrix(model2_language_model_filter.select_pos_embeddings(),
#                                                            model2_language_model_filter.extract_pos(),
#                                                            metric=manhattan_similarity)
# model2_manhattan_similarity_matrix
#

In [None]:
# %%time
#
# model2_manhattan_similarity_matrix.to_csv('docs/similarity_matrices/model2_manhattan_similarity_matrix.csv', index=False)
#
#

In [None]:
# %%time
# model3_manhattan_similarity_matrix = get_similarity_matrix(model3_language_model_filter.select_pos_embeddings(),
#                                                            model3_language_model_filter.extract_pos(),
#                                                            metric=manhattan_similarity)
# model3_manhattan_similarity_matrix
#

In [None]:
# %%time
#
# model3_manhattan_similarity_matrix.to_csv('docs/similarity_matrices/model3_manhattan_similarity_matrix.csv', index=False)


### Select key words as clusters to visualize the graph similarity between this words

In [186]:
# model1_d3graph = create_graph_for_language_model_key_words(model1_cosine_similarity_matrix,
#                                                            model1_language_model_filter.select_key_words(),
#                                                            model_number=1)
#


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/agreement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/companies.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/workers.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/measures.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/temporary.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/social.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/covid19.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/public.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/national.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/statement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/health.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/minister.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/new.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/coronavirus.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/vaccine.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model1_graphs/commission.html


In [93]:
model2_d3graph = create_graph_for_language_model_key_words(model2_cosine_similarity_matrix,
                                                           model2_language_model_filter.select_key_words(),
                                                           model_name='model2')

  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/agreement.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/companies.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/workers.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/measures.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/temporary.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/social.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/support.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/covid19.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/public.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/statement.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/health.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/minister.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/new.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/coronavirus.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/vaccine.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/support.html


  df.columns=df.columns.str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/commission.html


In [185]:
# model3_d3graph = create_graph_for_language_model_key_words(model3_cosine_similarity_matrix,
#                                                            model3_language_model_filter.select_key_words(),
#                                                            model_number=3)

[d3graph] >Creating directory [docs/word-similarity-web/model3_graphs/]
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/agreement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/companies.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is tryin

Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/workers.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/measures.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/temporary.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/social.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/covid19.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/public.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/national.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/statement.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/health.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/minister.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/new.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/coronavirus.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/vaccine.html



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/support.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model3_graphs/commission.html


### Steps for word embedding visualization:
* detect and extract key words and most relevant words
* train TSNE model
* create a dataframe with the clusters, their words and their placement on the graph

In [187]:
model1_word_embeddings = select_words_and_embedding_clusters(model1_language_model_pipeline.word2vec.wv,
                                                             model1_language_model_filter.select_key_words())
model2_word_embeddings = select_words_and_embedding_clusters(model2_language_model_pipeline.word2vec.wv,
                                                             model2_language_model_filter.select_key_words())
model3_word_embeddings = select_words_and_embedding_clusters(model3_language_model_pipeline.word2vec.wv,
                                                             model3_language_model_filter.select_key_words())

model1_tsne_model = create_tsne_model(model1_word_embeddings[0])
model2_tsne_model = create_tsne_model(model2_word_embeddings[0])
model3_tsne_model = create_tsne_model(model3_word_embeddings[0])

model1_word_embeddings_dataframe = create_word_clusters_matrix(model1_language_model_filter.select_key_words(),
                                                               model1_word_embeddings[1], model1_tsne_model)
model2_word_embeddings_dataframe = create_word_clusters_matrix(model2_language_model_filter.select_key_words(),
                                                               model2_word_embeddings[1], model2_tsne_model)
model3_word_embeddings_dataframe = create_word_clusters_matrix(model3_language_model_filter.select_key_words(),
                                                               model3_word_embeddings[1], model3_tsne_model)

### Graph visualization for the first model

In [188]:
model1_word_cluster_plot = px.scatter(model1_word_embeddings_dataframe,
                                      x='X', y='Y', color=model1_word_embeddings_dataframe.word_cluster,
                                      labels={'color': 'word'}, hover_data=["word"])
model1_word_cluster_plot


### Graph visualization for the second model

In [189]:
model2_word_cluster_plot = px.scatter(model2_word_embeddings_dataframe,
                                      x='X', y='Y', color=model2_word_embeddings_dataframe.word_cluster,
                                      labels={'color': 'word'}, hover_data=["word"])
model2_word_cluster_plot


### Graph visualization for the third model

In [190]:
model3_word_cluster_plot = px.scatter(model3_word_embeddings_dataframe,
                                      x='X', y='Y', color=model3_word_embeddings_dataframe.word_cluster,
                                      labels={'color': 'word'}, hover_data=["word"])
model3_word_cluster_plot