## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract NOUN and NOUN PHRASES from each text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [4]:
import sys

import numpy as np
from IPython.core.display import display
from spacy.tokens.doc import Doc

sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))

import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

from typing import List, Tuple

import spacy

nlp = spacy.load('en_core_web_sm')

import pandas as pd
from gensim.models import Word2Vec
import plotly.express as px
import networkx as nx
import matplotlib.pyplot as plt

from sem_covid.services.data_registry import Dataset
from sem_covid.adapters.data_source import IndexTabularDataSource

from sem_covid.entrypoints.notebooks.topic_modeling.topic_modeling_wrangling.token_management import (filter_pos,
    select_pos, filter_pos, filter_stop_words)

from sem_covid.services.sc_wrangling.data_cleaning import (clean_text_from_specific_characters, clean_fix_unicode,
                                                           clean_remove_currency_symbols, clean_remove_emails,
                                                           clean_remove_urls, clean_remove_stopwords)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.similarity_calculus import (
    euclidean_similarity, manhattan_similarity, cosine_similarity, get_similarity_matrix)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.document_handling_tools import(
    document_atomization_noun_phrases, lemmatize_document)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.word_embeddings_handler import (
    select_words_and_embedding_clusters, create_tsne_model, create_word_clusters_matrix)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


## Define constants

In [5]:
PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'use_of_measure_description', 'involvement_of_social_partners_description']

DEFAULT_TEXTUAL_COLUMN = ['title']
WINDOW = 5
MIN_COUNT = 1
VECTOR_SIZE = 300
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

## Data preprocessing
- data cleanup
- turn corpus into spacy document

In [47]:
def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    This function receives the document and leads through cleaning steps
    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    unused_characters = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                         "[", "]", "-", "_", "\r"]

    new_document_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=unused_characters)
    new_document_corpus = new_document_corpus.apply(clean_fix_unicode)
    new_document_corpus = new_document_corpus.apply(clean_remove_urls)
    new_document_corpus = new_document_corpus.apply(clean_remove_emails)
    new_document_corpus = new_document_corpus.apply(clean_remove_currency_symbols)
    new_document_corpus = new_document_corpus.apply(clean_remove_stopwords)

    return new_document_corpus

def generate_graph(sm: pd.DataFrame, graph: nx.Graph, root_word: str, kn: int, threshold:np.float64 = 0.8, deep_level: int = 0, max_deep_level:int = 2 ) -> nx.Graph:
    if deep_level>max_deep_level:
        return graph
    new_nodes = sm[root_word].sort_values(ascending=False)[:kn].index.to_list()
    new_nodes_weight = list(sm[root_word].sort_values(ascending=False)[:kn].values)
    for index in range(0,len(new_nodes)):
        if new_nodes_weight[index] >= threshold:
            graph.add_edge(root_word,new_nodes[index], weight=new_nodes_weight[index])
            generate_graph(sm,graph,new_nodes[index],kn,threshold,deep_level+1,max_deep_level)
    return graph

In [7]:
class LanguageModelPipeline:
    """
        This pipeline executes the steps for word2vec language training.
    """

    def __init__(self, dataset_sources: List[Tuple[IndexTabularDataSource, List[str]]]):
        """
            :param dataset_sources: represents the source of the datasets.
        """
        self.dataset_sources = dataset_sources
        self.documents_corpus = pd.Series()
        self.word2vec = None

    def download_datasets(self):
        """
            In this step it will download the dataset and detect selected columns.
            It can be downloaded as many datasets as there are in data source.
        """
        self.dataset_sources = [(dataset_columns, dataset_source.fetch())
                                for dataset_source, dataset_columns in self.dataset_sources]

    def extract_textual_data(self):
        """
            After downloading the datasets, the textual data will be found and and concatenated
            with executing of several steps as well. It will fill the NaN values with empty space,
            add a dot at the end of each concatenated column and reset the index.
        """
        self.documents_corpus = pd.concat([dataset[columns]
                                          .fillna(value="")
                                          .agg('. '.join, axis=1)
                                          .reset_index(drop=True)
                                           for columns, dataset in self.dataset_sources
                                           ], ignore_index=True)

    def clean_textual_data(self):
        """
            The next step is data cleaning. In this step the function "apply_cleaning_functions"
            applies the following actions:
                - clean the document from specific characters
                - delete unicode
                - removes emails and URLs and currency symbols
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)

    def transform_to_spacy_doc(self):
        """
            When the document is clean, is going to be transform into spacy document
        """
        self.documents_corpus = self.documents_corpus.apply(nlp)

    def extract_features(self):
        """
            To extract the parts of speech, below it was defined classes for each token is necessary.
        """
        self.documents_corpus = pd.concat([self.documents_corpus,
                                           self.documents_corpus.apply(document_atomization_noun_phrases),
                                           self.documents_corpus.apply(lemmatize_document)]
                                           ,ignore_index=True)

        self.documents_corpus = self.documents_corpus.apply(lambda x: list(map(str, x)))

    def model_training(self):
        """
            When the data is prepared it's stored into Word2Vec model.
        """
        self.word2vec = Word2Vec(sentences=self.documents_corpus, window=WINDOW,
                                 min_count=MIN_COUNT, vector_size=VECTOR_SIZE)

    def execute(self):
        """
            The final step is execution, where are stored each step and it will be executed in a row
        """
        self.download_datasets()
        self.extract_textual_data()
        self.clean_textual_data()
        self.transform_to_spacy_doc()
        self.extract_features()
        self.model_training()

## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline

In [8]:
dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]
model1_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
model1_language_model_pipeline.execute()

100% (1288 of 1288) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


KeyboardInterrupt: 

## Experiment Nr#2 language model based on:
- eu-cellar


In [9]:
dataset_sources_config = [
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
model2_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
model2_language_model_pipeline.execute()


100% (2653 of 2653) |####################| Elapsed Time: 0:00:01 Time:  0:00:01


In [14]:
doc = nlp(str(model2_language_model_pipeline.word2vec.wv.index_to_key))
selected_pos = select_pos(doc, pos=['NOUN', 'ADJ'])


In [15]:
selected_pos = list(map(str, selected_pos))


In [16]:
key_words = ['work', 'agreement', 'working', 'companies', 'workers',
             'measures', 'temporary', 'social', 'support', 'covid19',
             '2020', 'public', 'national', 'ireland', 'statement', '2021',
             'announce', 'health', 'minister', 'new', 'billion', 'coronavirus',
             'vaccine', 'eur', 'support', 'million', 'commission', 'eu']
selected_key_words = [ word for word in key_words if word in selected_pos]


In [17]:
selected_pos_index = [model2_language_model_pipeline.word2vec.wv.index_to_key.index(token)
                      for token in selected_pos
                      if token in model2_language_model_pipeline.word2vec.wv.index_to_key]


In [18]:
selected_pos_embeddings = [model2_language_model_pipeline.word2vec.wv.vectors[index]
                           for index in selected_pos_index]

In [19]:
sim_matrix =  get_similarity_matrix(selected_pos_embeddings,
                      selected_pos,
                      metric=cosine_similarity)
sim_matrix


Unnamed: 0,european,council,regulation,commission,parliament,decision,committee,case,document,union,...,wastewater,salamander,supplier,donation,grandfathere,pave,override,balkan,leader,shortage
european,1.000000,0.744325,0.487154,0.511681,0.700258,0.378877,0.658112,-0.292653,0.428513,0.569740,...,0.215637,0.145747,0.011206,0.215781,-0.056217,0.059353,-0.386750,-0.032563,0.212013,0.026078
council,0.744325,1.000000,0.629656,0.658231,0.693651,0.417241,0.542533,-0.358522,0.501669,0.545901,...,0.148657,0.147761,0.011673,0.086677,0.019510,-0.048776,-0.536902,0.021019,0.234899,0.042876
regulation,0.487154,0.629656,1.000000,0.424473,0.652164,0.676131,0.239274,-0.352621,0.329811,0.342675,...,0.078149,0.191997,-0.041570,0.010952,-0.333977,-0.186397,-0.288352,0.157385,0.214810,0.095924
commission,0.511681,0.658231,0.424473,1.000000,0.597338,0.393663,0.357089,-0.193273,0.696010,0.340861,...,0.332862,0.181456,0.211504,0.084190,-0.036268,-0.061286,-0.541670,0.221573,0.318258,-0.092259
parliament,0.700258,0.693651,0.652164,0.597338,1.000000,0.454662,0.495427,-0.344882,0.480537,0.545771,...,0.147232,0.111525,0.007721,0.152890,-0.107517,-0.120531,-0.428568,0.032061,0.206461,0.076532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pave,0.059353,-0.048776,-0.186397,-0.061286,-0.120531,-0.128652,0.391295,0.697673,-0.082864,0.050038,...,0.479396,0.580868,-0.004239,0.560735,-0.457008,1.000000,0.275214,0.437571,0.570645,0.183942
override,-0.386750,-0.536902,-0.288352,-0.541670,-0.428568,-0.329348,-0.266091,0.297309,-0.579777,-0.142641,...,0.012071,0.027798,-0.100985,0.178105,-0.164118,0.275214,1.000000,0.057855,0.021040,0.072889
balkan,-0.032563,0.021019,0.157385,0.221573,0.032061,0.189217,0.105226,0.432825,0.366109,-0.013851,...,0.794751,0.856240,0.311317,0.418138,-0.631949,0.437571,0.057855,1.000000,0.869438,0.346583
leader,0.212013,0.234899,0.214810,0.318258,0.206461,0.271193,0.378307,0.460051,0.377487,0.214572,...,0.826043,0.858358,0.358055,0.585472,-0.649061,0.570645,0.021040,0.869438,1.000000,0.183269


In [54]:
from d3graph import d3graph


for index in range(0, len(selected_key_words)):
    graph = generate_graph(sim_matrix ,nx.Graph(),selected_key_words[index],4, threshold=0.5 ,max_deep_level=2)
    adjmat = nx.adjacency_matrix(graph).todense()
    # print(len(graph.edges(data=False)))
    # print(adjmat.shape)
    # print(graph.nodes())
    print(graph.adj)
    # adjmat = pd.DataFrame(data=adjmat, index=graph.nodes(), columns=graph.nodes())
    # node_size = [for key, data in graph.adj[selected_key_words[index]]]
    # d3graph(adjmat, savename=selected_key_words[index], node_color=adjmat.columns.values, node_size=,
    #         width=1920, height=1080)
    # plt.figure(index,figsize=(10,10))
    # plt.title(selected_key_words[index])
    # nx.draw_spring(graph, with_labels=True)

dict_keys(['work', 'staff', 'ostreae', 'frances', 'arduous', 'latvias', 'dog', 'analysis', '858/2004', 'lithuanias', 'cap', 'plan', 'nonco2'])
dict_keys(['agreement', 'aviation', 'euromediterranean', 'force', 'international', 'union', 'entry', 'islamic', 'civil', 'organization', 'protocol', 'supplementary', 'maldives', 'community', 'bosniaherzegovina'])
dict_keys(['companies', 'household', 'voice', 'legitimate', 'machines', 'ypiresion', 'recorder', '25hour', 'cockpit', 'handicaps', 'line', 'vasileios', 'kokolakis', 'laboratoires', 'gasification', 'possibility'])
dict_keys(['workers', 'freedom', 'movement', 'worker', 'consideration', 'free', 'children', 'reimbursement', 'property', 'provider', 'treatment', 'taxation', 'occurrence', 'online', 'evidence', 'acquisition', 'intangible', 'contact', 'claim', 'length', 'double', 'benefits'])
dict_keys(['measures', 'measure', 'exceptional', 'diagnoses'])
dict_keys(['temporary', 'specific', '2020/1350', 'derogation', 'dissemination', '16february'

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar

In [7]:
dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.EU_CELLAR, DEFAULT_TEXTUAL_COLUMN)
]
model3_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config)
model3_language_model_pipeline.execute()

In [10]:
model2_document = nlp(str(model2_language_model_pipeline.word2vec.wv.index_to_key))
model2_selected_pos = select_pos(model2_document, pos=['NOUN', 'ADJ'])
model2_selected_pos = list(map(str, model2_selected_pos))

In [11]:
key_words = ['work', 'agreement', 'working', 'companies', 'workers',
             'measures', 'temporary', 'social', 'support', 'covid19',
             '2020', 'public', 'national', 'ireland', 'statement', '2021',
             'announce', 'health', 'minister', 'new', 'billion', 'coronavirus',
             'vaccine', 'eur', 'support', 'million', 'commission', 'eu']



model2_selected_key_words = [ word for word in key_words if word in model2_selected_pos]
model2_selected_pos_index = [model2_language_model_pipeline.word2vec.wv.index_to_key.index(token)
                      for token in model2_selected_pos
                      if token in model2_language_model_pipeline.word2vec.wv.index_to_key]
model2_selected_pos_embeddings = [model2_language_model_pipeline.word2vec.wv.vectors[index]
                           for index in model2_selected_pos_index]

## Similarity matrices
### Euclidean similarity

In [17]:
# model1_euclidean_similarity_matrix = get_similarity_matrix(selected_pos_embeddings,
#                                                            selected_pos,
#                                                            metric=euclidean_similarity)
# model1_euclidean_similarity_matrix

In [18]:
model2_euclidean_similarity_matrix = get_similarity_matrix(model2_selected_pos_embeddings,
                                                           model2_selected_pos,
                                                           metric=euclidean_similarity)
model2_euclidean_similarity_matrix

Unnamed: 0,european,council,regulation,commission,parliament,decision,committee,case,document,union,...,wastewater,salamander,supplier,donation,grandfathere,pave,override,balkan,leader,shortage
european,1.000000,0.092975,0.068035,0.078721,0.084919,0.067845,0.076907,0.039055,0.064252,0.074069,...,0.070132,0.070232,0.070103,0.070190,0.070135,0.070028,0.070207,0.070257,0.070087,0.070256
council,0.092975,1.000000,0.077712,0.089045,0.081967,0.067675,0.067401,0.038438,0.066541,0.069604,...,0.069701,0.069711,0.069643,0.069696,0.069645,0.069539,0.069735,0.069861,0.069663,0.069772
regulation,0.068035,0.077712,1.000000,0.067661,0.081639,0.084227,0.054282,0.037151,0.056214,0.059274,...,0.064737,0.064743,0.064671,0.064674,0.064639,0.064439,0.064759,0.064867,0.064702,0.064796
commission,0.078721,0.089045,0.067661,1.000000,0.080171,0.074055,0.064334,0.043303,0.084105,0.065690,...,0.083329,0.083231,0.083258,0.083302,0.083167,0.082983,0.083314,0.083325,0.083361,0.083318
parliament,0.084919,0.081967,0.081639,0.080171,1.000000,0.068253,0.065765,0.037646,0.062161,0.070635,...,0.065632,0.065639,0.065574,0.065654,0.065578,0.065430,0.065690,0.065763,0.065623,0.065731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pave,0.070028,0.069539,0.064439,0.082983,0.065430,0.077736,0.065999,0.056018,0.063948,0.070418,...,0.865758,0.874539,0.854341,0.891273,0.897697,1.000000,0.875922,0.856431,0.837585,0.867423
override,0.070207,0.069735,0.064759,0.083314,0.065690,0.078059,0.066043,0.055863,0.064151,0.070653,...,0.919818,0.921074,0.891209,0.918554,0.915038,0.875922,1.000000,0.891130,0.887445,0.924946
balkan,0.070257,0.069861,0.064867,0.083325,0.065763,0.078216,0.066001,0.055830,0.064080,0.070653,...,0.896481,0.893802,0.867420,0.872706,0.890004,0.856431,0.891130,1.000000,0.858806,0.912305
leader,0.070087,0.069663,0.064702,0.083361,0.065623,0.078179,0.065981,0.056002,0.064161,0.070654,...,0.906094,0.869230,0.857809,0.877381,0.864832,0.837585,0.887445,0.858806,1.000000,0.879809


In [19]:
# % % time
# model3_euclidean_similarity_matrix = get_similarity_matrix(model3_language_model_pipeline.word2vec.wv.vectors[:100],
#                                                            model3_language_model_pipeline.word2vec.wv.index_to_key[
#                                                            :100],
#                                                            metric=euclidean_similarity)
# model3_euclidean_similarity_matrix

### Cosine similarity

In [20]:
# % % time
# model1_cosine_similarity_matrix = get_similarity_matrix(model1_language_model_pipeline.word2vec.wv.vectors[:100],
#                                                         model1_language_model_pipeline.word2vec.wv.index_to_key[:100],
#                                                         metric=cosine_similarity)
# model1_cosine_similarity_matrix

In [12]:
%%time
model2_cosine_similarity_matrix = get_similarity_matrix(model2_selected_pos_embeddings,
                                                        model2_selected_pos,
                                                        metric=cosine_similarity)
model2_cosine_similarity_matrix

CPU times: user 1min 27s, sys: 95.4 ms, total: 1min 27s
Wall time: 1min 27s


Unnamed: 0,european,council,regulation,commission,parliament,decision,committee,case,document,union,...,wastewater,salamander,supplier,donation,grandfathere,pave,override,balkan,leader,shortage
european,1.000000,0.744325,0.487154,0.511681,0.700258,0.378877,0.658112,-0.292653,0.428513,0.569740,...,0.215637,0.145747,0.011206,0.215781,-0.056217,0.059353,-0.386750,-0.032563,0.212013,0.026078
council,0.744325,1.000000,0.629656,0.658231,0.693651,0.417241,0.542533,-0.358522,0.501669,0.545901,...,0.148657,0.147761,0.011673,0.086677,0.019510,-0.048776,-0.536902,0.021019,0.234899,0.042876
regulation,0.487154,0.629656,1.000000,0.424473,0.652164,0.676131,0.239274,-0.352621,0.329811,0.342675,...,0.078149,0.191997,-0.041570,0.010952,-0.333977,-0.186397,-0.288352,0.157385,0.214810,0.095924
commission,0.511681,0.658231,0.424473,1.000000,0.597338,0.393663,0.357089,-0.193273,0.696010,0.340861,...,0.332862,0.181456,0.211504,0.084190,-0.036268,-0.061286,-0.541670,0.221573,0.318258,-0.092259
parliament,0.700258,0.693651,0.652164,0.597338,1.000000,0.454662,0.495427,-0.344882,0.480537,0.545771,...,0.147232,0.111525,0.007721,0.152890,-0.107517,-0.120531,-0.428568,0.032061,0.206461,0.076532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pave,0.059353,-0.048776,-0.186397,-0.061286,-0.120531,-0.128652,0.391295,0.697673,-0.082864,0.050038,...,0.479396,0.580868,-0.004239,0.560735,-0.457008,1.000000,0.275214,0.437571,0.570645,0.183942
override,-0.386750,-0.536902,-0.288352,-0.541670,-0.428568,-0.329348,-0.266091,0.297309,-0.579777,-0.142641,...,0.012071,0.027798,-0.100985,0.178105,-0.164118,0.275214,1.000000,0.057855,0.021040,0.072889
balkan,-0.032563,0.021019,0.157385,0.221573,0.032061,0.189217,0.105226,0.432825,0.366109,-0.013851,...,0.794751,0.856240,0.311317,0.418138,-0.631949,0.437571,0.057855,1.000000,0.869438,0.346583
leader,0.212013,0.234899,0.214810,0.318258,0.206461,0.271193,0.378307,0.460051,0.377487,0.214572,...,0.826043,0.858358,0.358055,0.585472,-0.649061,0.570645,0.021040,0.869438,1.000000,0.183269


In [22]:
# % % time
# model3_cosine_similarity_matrix = get_similarity_matrix(model3_language_model_pipeline.word2vec.wv.vectors[:100],
#                                                         model3_language_model_pipeline.word2vec.wv.index_to_key[:100],
#                                                         metric=cosine_similarity)
# model3_cosine_similarity_matrix

### Manhattan similarity

In [23]:
# % % time
# model1_manhattan_similarity_matrix = get_similarity_matrix(model1_language_model_pipeline.word2vec.wv.vectors[:100],
#                                                            model1_language_model_pipeline.word2vec.wv.index_to_key[
#                                                            :100],
#                                                            metric=manhattan_similarity)
# model1_manhattan_similarity_matrix

In [24]:
model2_manhattan_similarity_matrix = get_similarity_matrix(model2_selected_pos_embeddings,
                                                           model2_selected_pos,
                                                           metric=manhattan_similarity)
model2_manhattan_similarity_matrix

Unnamed: 0,european,council,regulation,commission,parliament,decision,committee,case,document,union,...,wastewater,salamander,supplier,donation,grandfathere,pave,override,balkan,leader,shortage
european,1.000000,0.007373,0.005094,0.006096,0.006707,0.005137,0.005993,0.002886,0.004994,0.005782,...,0.005280,0.005289,0.005274,0.005282,0.005279,0.005270,0.005284,0.005289,0.005275,0.005288
council,0.007373,1.000000,0.006138,0.007098,0.006350,0.005240,0.005257,0.002872,0.005226,0.005607,...,0.005320,0.005320,0.005312,0.005316,0.005312,0.005303,0.005320,0.005331,0.005321,0.005326
regulation,0.005094,0.006138,1.000000,0.005230,0.006254,0.006615,0.004046,0.002841,0.004230,0.004536,...,0.005052,0.005053,0.005049,0.005046,0.005044,0.005026,0.005054,0.005061,0.005049,0.005056
commission,0.006096,0.007098,0.005230,1.000000,0.006083,0.005754,0.004947,0.003218,0.006467,0.005003,...,0.006530,0.006523,0.006520,0.006525,0.006512,0.006498,0.006528,0.006529,0.006539,0.006530
parliament,0.006707,0.006350,0.006254,0.006083,1.000000,0.005265,0.004983,0.002827,0.004794,0.005654,...,0.005006,0.005006,0.005004,0.005008,0.005001,0.004989,0.005010,0.005017,0.005005,0.005015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pave,0.005270,0.005303,0.005026,0.006498,0.004989,0.005911,0.005058,0.004194,0.004976,0.005374,...,0.320973,0.337905,0.296603,0.371773,0.392443,1.000000,0.337010,0.305543,0.276014,0.321604
override,0.005284,0.005320,0.005054,0.006528,0.005010,0.005935,0.005063,0.004182,0.004991,0.005394,...,0.454482,0.458793,0.375071,0.448611,0.444909,0.337010,1.000000,0.371704,0.357356,0.477827
balkan,0.005289,0.005331,0.005061,0.006529,0.005017,0.005945,0.005060,0.004181,0.004983,0.005393,...,0.393788,0.380275,0.323187,0.335197,0.375362,0.305543,0.371704,1.000000,0.306189,0.430648
leader,0.005275,0.005321,0.005049,0.006539,0.005005,0.005942,0.005058,0.004193,0.004993,0.005395,...,0.405444,0.322692,0.304147,0.332309,0.320438,0.276014,0.357356,0.306189,1.000000,0.343105


In [25]:
# % % time
# model3_manhattan_similarity_matrix = get_similarity_matrix(model3_language_model_pipeline.word2vec.wv.vectors[:100],
#                                                            model3_language_model_pipeline.word2vec.wv.index_to_key[
#                                                            :100],
#                                                            metric=manhattan_similarity)
# model3_manhattan_similarity_matrix

In [36]:
# def generate_graph(similarity_matrix: pd.DataFrame, graph: nx.Graph, root_word: str, top_words: int,
#                    threshold:np.float64 = 0.8, deep_level: int = 0, max_deep_level: int = 2) -> nx.Graph:
#     if deep_level > max_deep_level:
#         return graph
#     new_nodes = similarity_matrix[root_word].sort_values(ascending=False)[:top_words].index.to_list()
#     new_nodes_weight = list(similarity_matrix[root_word].sort_values(ascending=False)[:top_words].values)
#     for index in range(0, len(new_nodes)):
#         if new_nodes_weight[index] >= threshold:
#             graph.add_edge(root_word, new_nodes[index])
#             generate_graph(similarity_matrix, graph, new_nodes[index], top_words + 1, threshold, deep_level+1, max_deep_level)
#
#     return graph



In [4]:
# Import library
from d3graph import d3graph, vec2adjmat

source = ['node A', 'node A', 'node A', 'node A', 'node A', 'node A', 'node A', 'node A']
target = ['node F', 'node B', 'node J', 'node F', 'node F', 'node M', 'node M', 'node A']
weight = [5.56, 0.5, 0.64, 0.23, 0.9,3.28,0.5,0.45]
adjmat = vec2adjmat(source, target, weight=weight)

adjmat

target,node A,node B,node F,node J,node M
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
node A,0.45,0.5,6.69,0.64,3.78
node B,0.0,0.0,0.0,0.0,0.0
node F,0.0,0.0,0.0,0.0,0.0
node J,0.0,0.0,0.0,0.0,0.0
node M,0.0,0.0,0.0,0.0,0.0


In [141]:
new_nodes = model2_cosine_similarity_matrix[model2_selected_key_words][:17]


source = list(new_nodes.index)
target = list(new_nodes.columns)
weight = list(new_nodes.values)

adjmat = vec2adjmat(source, target, weight=weight)



ValueError: Must produce aggregated value

### Select key words as clusters to visualize the graph similarity between this words


### Steps for word embedding visualization:
* detect and extract key words and most relevant words
* train TSNE model
* create a dataframe with the clusters, their words and their placement on the graph

In [None]:
# model1_word_embeddings = select_words_and_embedding_clusters(model1_language_model_pipeline.word2vec.wv, key_words)
# model2_word_embeddings = select_words_and_embedding_clusters(model2_language_model_pipeline.word2vec.wv, key_words)
# model3_word_embeddings = select_words_and_embedding_clusters(model3_language_model_pipeline.word2vec.wv, key_words)
#
# model1_tsne_model = create_tsne_model(model1_word_embeddings[0])
# model2_tsne_model = create_tsne_model(model2_word_embeddings[0])
# model3_tsne_model = create_tsne_model(model3_word_embeddings[0])
#
# model1_word_embeddings_dataframe = create_word_clusters_matrix(key_words, model1_word_embeddings[1], model1_tsne_model)
# model2_word_embeddings_dataframe = create_word_clusters_matrix(key_words, model2_word_embeddings[1], model2_tsne_model)
# model3_word_embeddings_dataframe = create_word_clusters_matrix(key_words, model3_word_embeddings[1], model3_tsne_model)

### Graph visualization for the first model

In [None]:
# model1_word_cluster_plot = px.scatter(model1_word_embeddings_dataframe,
#                                       x='X', y='Y', color=model1_word_embeddings_dataframe.word_cluster,
#                                       labels={'color': 'word'}, hover_data=["word"])
# model1_word_cluster_plot
#

### Graph visualization for the second model

In [None]:
# model2_word_cluster_plot = px.scatter(model2_word_embeddings_dataframe,
#                                       x='X', y='Y', color=model2_word_embeddings_dataframe.word_cluster,
#                                       labels={'color': 'word'}, hover_data=["word"])
# model2_word_cluster_plot
#

### Graph visualization for the third model

In [None]:
# model3_word_cluster_plot = px.scatter(model3_word_embeddings_dataframe,
#                                       x='X', y='Y', color=model3_word_embeddings_dataframe.word_cluster,
#                                       labels={'color': 'word'}, hover_data=["word"])
# model3_word_cluster_plot