## Word2vec model training
#### Model training based on three datasets' text data:
- M1: pwdb + eu_timeline  ( +  ireland_timeline )
- M2: ds_eu_cellar
- M3: M1+M2

#### Extract NOUN and NOUN PHRASES from each text data
#### Train the word2vec model with each dataset's textual data

### Import libraries

In [2]:
import sys

sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))

import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')
import warnings

warnings.filterwarnings("ignore")

import re
import pickle
from typing import List, Tuple

import spacy

nlp = spacy.load('en_core_web_sm')
nlp.max_length = 1500000

import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from d3graph import d3graph

from sem_covid.services.data_registry import Dataset
from sem_covid.services.store_registry import store_registry
from sem_covid.adapters.data_source import IndexTabularDataSource

from sem_covid.entrypoints.notebooks.topic_modeling.topic_modeling_wrangling.token_management import select_pos

from sem_covid.services.sc_wrangling.data_cleaning import (clean_text_from_specific_characters, clean_fix_unicode,
                                                           clean_remove_currency_symbols, clean_remove_emails,
                                                           clean_remove_urls, clean_remove_stopwords)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.similarity_calculus import (
    euclidean_similarity, manhattan_similarity, cosine_similarity, build_similarity_matrix)

from sem_covid.entrypoints.notebooks.language_modeling.language_model_tools.document_handling_tools import (
    document_atomization_noun_phrases, lemmatize_document)

## Define constants

In [36]:
PWDB_TEXTUAL_CLASS = ['title', 'background_info_description', 'content_of_measure_description',
                      'use_of_measure_description', 'involvement_of_social_partners_description']

DEFAULT_TEXTUAL_COLUMN = ['title']
WINDOW = 5
MIN_COUNT = 1
VECTOR_SIZE = 300
EPOCHS = 50
EU_TIMELINE_TOTAL_EXAMPLES = 171
IRELAND_TIMELINE_TOTAL_EXAMPLES = 410
EU_CELLAR_TOTAL_EXAMPLES = 2653

KEY_WORDS = ['work', 'agreement', 'working', 'companies', 'workers',
             'measures', 'temporary', 'social', 'support', 'covid19',
             '2020', 'public', 'national', 'ireland', 'statement', '2021',
             'announce', 'health', 'minister', 'new', 'billion', 'coronavirus',
             'vaccine', 'eur', 'support', 'million', 'commission', 'eu']

COUNTRIES = ['austria', 'belgium', 'bulgaria', 'croatia', 'cyprus', 'czechia', 'denmark', 'estonia',
             'european_union', 'finland', 'france', 'germany', 'greece', 'hungary', 'ireland', 'italy',
             'latvia', 'lithuania', 'luxembourg', 'malta', 'netherlands', 'norway', 'poland', 'portugal',
             'romania', 'slovakia', 'slovenia', 'spain', 'sweden', 'united', 'kingdom']

CATEGORY = ['retention', 'workplace', 'labour', 'recovery', 'economic', 'adaptation',
            'businesses', 'protection', 'essential', 'workers', 'business_continuity',
            'services', 'social', 'market']

SUBCATEGORY = ['safety', 'arrangements', 'health', 'spending', 'working', 'support', 'occupational',
               'stimulus_packages', 'access', 'time', 'finance', 'remote', 'flexibility', 'workers',
               'essential_services', 'remuneration']

TARGET_GROUPS_L1 = ['businesses', 'workers', 'citizens']

TARGET_GROUPS_L2 = ['company', 'older', 'people', 'female', 'aged', 'corporations', 'businesses',
                    'single', 'person', 'larger', 'forms', 'smes', 'ups', 'non', 'single_parents',
                    'citizens', 'professions', 'parents', 'groups', 'youth', 'workers', 'essential_services',
                    'sector', 'women', 'workplace', 'unemployed', 'care', 'facilities', 'other', 'standard',
                    'specific', 'companies', 'self', 'contractors', 'children', 'border', 'solo', 'refugees',
                    'minors', 'cross', 'platform', 'employment', 'seasonal', 'disabled', 'migrants', 'start',
                    'risk_group', 'commuters', 'employees']

FUNDING = ['companies', 'national_funds', 'employer', 'funds', 'national_funds']

WORD_GRAPH_CONFIGS = {'category': CATEGORY,
                      'subcategory': SUBCATEGORY,
                      'key_words': KEY_WORDS,
                      'countries': COUNTRIES,
                      'target_groups_l1': TARGET_GROUPS_L1,
                      'target_groups_l2': TARGET_GROUPS_L2,
                      'funding': FUNDING
                      }

NR1_MODEL_NAME = 'model1'
NR2_MODEL_NAME = 'model2'
NR3_MODEL_NAME = 'model3'

MODEL1_FILE_NAME = 'model1_language_model.model'
MODEL2_FILE_NAME = 'model2_language_model.model'
MODEL3_FILE_NAME = 'model3_language_model.model'

## Data preprocessing
- data cleanup
- turn corpus into spacy document

In [4]:
def add_space_between_dots_and_commas(text: str):
    return re.sub(r'(?<=[.,])(?=[^\s])', r' ', text)


def apply_cleaning_functions(document_corpus: pd.Series) -> pd.Series:
    """
    This function receives the document and leads through cleaning steps
    Args:
        document_corpus: dataset document corpus

    Returns: clean document corpus
    """
    unused_characters = ["\\r", ">", "\n", "\\", "<", "''", "%", "...", "\'", '"', "(", "\n", "*", "1)", "2)", "3)",
                         "[", "]", "-", "_", "\r", '®', '..']

    new_document_corpus = document_corpus.apply(clean_text_from_specific_characters, characters=unused_characters)
    new_document_corpus = new_document_corpus.apply(clean_fix_unicode)
    new_document_corpus = new_document_corpus.apply(clean_remove_urls)
    new_document_corpus = new_document_corpus.apply(clean_remove_emails)
    new_document_corpus = new_document_corpus.apply(clean_remove_currency_symbols)
    new_document_corpus = new_document_corpus.apply(clean_remove_stopwords)
    new_document_corpus = new_document_corpus.apply(add_space_between_dots_and_commas)

    return new_document_corpus


def generate_graph(similarity_matrix: pd.DataFrame, graph: nx.Graph, root_word: str,
                   top_words: int, threshold: np.float64 = 0.8, deep_level: int = 0,
                   max_deep_level: int = 2, deep_map: dict = None, color_map: dict = None) -> nx.Graph:
    """
        Generates d3 graph using the inserted keywords and their top words from similarity matrix
    Args:
        similarity_matrix: Dataframe with word similarity
        graph: networkx graph
        root_word: key words
        top_words: top similar words from inserted keywords
        threshold: minimum percentage of similarity
        deep_level: the level of generating leaf
        max_deep_level: the maximum number of generated leaf
        deep_map: dictionary of the words and their level of similarity
        color_map: the color of each level of words' similarity

    Returns: a d3 graph with title and root of key word and their similarity words
    """
    if root_word not in deep_map.keys():
        deep_map[root_word] = (deep_level, color_map[deep_level])
    elif deep_map[root_word][0] > deep_level:
        deep_map[root_word] = (deep_level, color_map[deep_level])
    if deep_level > max_deep_level:
        return graph
    new_nodes = similarity_matrix[root_word].sort_values(ascending=False)[:top_words].index.to_list()
    new_nodes_weight = list(similarity_matrix[root_word].sort_values(ascending=False)[:top_words].values)
    for index in range(0, len(new_nodes)):
        if new_nodes_weight[index] >= threshold:
            graph.add_edge(root_word, new_nodes[index])
            generate_graph(similarity_matrix, graph, new_nodes[index], top_words, threshold, deep_level + 1,
                           max_deep_level,
                           deep_map=deep_map, color_map=color_map)

    return graph


def create_graph_for_language_model_key_words(similarity_matrix: pd.DataFrame, language_model_words: list,
                                              model_name: str, column_name: str,
                                              metric_threshold: np.float64) -> d3graph:
    """
    !!! This is not reusable function. It was made for a single thing !!!

    It generates d3graph based on language model selected words and and the similarity
    matrix created with those words.
    """
    graph_folder_path = f'docs/word-similarity-web/{model_name}_graphs/{column_name}/'
    color_map = {0: '#a70000',
                 1: '#f0000',
                 2: '#ff7b7b',
                 3: '#ffbaba'}
    for index in range(0, len(language_model_words)):
        deep_map = {}
        graph = generate_graph(similarity_matrix, nx.Graph(), language_model_words[index],
                               top_words=4, threshold=metric_threshold
                               , max_deep_level=2, deep_map=deep_map, color_map=color_map)
        network_adjacency_matrix = pd.DataFrame(data=nx.adjacency_matrix(graph).todense(),
                                                index=graph.nodes(), columns=graph.nodes())
        node_color_list = [deep_map[node][0] for node in graph.nodes()]
        d3graph(network_adjacency_matrix, savepath=graph_folder_path, savename=language_model_words[index],
                node_color=node_color_list,
                width=1920, height=1080, edge_width=5,
                edge_distance=60, directed=True)


In [20]:
class LanguageModelPipeline:
    """
        This pipeline executes the steps for word2vec language training.
    """

    def __init__(self, dataset_sources: List[Tuple[IndexTabularDataSource, List[str]]], language_model_name: str):
        """
            :param dataset_sources: represents the source of the datasets.
        """
        self.dataset_sources = dataset_sources
        self.language_model_name = language_model_name
        self.documents_corpus = pd.Series()
        self.word2vec = None

    def download_datasets(self):
        """
            In this step it will download the dataset and detect selected columns.
            It can be downloaded as many datasets as there are in data source.
        """
        self.dataset_sources = [(dataset_columns, dataset_source.fetch())
                                for dataset_source, dataset_columns in self.dataset_sources]

    def extract_textual_data(self):
        """
            After downloading the datasets, the textual data will be found and and concatenated
            with executing of several steps as well. It will fill the NaN values with empty space,
            add a dot at the end of each concatenated column and reset the index.
        """
        self.documents_corpus = pd.concat([dataset[columns]
                                          .fillna(value="")
                                          .agg('. '.join, axis=1)
                                          .reset_index(drop=True)
                                           for columns, dataset in self.dataset_sources
                                           ], ignore_index=True)

    def clean_textual_data(self):
        """
            The next step is data cleaning. In this step the function "apply_cleaning_functions"
            applies the following actions:
                - clean the document from specific characters
                - delete unicode
                - removes emails and URLs and currency symbols
        """
        self.documents_corpus = apply_cleaning_functions(self.documents_corpus)

    def transform_to_spacy_doc(self):
        """
            When the document is clean, is going to be transform into spacy document
        """
        self.documents_corpus = self.documents_corpus.apply(nlp)

    def extract_features(self):
        """
            To extract the parts of speech, below it was defined classes for each token is necessary.
        """
        self.documents_corpus = pd.concat([self.documents_corpus,
                                           self.documents_corpus.apply(document_atomization_noun_phrases),
                                           self.documents_corpus.apply(lemmatize_document)]
                                          , ignore_index=True)

        self.documents_corpus = self.documents_corpus.apply(lambda x: list(map(str, x)))

    def model_training(self):
        """
            When the data is prepared it's stored into Word2Vec model.
        """
        self.word2vec = Word2Vec(sentences=self.documents_corpus, window=WINDOW,
                                 min_count=MIN_COUNT, size=VECTOR_SIZE)

    def save_language_model(self):
        """
            Saves trained model in MinIO
        """
        minio = store_registry.minio_object_store('mdl-language')
        minio.put_object('word2vec/' + self.language_model_name, pickle.dumps(self.word2vec))

    def execute(self):
        """
            The final step is execution, where are stored each step and it will be executed in a row
        """
        self.download_datasets()
        self.extract_textual_data()
        self.clean_textual_data()
        self.transform_to_spacy_doc()
        self.extract_features()
        self.model_training()
        self.save_language_model()


class LanguageModelWordsFilter:
    def __init__(self, word2vec_model: Word2Vec, key_words: List[str], pos: List[str]) -> None:
        self.word2vec_model = word2vec_model
        self.key_words = key_words
        self.pos = pos
        self.word2vec_document = None
        self.word2vec_document = nlp(' '.join(self.word2vec_model.wv.index2word))
        self.word2vec_document = select_pos(self.word2vec_document, self.pos)
        self._extract_pos = list(map(str, self.word2vec_document))

    def extract_pos(self) -> List[str]:
        """
            transforms a word2vec indexes into spacy document and selects parts of
            speech. After that it puts into a list and converts those parts of speech
            into strings.
        """
        return self._extract_pos

    def select_key_words(self) -> List[str]:
        """
            Finds each word form inserted list of key words and returns a
            list with those words if there are presented in the list of
            extracted parts of speech.
        """
        return [word for word in self.key_words if word in self._extract_pos]

    def select_pos_index(self) -> List[int]:
        """
            Detects the part of speech indexes and returns them into a list
        """
        return [self.word2vec_model.wv.index2word.index(token) for token in self._extract_pos
                if token in self.word2vec_model.wv.index2word]

    def select_pos_embeddings(self) -> List[np.ndarray]:
        """
            Detects part of speech embeddings from their indexes
        """
        selected_pos_index = self.select_pos_index()
        return [self.word2vec_model.wv.vectors[index] for index in selected_pos_index]


def train_language_model(dataset_sources_config: List[tuple], key_words: List[str], language_model_name: str):
    """
        1. creates word2vec model with LanguageModelPipeline
        2. filters nouns and adjectives
     Args:
        dataset_sources_config: the dataset and his textual data columns
        language_model_name: the name of the language model, to be saved in MinIO
        key_words: the list of keywords
    Returns: Amount of filtered words
    """
    model_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config,
                                                          language_model_name=language_model_name)
    model_language_model_pipeline.execute()

    return model_language_model_pipeline.word2vec


def filter_language_model_words(language_model: train_language_model, key_words: List[str]):
    return LanguageModelWordsFilter(language_model, key_words, pos=['NOUN', 'ADJ'])

    # return LanguageModelWordsFilter(model_language_model_pipeline.word2vec, key_words, pos=['NOUN', 'ADJ'])


def train_similarity_matrices(model_name: str, language_model: filter_language_model_words):
    """
        Generates similarity matrix using extracts pos and pos embeddings
    Args:
        model_name: the name of the model that will be saved on the server
        language_model: trained word2vec model
    """
    similarity_functions = [cosine_similarity, euclidean_similarity, manhattan_similarity]
    for index in range(len(similarity_functions)):
        print('Start computing similarity matrix.')
        model_similarity_matrix = build_similarity_matrix(language_model.select_pos_embeddings(),
                                                          language_model.extract_pos(),
                                                          metric=similarity_functions[index])
        print('Finish computing similarity matrix.')
        print('Save similarity matrix.')
        store_registry.minio_object_store('semantic-similarity-matrices').put_object(
            f'{model_name}_{similarity_functions[index].__name__}_matrix.json',
            model_similarity_matrix.to_json(orient='columns'))

In [8]:
class ComplexPipeline:
    def __init__(self, language_model_file_name: str, model_name: str):

        self.language_model_file_name = language_model_file_name
        self.model_name = model_name
        self.word2vec = None

    def train_language_model(self, dataset_sources_config: List[tuple]):
        model_language_model_pipeline = LanguageModelPipeline(dataset_sources=dataset_sources_config,
                                                              language_model_name=self.language_model_file_name)
        model_language_model_pipeline.execute()
        self.word2vec = model_language_model_pipeline.word2vec

    def filter_language_model_words(self, key_words: List[str]):
        if self.word2vec:
            return LanguageModelWordsFilter(self.word2vec, key_words, pos=['NOUN', 'ADJ'])
        else:
            return None

    def train_similarity_matrices(self, key_words: List[str]):
        similarity_functions = ['cosine', 'euclidean', 'hamming']
        for index in range(len(similarity_functions)):
            print('Start computing similarity matrix.')
            model_similarity_matrix = build_similarity_matrix(
                self.filter_language_model_words(key_words=key_words).select_pos_embeddings(),
                self.filter_language_model_words(key_words=key_words).extract_pos(),
                metric=similarity_functions[index])
            print('Finish computing similarity matrix.')
            print('Save similarity matrix.')
            store_registry.minio_object_store('semantic-similarity-matrices').put_object(
                f'{self.model_name}_{similarity_functions[index]}_matrix.json',
                model_similarity_matrix.to_json(orient='columns'))

## Experiment Nr#1 language model based on:
- PWDB
- eu-timeline
- ireland-timeline


## Experiment Nr#2 language model based on:
- eu-cellar

## Experiment Nr#3 language model based on:
- PWDB
- eu-timeline
- ireland-timeline
- eu-cellar


In [6]:
model1_dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]

model2_dataset_sources_config = [
    (Dataset.EU_CELLAR_ENRICHED, DEFAULT_TEXTUAL_COLUMN),
]

model3_dataset_sources_config = [
    (Dataset.PWDB, PWDB_TEXTUAL_CLASS),
    (Dataset.EU_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.EU_CELLAR_ENRICHED, DEFAULT_TEXTUAL_COLUMN),
    (Dataset.IRELAND_ACTION_TIMELINE, DEFAULT_TEXTUAL_COLUMN)
]

### Train language model and execute similarity matrices

In [12]:
comp_pipeline = ComplexPipeline(language_model_file_name=MODEL2_FILE_NAME, model_name=NR2_MODEL_NAME)

comp_pipeline.train_language_model(model2_dataset_sources_config)

In [22]:
% % time
comp_pipeline.train_similarity_matrices(key_words=CATEGORY)

Start computing similarity matrix.
Finish computing similarity matrix.
Save similarity matrix.
Start computing similarity matrix.
Finish computing similarity matrix.
Save similarity matrix.
Start computing similarity matrix.
Finish computing similarity matrix.
Save similarity matrix.
CPU times: user 15 s, sys: 1.23 s, total: 16.2 s
Wall time: 16.5 s


In [None]:
% % time
# !!! Below three cells will be executing a good period of time. If for any reason you run it,
# take your time, go and make some food, go for a walk maybe. !!!

# execution time: 1h 23m
model1_word2vec = train_language_model(model1_dataset_sources_config, CATEGORY, MODEL1_FILE_NAME)
# model1_similarity_matrix = train_similarity_matrices(NR1_MODEL_NAME, model1_word2vec)

In [67]:
% % time
# execution time: 3m
# model2 = execute_model_steps(NR2_MODEL_NAME, model2_dataset_sources_config)
model2_word2vec = train_language_model(model2_dataset_sources_config, MODEL2_FILE_NAME)
model2_similarity_matrix = train_similarity_matrices(NR2_MODEL_NAME, model2_word2vec)

Start computing similarity matrix.
Finish computing similarity matrix.
Save similarity matrix.
Create d3Graphs
CPU times: user 3min 20s, sys: 787 ms, total: 3min 21s
Wall time: 3min 20s


In [None]:
% % time
# execution time: 1h 33m
model3_word2vec = train_language_model(model3_dataset_sources_config, MODEL3_FILE_NAME)
model3_similarity_matrix = train_similarity_matrices(NR3_MODEL_NAME, model3_word2vec)

### Generate D3 Graphs

In [None]:
model1_cosine_matrix = store_registry.minio_object_store('semantic-similarity-matrices').get_object(
    'model1_cosine_similarity_matrix.json')

model1_language_model = store_registry.minio_object_store('mdl-language').get_object(
    'word2vec/model1_language_model.model')

model1_graphs = create_graph_for_language_model_key_words(pd.read_json(model1_cosine_matrix),
                                                          model1_word2vec.select_key_words(),
                                                          model_name=NR1_MODEL_NAME, column_name='category',
                                                          metric_threshold=0.4)

In [38]:
def plot_graphs(pipeline: ComplexPipeline, model_name: str, model_file_name: str, threshold: np.float64,
                word_graph_configs: dict,
                normalize_func):
    model_cosine_matrix = store_registry.minio_object_store('semantic-similarity-matrices').get_object(model_file_name)
    for key in word_graph_configs.keys():
        create_graph_for_language_model_key_words(pd.read_json(model_cosine_matrix).applymap(normalize_func),
                                                  pipeline.filter_language_model_words(
                                                      key_words=word_graph_configs[key]).select_key_words(),
                                                  model_name=model_name, metric_threshold=threshold,
                                                  column_name=key)

In [40]:
plot_graphs(pipeline = comp_pipeline,
            model_name = NR2_MODEL_NAME,
            model_file_name= 'model2_cosine_matrix.json',
            threshold=0.6,
            word_graph_configs=WORD_GRAPH_CONFIGS,
            normalize_func= lambda x: 1-x)

Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/labour.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/recovery.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/economic.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/adaptation.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/businesses.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/protection.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/essential.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/workers.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/services.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/social.html
Writing /home/jovy

In [35]:
model2_cosine_matrix = store_registry.minio_object_store('semantic-similarity-matrices').get_object(
    'model2_cosine_matrix.json')

model2_graphs = create_graph_for_language_model_key_words(pd.read_json(model2_cosine_matrix).applymap(lambda x: 1 - x),
                                                          comp_pipeline.filter_language_model_words(
                                                              key_words=CATEGORY).select_key_words(),
                                                          model_name=NR2_MODEL_NAME, metric_threshold=0.6,
                                                          column_name='category')

Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/labour.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/recovery.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/economic.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/adaptation.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/businesses.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/protection.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/essential.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/workers.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/services.html
Writing /home/jovyan/work/sem-covid/docs/word-similarity-web/model2_graphs/category/social.html
Writing /home/jovy

In [33]:
temp_df = pd.read_json(model2_cosine_matrix).head(5)

In [34]:
temp_df.applymap(lambda x: 1 - x)

Unnamed: 0,c,parliament,document,union,certain,text,economic,social,proposal,communication,...,bladder,crise,flavour,sandbox,master,exempt,disruptor,emerge,comprise,ingredient
c,1.0,-0.088294,-0.027274,-0.210632,-0.097438,0.567301,0.092788,-0.017132,0.026077,0.010648,...,0.34927,0.34554,-0.054862,0.126926,0.08046,-0.246717,-0.06788,-0.227131,0.157701,0.320721
parliament,-0.088294,1.0,0.434603,0.650382,0.181362,0.226026,0.679419,0.616389,0.742833,0.614692,...,0.040775,0.064712,-0.009583,-0.18356,0.077561,0.142433,-0.297098,-0.053682,-0.05112,-0.005201
document,-0.027274,0.434603,1.0,0.228784,0.09602,0.186051,0.578235,0.49443,0.497438,0.808421,...,0.262805,0.290013,-0.33362,0.170464,0.567621,-0.217398,0.120096,-0.36612,0.074215,0.318565
union,-0.210632,0.650382,0.228784,1.0,0.231797,0.009934,0.58436,0.610301,0.511615,0.420261,...,0.136103,0.189605,0.151163,-0.015864,0.085663,-0.092072,-0.1783,0.051018,0.140259,-0.061578
certain,-0.097438,0.181362,0.09602,0.231797,1.0,0.229572,0.184433,0.264371,0.049539,0.128032,...,0.647518,0.624609,0.469108,0.38841,0.465699,0.413367,0.574125,0.616874,0.424554,0.584702


In [None]:
model3_cosine_matrix = store_registry.minio_object_store('semantic-similarity-matrices').get_object(
    'model3_cosine_similarity_matrix.json')

model3_graphs = create_graph_for_language_model_key_words(pd.read_json(model3_cosine_matrix),
                                                          model3_word2vec.select_key_words(),
                                                          model_name=NR3_MODEL_NAME, metric_threshold=0.4)

In [5]:
model2_cosine_matrix = store_registry.minio_object_store('semantic-similarity-matrices').get_object(
    'model2_cosine_similarity_matrix.json')


In [6]:
tmp_df = pd.read_json(model2_cosine_matrix)

In [29]:
tmp_df = tmp_df.where(np.triu(np.ones(tmp_df.shape)).astype(np.bool))


In [31]:
tmp = tmp_df.stack()

In [32]:
indexes = tmp.index.to_flat_index()
values = tmp.values

In [33]:
data = [indexes[index] + (values[index],) for index in range(0, len(indexes))]

In [35]:
data_updated = [d for d in data if d[2] > 0.4]

In [34]:
len(data)

9303141

In [36]:
len(data_updated)

6379797

In [37]:
data_updated[:100]

[('c', 'c', 1.0),
 ('c', 'case', 0.5449162722),
 ('c', 'text', 0.5692722201),
 ('c', 'relevance', 0.6160866022),
 ('c', 'request', 0.4078985453),
 ('c', 'court', 0.45723506810000003),
 ('c', 't', 0.5243614912),
 ('c', 'p.', 0.6480842233),
 ('c', 'judgment', 0.4635605216),
 ('c', 'final', 0.4800850451),
 ('c', 'raise', 0.6436561942),
 ('c', 'eea_relevance', 0.6934002638),
 ('c', 'concentration', 0.5851275921),
 ('c', 'stability', 0.5182048678),
 ('c', 'i/01', 0.7251908183),
 ('c', 'obligations', 0.5764827728),
 ('c', 'objections', 0.5652606487),
 ('c', 'gmbh', 0.4314999282),
 ('c', 'notification', 0.4941498339),
 ('c', 'authentic', 0.6516040564000001),
 ('c', 'management', 0.42047229410000003),
 ('c', 'p', 0.47767692800000006),
 ('c', 'objection', 0.5742188096),
 ('c', 'president', 0.4273077846),
 ('c', 'ad', 0.6125755310000001),
 ('c', 'candidate', 0.843029201),
 ('c', 'opposition', 0.4285845459),
 ('c', 'e', 0.6431210041000001),
 ('c', 'council_opinion', 0.4901045859),
 ('c', 'group',

In [68]:
from sklearn.metrics.pairwise import pairwise_distances

X = [[1, 1], [2, 6], [1, 1]]
Y = [[1, 1], [6, 8], [1, 2]]
pairwise_distances(X, metric=cosine_similarity)

array([[1.        , 0.89442719, 1.        ],
       [0.89442719, 1.        , 0.89442719],
       [1.        , 0.89442719, 1.        ]])

In [14]:
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

In [12]:
X = np.random.rand(20000, 2)
Y = np.random.rand(20000)

In [15]:
% % time
t = pdist(X, metric='cosine')
df = pd.DataFrame(squareform(t), columns=Y, index=Y)

CPU times: user 1.94 s, sys: 768 ms, total: 2.71 s
Wall time: 2.71 s


In [56]:
X

array([[0.97110749, 0.07245815],
       [0.85690828, 0.32851239],
       [0.12933652, 0.39534397],
       ...,
       [0.30519707, 0.03996792],
       [0.62630627, 0.10839572],
       [0.44943443, 0.58265998]])

In [65]:
% % time
t = squareform(pdist(X, metric=cosine_similarity))


CPU times: user 3.83 s, sys: 8.22 ms, total: 3.84 s
Wall time: 3.83 s


In [11]:
from itertools import combinations

In [14]:
% % time
comb = list(combinations(Y, 2))

CPU times: user 2.36 s, sys: 612 ms, total: 2.97 s
Wall time: 2.96 s


In [15]:
comb[:10]

[(0.7758772138103938, 0.4581528241795998),
 (0.7758772138103938, 0.7556128188929753),
 (0.7758772138103938, 0.2903824491529986),
 (0.7758772138103938, 0.25190906349060127),
 (0.7758772138103938, 0.09149688546569823),
 (0.7758772138103938, 0.8717632183754248),
 (0.7758772138103938, 0.9347832066775141),
 (0.7758772138103938, 0.4876551931324218),
 (0.7758772138103938, 0.34752915380001737),
 (0.7758772138103938, 0.10621645458722284)]

In [16]:
% % time
indexes = [(Y[i], Y[j]) for i in range(0, len(Y)) for j in range(i + 1, len(Y))]

CPU times: user 8.73 s, sys: 1.26 s, total: 9.99 s
Wall time: 9.99 s


In [None]:
t = pdist(X, metric='cosine')

In [30]:
% % time
#indexes = list(combinations(Y,2))
t = pdist(X, metric='cosine')
df = pd.DataFrame(squareform(t), columns=Y, index=Y)
#data = [indexes[index] + (t[index],) for index in range(0,len(t)) if t[index]>0.4]

CPU times: user 499 ms, sys: 152 ms, total: 652 ms
Wall time: 651 ms


In [8]:
data[:100]

[(0.0937090375708306, 0.35141603903249585, 0.7367782219092458),
 (0.0937090375708306, 0.2693672166378879, 0.7253226718686918),
 (0.0937090375708306, 0.6584325242242225, 0.5764058723983649),
 (0.0937090375708306, 0.7465246584985801, 0.7111720596540034),
 (0.0937090375708306, 0.11147079903121071, 0.7160456618923694),
 (0.0937090375708306, 0.5574183124925692, 0.6100590615603838),
 (0.0937090375708306, 0.26776601564358826, 0.5629955186610089),
 (0.0937090375708306, 0.5314425438770268, 0.5032685757247006),
 (0.0937090375708306, 0.8850698657698504, 0.5619252075068168),
 (0.0937090375708306, 0.0931728509723988, 0.5603959774335598),
 (0.0937090375708306, 0.3783660393765643, 0.702514022029491),
 (0.0937090375708306, 0.17301578127648598, 0.46589331284878976),
 (0.0937090375708306, 0.218771250375704, 0.5292387176022288),
 (0.0937090375708306, 0.020460847024664663, 0.43873327218443803),
 (0.0937090375708306, 0.01590531508609827, 0.4097141308133482),
 (0.0937090375708306, 0.8664807937459708, 0.5692

In [82]:
% % time
t = pdist(X, metric='cosine')

CPU times: user 878 µs, sys: 2.57 ms, total: 3.45 ms
Wall time: 2.85 ms


In [74]:
% % time
t = pairwise_distances(X, metric=cosine_similarity)

CPU times: user 3.77 s, sys: 7.99 ms, total: 3.78 s
Wall time: 3.78 s


In [79]:
% % time
t = pairwise_distances(X, metric='cosine')

CPU times: user 693 ms, sys: 3.97 s, total: 4.66 s
Wall time: 552 ms


In [78]:
% % time
t = pairwise_distances(X, metric='cosine', n_jobs=-1)

CPU times: user 2.62 s, sys: 7.9 s, total: 10.5 s
Wall time: 787 ms
