In [1]:
import sys

sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))
import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')
from sem_covid import config
from sem_covid.services.model_registry import embedding_registry
from pathlib import Path
from sem_covid.services.store_registry import store_registry
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from sem_covid.services.semantic_similarity_pipelines.document_embedding_pipeline import DocumentEmbeddingPipeline

In [2]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))


def similarity_func(u, v):
    return 1 / (1 + euclidean(u, v))


In [3]:
TEXTUAL_COLUMNS = ['title', 'background_info_description', 'content_of_measure_description',
                   'use_of_measure_description', 'involvement_of_social_partners_description']

DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME = 'fs_doc_emb_tfidf'

DOCUMENTS_CONFIGS = {config.IRELAND_TIMELINE_ELASTIC_SEARCH_INDEX_NAME: ['title', 'content'],
                     config.EU_TIMELINE_ELASTIC_SEARCH_INDEX_NAME: ['title', 'abstract', 'detail_content'],
                     config.EU_CELLAR_ELASTIC_SEARCH_INDEX_NAME: ['title', 'content'],
                     config.PWDB_ELASTIC_SEARCH_INDEX_NAME: TEXTUAL_COLUMNS
                     }

In [16]:
for config_key in DOCUMENTS_CONFIGS.keys():
    DocumentEmbeddingPipeline(es_index_name=config_key,
                              textual_columns=DOCUMENTS_CONFIGS[config_key],
                              embedding_model=embedding_registry.sent2vec_tfidf_avg(),
                              embedding_model_name='TfIdfEmbeddingModel',
                              store_registry=store_registry,
                              doc_emb_feature_store_name=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME
                              ).execute()



100% (410 of 410) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (171 of 171) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (2818 of 2818) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1288 of 1288) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
 74% (962 of 1288) |###############      | Elapsed Time: 0:00:00 ETA:  00:00:00

In [4]:
import concurrent.futures
import hashlib

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances
import seaborn as sns
from pathlib import Path
from sem_covid.services.semantic_similarity_pipelines.document_embedding_pipeline import DOCUMENT_EMBEDDING,

DOCUMENT_EMBEDDING_METHOD
from sem_covid.services.store_registry import StoreRegistryABC

DOCUMENT_NAME_X = 'name_x'
DOCUMENT_NAME_Y = 'name_y'
SIMILARITY_MATRIX = 'similarity_matrix'
SIMILARITY_LIST = 'similarity_list'
DOCUMENT_ID = 'document_id'
SIMILARITY_METRIC = 'similarity_metric'
SIMILARITY_METRIC_VALUE = 'similarity_metric_value'
DOCUMENT_EMBEDDING_METHOD_NOT_FOUND = 'not_found_embedding_method'

In [5]:
class DocumentSimilarityPipeline:

    def __init__(self, document_embeddings_index: str, similarity_metric,
                 similarity_metric_name: str,
                 store_registry: StoreRegistryABC,
                 figures_path: Path = None
                 ):
        self.document_embeddings_index = document_embeddings_index
        self.similarity_metric = similarity_metric
        self.prepared_data = None
        self.document_embeddings = {}
        self.dataset_names = []
        self.figures_path = figures_path
        self.store_registry = store_registry
        self.similarity_metric_name = similarity_metric_name
        self.document_embeddings_method = DOCUMENT_EMBEDDING_METHOD_NOT_FOUND
        self.dataset = pd.DataFrame()

    def load_document_embeddings(self):
        es_index_store = self.store_registry.es_index_store()
        self.dataset = es_index_store.get_dataframe(index_name=self.document_embeddings_index)
        self.dataset_names = list(set(self.dataset.source.values))
        self.document_embeddings = {dataset_name: self.dataset[self.dataset.source == dataset_name]
                                    for dataset_name in self.dataset_names}

    def prepare_similarity_data(self):
        def prepare_worker(name_x: str, name_y: str):
            similarity_matrix = pd.DataFrame(
                pairwise_distances(X=self.document_embeddings[name_x][DOCUMENT_EMBEDDING].to_list(),
                                   Y=self.document_embeddings[name_y][DOCUMENT_EMBEDDING].to_list(),
                                   metric=self.similarity_metric),
                columns=self.document_embeddings[name_y].index.to_list(),
                index=self.document_embeddings[name_x].index.to_list()
            )
            similarity_matrix_values = similarity_matrix.values
            similarity_list = [similarity_matrix_values[row][col]
                               for row in range(0, similarity_matrix_values.shape[0])
                               for col in range(row + 1, similarity_matrix_values.shape[1])]

            return {DOCUMENT_NAME_X: name_x, DOCUMENT_NAME_Y: name_y,
                    SIMILARITY_MATRIX: similarity_matrix,
                    SIMILARITY_LIST: similarity_list}

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(prepare_worker, name_x, name_y)
                       for name_x in self.dataset_names
                       for name_y in self.dataset_names[self.dataset_names.index(name_x):]]
            self.prepared_data = [future.result() for future in futures]

    def save_similarity_matrix(self):
        minio_feature_store = self.store_registry.minio_feature_store()
        self.document_embeddings_method = list(self.document_embeddings.values())[0][DOCUMENT_EMBEDDING_METHOD][0]
        for data in self.prepared_data:
            similarity_documents_name = f"{data[DOCUMENT_NAME_X]}_X_{data[DOCUMENT_NAME_Y]}"
            similarity_feature_name = "_".join(["sm",
                                                similarity_documents_name,
                                                self.document_embeddings_method,
                                                self.similarity_metric_name]
                                               )
            minio_feature_store.put_features(features_name=similarity_feature_name,
                                             content=data[SIMILARITY_MATRIX]
                                             )

    def save_similarity_pairs(self):

        def generate_new_row(row_index, dataframe, column_suffix):
            new_row = pd.Series(dataframe.loc[row_index])
            new_row[DOCUMENT_ID] = new_row.name
            new_row.index = list(map(lambda x: x + column_suffix, new_row.index))
            return new_row

        def combine_two_rows(left_row: pd.Series, right_row: pd.Series, similarity_metric: str,
                             similarity_metric_value: float) -> pd.Series:
            new_combined_row = left_row.append(right_row)
            new_combined_row[SIMILARITY_METRIC] = similarity_metric
            new_combined_row[SIMILARITY_METRIC_VALUE] = similarity_metric_value
            new_combined_row.name = hashlib.sha256(
                (str(left_row.name) + str(right_row.name)).encode('utf-8')).hexdigest()
            return new_combined_row

        es_index_store = self.store_registry.es_index_store()

        for data in self.prepared_data:
            sim_matrix = data[SIMILARITY_MATRIX]
            sim_pairs_list = [combine_two_rows(generate_new_row(row_index_left, self.dataset, '_left'),
                                               generate_new_row(row_index_right, self.dataset, '_right'),
                                               self.similarity_metric_name,
                                               sim_matrix.loc[row_index_left][row_index_right])
                              for row_index_left in sim_matrix.index
                              for row_index_right in sim_matrix.columns]
            similarity_pairs_df = pd.DataFrame(sim_pairs_list)
            es_index_store.put_dataframe(index_name=f"sm_{data[DOCUMENT_NAME_X]}_X_{data[DOCUMENT_NAME_Y]}",
                                         content=similarity_pairs_df)

    def plot_histograms(self):
        if self.figures_path:
            plt.subplots(figsize=(10, 5))
            for data in self.prepared_data:
                plot_title = f"sm_{data[DOCUMENT_NAME_X]}_X_{data[DOCUMENT_NAME_Y]}_{self.document_embeddings_method}"
                plot = sns.histplot(data=data[SIMILARITY_LIST]).set_title(plot_title)
                plot.figure.savefig(self.figures_path / (plot_title + '.png'))
                plot.figure.clf()

    def execute(self):
        self.load_document_embeddings()
        self.prepare_similarity_data()
        self.save_similarity_matrix()
        self.save_similarity_pairs()
        self.plot_histograms()

In [6]:
doc_sim = DocumentSimilarityPipeline(document_embeddings_index=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME,
                                     similarity_metric=cosine,
                                     similarity_metric_name='cosine_similarity',
                                     store_registry=store_registry
                                     )

In [7]:
doc_sim.load_document_embeddings()

100% (2490 of 2490) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [8]:
doc_sim.prepare_similarity_data()

In [24]:
def save_similarity_pairs(self):
    def generate_new_row(row_index, dataframe, column_suffix):
        new_row = pd.Series()
        new_row.name = dataframe.loc[row_index].name
        new_row[DOCUMENT_ID] = new_row.name
        new_row.index = list(map(lambda x: x + column_suffix, new_row.index))
        return new_row

    def combine_two_rows(left_row: pd.Series, right_row: pd.Series, similarity_metric: str,
                         similarity_metric_value: float) -> pd.Series:
        new_combined_row = left_row.append(right_row)
        new_combined_row[SIMILARITY_METRIC] = similarity_metric
        new_combined_row[SIMILARITY_METRIC_VALUE] = similarity_metric_value
        new_combined_row.name = hashlib.sha256(
            (str(left_row.name) + str(right_row.name)).encode('utf-8')).hexdigest()
        return new_combined_row

    #es_index_store = self.store_registry.es_index_store()

    def similarity_pairs_worker(sim_matrix: pd.DataFrame,doc_name_1,doc_name_2) -> pd.DataFrame:
        print(f'similarity start {doc_name_1}X{doc_name_2}')
        sim_pairs_list = [combine_two_rows(generate_new_row(row_index_left, self.dataset, '_left'),
                                           generate_new_row(row_index_right, self.dataset, '_right'),
                                           self.similarity_metric_name,
                                           sim_matrix.loc[row_index_left][row_index_right])
                          for row_index_left in sim_matrix.index
                          for row_index_right in sim_matrix.columns]
        print(f'similarity finish {doc_name_1}X{doc_name_2}')
        return pd.DataFrame(sim_pairs_list)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(similarity_pairs_worker, data[SIMILARITY_MATRIX],
                                   data[DOCUMENT_NAME_X],
                                   data[DOCUMENT_NAME_Y])
                   for data in self.prepared_data]
        return [future.result() for future in futures]

  new_row = pd.Series()


In [None]:
results = save_similarity_pairs(doc_sim)


similarity start ds_pwdbXds_pwdb
similarity start ds_pwdbXds_eu_cellar
similarity start ds_pwdbXds_ireland_timeline
similarity start ds_pwdbXds_eu_timeline
similarity start ds_eu_cellarXds_eu_cellar
similarity start ds_eu_cellarXds_ireland_timeline
similarity start ds_eu_cellarXds_eu_timeline
similarity start ds_ireland_timelineXds_ireland_timeline
similarity start ds_ireland_timelineXds_eu_timeline
similarity start ds_eu_timelineXds_eu_timeline


  new_row = pd.Series()


similarity pairs compute finish!
similarity finish ds_eu_timelineXds_eu_timeline
similarity pairs compute finish!
similarity finish ds_ireland_timelineXds_eu_timeline
similarity finish ds_eu_cellarXds_eu_timeline
similarity pairs compute finish!
similarity pairs compute finish!
similarity finish ds_ireland_timelineXds_ireland_timeline
similarity finish ds_pwdbXds_eu_timeline
similarity pairs compute finish!


In [32]:
doc_sim.save_similarity_matrix()

TypeError: a bytes-like object is required, not 'DataFrame'

In [None]:
doc_sim.save_similarity_pairs()

In [None]:
doc_sim.plot_histograms()