In [5]:
import os
import sys

# Root of the sem-covid checkout inside the notebook container.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Make the project package importable from this notebook.
sys.path.append(PROJECT_ROOT)
# Drop duplicate entries (e.g. from re-running this cell) while KEEPING the
# original search order; a plain set() would shuffle the entries and could
# change which module wins when two entries provide the same name.
sys.path = list(dict.fromkeys(sys.path))

# The project modules imported below are loaded with the repository root as
# the working directory.
os.chdir(PROJECT_ROOT)
from sem_covid import config
from sem_covid.services.model_registry import embedding_registry
from pathlib import Path
from sem_covid.services.store_registry import store_registry
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from sem_covid.services.semantic_similarity_pipelines.document_embedding_pipeline import DocumentEmbeddingPipeline
from sem_covid.services.semantic_similarity_pipelines.document_similarity_pipeline import DocumentSimilarityPipeline

In [1]:
def cosine(u, v):
    """Return the cosine similarity between two vectors.

    :param u: first vector (any 1-D array-like accepted by numpy).
    :param v: second vector, same length as ``u``.
    :return: ``dot(u, v) / (|u| * |v|)``. When either vector has zero norm
        the ratio is undefined (0/0 would give NaN and a RuntimeWarning), so
        0.0 is returned instead to keep downstream sorting free of NaN values.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product


def similarity_func(u, v):
    """Map the Euclidean distance between ``u`` and ``v`` to a similarity score.

    Identical vectors (distance 0) score 1; the score decreases towards 0 as
    the distance grows.
    """
    distance = euclidean(u, v)
    denominator = 1 + distance
    return 1 / denominator


In [3]:
# Text columns used for the PWDB index (see DOCUMENTS_CONFIGS below).
TEXTUAL_COLUMNS = [
    'title',
    'background_info_description',
    'content_of_measure_description',
    'use_of_measure_description',
    'involvement_of_social_partners_description',
]

# Name under which the document embeddings are stored and later read back.
DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME = 'fs_doc_emb_tfidf'

# Index name (from the project config) -> columns handed to the embedding
# pipeline as ``textual_columns`` for that index.
DOCUMENTS_CONFIGS = {
    config.IRELAND_TIMELINE_ELASTIC_SEARCH_INDEX_NAME: ['title', 'content'],
    config.EU_TIMELINE_ELASTIC_SEARCH_INDEX_NAME: ['title', 'abstract', 'detail_content'],
    config.EU_CELLAR_ELASTIC_SEARCH_INDEX_NAME: ['title', 'content'],
    config.PWDB_ELASTIC_SEARCH_INDEX_NAME: TEXTUAL_COLUMNS,
}

In [None]:
# Run one embedding pipeline per configured index; every run writes into the
# same document-embeddings feature store.
for es_index_name, textual_columns in DOCUMENTS_CONFIGS.items():
    embedding_pipeline = DocumentEmbeddingPipeline(
        es_index_name=es_index_name,
        textual_columns=textual_columns,
        embedding_model=embedding_registry.sent2vec_tfidf_avg(),
        embedding_model_name='TfIdfEmbeddingModel',
        store_registry=store_registry,
        doc_emb_feature_store_name=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME,
    )
    embedding_pipeline.execute()


In [None]:

# Similarity pipeline over the stored document embeddings, using the
# notebook's ``cosine`` function as the metric.
doc_sim = DocumentSimilarityPipeline(
    document_embeddings_index=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME,
    similarity_metric=cosine,
    similarity_metric_name='cosine_similarity',
    store_registry=store_registry,
)

In [None]:
class SemanticSimilaritySamplingPipeline:
    """Draw an evenly spaced sample from a stored similarity table.

    The pipeline loads the similarity rows and the document embeddings frame
    from the index store, orders the similarity rows by the metric column,
    keeps every ``step``-th row (at most ``sample_size`` rows), attaches the
    text of both documents of each pair and writes the result to
    ``sample_index_name``.
    """

    def __init__(self, semantic_similarity_index_name: str,
                 doc_emb_feature_store_name: str,
                 sample_index_name: str,
                 sample_size: int,
                 metric_column_name: str,
                 ):
        """
        :param semantic_similarity_index_name: index holding the similarity rows.
        :param doc_emb_feature_store_name: index holding the document embeddings
            frame (must expose a ``text`` column).
        :param sample_index_name: index the sample is written to.
        :param sample_size: maximum number of rows in the sample (>= 1).
        :param metric_column_name: column the similarity rows are sorted by.
        """
        self.semantic_similarity_index_name = semantic_similarity_index_name
        self.doc_emb_feature_store_name = doc_emb_feature_store_name
        self.metric_column_name = metric_column_name
        self.sample_index_name = sample_index_name
        self.sample_size = sample_size

    def load_data(self):
        """Read the similarity rows and the document embeddings frame."""
        es_store = store_registry.es_index_store()
        self.sm_eu_cellar_x_pwdb_df = es_store.get_dataframe(index_name=self.semantic_similarity_index_name)
        self.doc_emb_df = es_store.get_dataframe(index_name=self.doc_emb_feature_store_name)

    def compute_sampling(self):
        """Build ``self.sm_sample`` from the loaded frames.

        :raises ValueError: if ``sample_size`` is smaller than 1.
        """
        if self.sample_size < 1:
            raise ValueError(f"sample_size must be >= 1, got {self.sample_size}")

        # Keep the attribute sorted, as callers may inspect it afterwards.
        self.sm_eu_cellar_x_pwdb_df = self.sm_eu_cellar_x_pwdb_df.sort_values(by=self.metric_column_name)
        sorted_df = self.sm_eu_cellar_x_pwdb_df

        # Stride between sampled rows. Clamp to 1 so that a frame with fewer
        # rows than sample_size is returned whole instead of failing with
        # "slice step cannot be zero".
        step = max(1, len(sorted_df) // self.sample_size)

        # .copy(): the sample gets new columns below and must not be a slice
        # of the full similarity frame.
        sample = sorted_df.iloc[:self.sample_size * step:step].copy()

        # NOTE(review): assumes the first two columns hold the ids of the left
        # and right document and that those ids are the index of the
        # embeddings frame - confirm against the similarity pipeline output.
        text_by_doc_id = self.doc_emb_df['text']
        sample['text_left'] = [text_by_doc_id.loc[doc_id] for doc_id in sample.iloc[:, 0]]
        sample['text_right'] = [text_by_doc_id.loc[doc_id] for doc_id in sample.iloc[:, 1]]
        self.sm_sample = sample

    def store_sample(self):
        """Write ``self.sm_sample`` to ``sample_index_name``."""
        es_store = store_registry.es_index_store()
        es_store.put_dataframe(index_name=self.sample_index_name, content=self.sm_sample)

    def execute(self):
        """Run load, sampling and storage in order."""
        self.load_data()
        self.compute_sampling()
        self.store_sample()

In [None]:
# Run the similarity pipeline configured in `doc_sim`.
# NOTE(review): presumably this writes the similarity index that is read back
# further down in this notebook - confirm against DocumentSimilarityPipeline.
doc_sim.execute()

In [6]:
# Index store handle used by the following cells to read (`get_dataframe`)
# and write (`put_dataframe`) dataframes.
es_store = store_registry.es_index_store()

In [8]:
# Name of the index loaded into `sm_eu_cellar_x_pwdb_df`.
# NOTE(review): presumably the EU Cellar x PWDB cosine-similarity table
# produced by the similarity pipeline - confirm the naming scheme.
SM_EU_CELLAR_X_PWDB = 'sm_ds_eu_cellar_x_ds_pwdb_tfidfembeddingmodel_cosine_similarity'

In [9]:
# Load the similarity rows into a dataframe (835912 records according to the
# progress output recorded for this cell).
sm_eu_cellar_x_pwdb_df = es_store.get_dataframe(index_name=SM_EU_CELLAR_X_PWDB)

100% (835912 of 835912) |################| Elapsed Time: 0:00:13 Time:  0:00:13


In [10]:
# Load the document embeddings frame (2490 records according to the progress
# output recorded for this cell); its `text` column is looked up by index
# label when the sample is enriched below.
doc_emb_df = es_store.get_dataframe(index_name=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME)

100% (2490 of 2490) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [13]:
# Order the pairs by cosine similarity so that a strided slice of the frame
# covers the whole range of similarity values.
sm_eu_cellar_x_pwdb_df = sm_eu_cellar_x_pwdb_df.sort_values(by='cosine_similarity')

In [67]:
import math

# Target number of rows in the similarity sample.
n = 10_000

In [68]:
# Stride between sampled rows so that about n rows are spread evenly over the
# frame. Clamp to 1: with fewer than n rows the quotient floors to 0 and a
# zero slice step raises ValueError in the slicing cell.
step = max(1, math.floor(len(sm_eu_cellar_x_pwdb_df) / n))

In [69]:
# Keep every step-th row of the sorted frame, at most n rows. Take a copy
# because new columns are assigned to the sample afterwards; assigning to a
# slice of another frame triggers SettingWithCopyWarning.
sm_slice = sm_eu_cellar_x_pwdb_df.iloc[:n * step:step].copy()

In [71]:
# Attach the text of both documents of every sampled pair.
# NOTE(review): assumes the first two columns of the row hold the ids of the
# left and right document, and that these ids are index labels of
# `doc_emb_df` - confirm against the similarity index schema.
# `.iloc` makes the positional access explicit; `row[0]` on a label-indexed
# Series relies on a deprecated integer-as-position fallback.
sm_slice['text_left'] = sm_slice.apply(lambda df_row: doc_emb_df.loc[df_row.iloc[0], 'text'], axis=1)
sm_slice['text_right'] = sm_slice.apply(lambda df_row: doc_emb_df.loc[df_row.iloc[1], 'text'], axis=1)

In [73]:
# Write the enriched sample to its own index so it can be read back without
# recomputing the sampling.
es_store.put_dataframe(index_name='sm_eu_cellar_x_pwdb_sample_tf_idf_emb_cosine',content=sm_slice)


 98% (9874 of 10000) |################## | Elapsed Time: 0:00:00 ETA:   0:00:00

10000