## Import libraries

In [2]:
import os
import sys

# Location of the sem-covid checkout that this notebook adds to sys.path and
# switches into. It was previously written out twice as a literal.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Make the project importable, then drop duplicate entries while KEEPING the
# existing search order. The earlier list(set(sys.path)) round-trip went through
# an unordered set, which shuffled sys.path and made import resolution order
# arbitrary from run to run. dict.fromkeys de-duplicates in insertion order.
sys.path.append(PROJECT_ROOT)
sys.path = list(dict.fromkeys(sys.path))

# Run the rest of the notebook from the project root.
# NOTE(review): presumably needed so sem_covid resolves relative paths - confirm.
os.chdir(PROJECT_ROOT)
from sem_covid import config
from sem_covid.services.model_registry import embedding_registry
from sem_covid.services.store_registry import store_registry
import numpy as np
from sem_covid.services.semantic_similarity_pipelines.document_embedding_pipeline import DocumentEmbeddingPipeline
from sem_covid.services.semantic_similarity_pipelines.document_similarity_pipeline import DocumentSimilarityPipeline
from sem_covid.services.semantic_similarity_pipelines.semantic_similarity_sampling_pipeline import (
    SemanticSimilaritySamplingPipeline)

2021-08-16 12:57:42.651600: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-16 12:57:42.651627: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Define constants

In [3]:
# Textual columns used for the PWDB index (see DOCUMENTS_CONFIGS below).
TEXTUAL_COLUMNS = [
    'title',
    'background_info_description',
    'content_of_measure_description',
    'use_of_measure_description',
    'involvement_of_social_partners_description',
]

# Name passed to the pipelines in this notebook as the document-embedding
# feature store / embeddings index.
DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME = 'fs_doc_emb_tfidf'

# Elasticsearch index name -> textual columns handed to the embedding pipeline
# for that index. Insertion order is the order in which the indices are processed.
DOCUMENTS_CONFIGS = {
    config.IRELAND_TIMELINE_ELASTIC_SEARCH_INDEX_NAME: ['title', 'content'],
    config.EU_TIMELINE_ELASTIC_SEARCH_INDEX_NAME: ['title', 'abstract', 'detail_content'],
    config.EU_CELLAR_ELASTIC_SEARCH_INDEX_NAME: ['title', 'content'],
    config.PWDB_ELASTIC_SEARCH_INDEX_NAME: TEXTUAL_COLUMNS,
}

# Inputs of the sampling pipeline at the end of the notebook: the semantic
# similarity index to read, the metric column to use, and the index that
# receives the sample.
SM_EU_CELLAR_X_PWDB = 'sm_ds_eu_cellar_x_ds_pwdb_tfidfembeddingmodel_cosine_similarity'
METRIC_COLUMN_NAME = 'cosine_similarity'
SAMPLE_INDEX_NAME = 'sm_eu_cellar_x_pwdb_sample_tf_idf_emb_cosine'

## Define similarity functions

In [None]:
def cosine_similarity(u, v):
    """Return the cosine similarity between the vectors ``u`` and ``v``.

    The value is ``dot(u, v) / (norm(u) * norm(v))``.

    Args:
        u: First vector (array-like accepted by NumPy).
        v: Second vector, same length as ``u``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm.
        In that case the ratio is undefined; the unguarded division used to
        produce ``nan`` together with a NumPy RuntimeWarning.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    # Guard against 0/0 for all-zero vectors.
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product

## Execute document embedding pipeline for each configuration

In [None]:
# Run the embedding pipeline once per configured Elasticsearch index.
# A new embedding model is requested from the registry on every iteration.
for es_index, text_columns in DOCUMENTS_CONFIGS.items():
    DocumentEmbeddingPipeline(
        es_index_name=es_index,
        textual_columns=text_columns,
        embedding_model=embedding_registry.sent2vec_tfidf_avg(),
        embedding_model_name='TfIdfEmbeddingModel',
        store_registry=store_registry,
        doc_emb_feature_store_name=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME,
    ).execute()


## Execute document similarity pipeline

In [None]:

# Arguments for the similarity pipeline: it reads the document embeddings
# stored above and is given the notebook's cosine_similarity function as metric.
similarity_pipeline_args = dict(
    document_embeddings_index=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME,
    similarity_metric=cosine_similarity,
    similarity_metric_name='cosine_similarity',
    store_registry=store_registry,
)
DocumentSimilarityPipeline(**similarity_pipeline_args).execute()

## Execute document semantic similarity sampling pipeline

In [None]:
# Arguments for the sampling pipeline: source similarity index, embeddings
# feature store, destination sample index, requested sample size and the
# metric column to use.
sampling_pipeline_args = dict(
    semantic_similarity_index_name=SM_EU_CELLAR_X_PWDB,
    doc_emb_feature_store_name=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME,
    sample_index_name=SAMPLE_INDEX_NAME,
    sample_size=10000,
    metric_column_name=METRIC_COLUMN_NAME,
    store_registry=store_registry,
)
SemanticSimilaritySamplingPipeline(**sampling_pipeline_args).execute()