## Import libraries

In [76]:
import sys

# Make the project checkout importable.
# De-duplicate with dict.fromkeys, which keeps first-seen order; going through
# a set would shuffle the module search order and could let an unrelated
# module shadow the intended one.
sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(dict.fromkeys(sys.path))
import os

# Query the current working directory; the returned value is not stored or used.
os.getcwd()
# Switch the process working directory to the project checkout
# (presumably so that relative paths used by the project resolve — confirm).
os.chdir('/home/jovyan/work/sem-covid/')
from sem_covid.services.store_registry import store_registry
from sem_covid import config
from sem_covid.services.model_registry import EmbeddingModelRegistry, embedding_registry

from sem_covid.services.model_registry import EmbeddingModelRegistryABC
from typing import Any
import numpy as np
from scipy.spatial import distance
import pandas as pd

import plotly.express as px
from plotly.graph_objs import Layout
from plotly.graph_objs.layout import XAxis, YAxis
import datetime
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from numpy import mean

In [21]:
# Index name of the unified dataset; passed as `index_name` when loading it
# from the index store.
UNIFIED_DATASET = 'ds_unified_datasets'

In [7]:
# Index-store handle obtained from the project's store registry; it is used
# to load the datasets via get_dataframe().
es_store = store_registry.es_index_store()

In [22]:
# Load the two datasets from the index store:
#   pwdb_df    - the index named by config.PWDB_ELASTIC_SEARCH_INDEX_NAME
#   unified_df - the index named by UNIFIED_DATASET
# NOTE(review): later cells match rows of the two frames by index label, so
# both are assumed to share the same document identifiers — confirm.
pwdb_df = es_store.get_dataframe(index_name=config.PWDB_ELASTIC_SEARCH_INDEX_NAME)
unified_df = es_store.get_dataframe(index_name=UNIFIED_DATASET)

100% (1368 of 1368) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (4126 of 4126) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [207]:
# Embedding model taken from the project's embedding registry; its encode()
# method is used to vectorise the document texts.
# (Judging by the factory name this is a TF-IDF weighted average of Doc2Vec
# vectors — confirm against the registry implementation.)
emb_model = embedding_registry.doc2vec_tfidf_weight_avg()

In [63]:
# Keep only the rows whose Document_source is 'pwdb'; wrapping the selection
# in a fresh DataFrame lets later cells add columns to it.
is_pwdb = unified_df['Document_source'] == 'pwdb'
unified_df = pd.DataFrame(unified_df[is_pwdb])

In [64]:
# Build the text to embed as "<Title> <Content>" for every row.
# Missing values are replaced with '' first: ' '.join raises TypeError on a
# row containing None/NaN, which would abort the whole column assignment.
unified_df['text'] = (
    unified_df[['Title', 'Content']]
    .fillna('')
    .agg(' '.join, axis=1)
)

In [208]:
# Encode every document text with the embedding model and store the result
# in the 'emb' column.
# assumes encode() returns exactly one vector per input text, in input
# order — TODO confirm against the model implementation.
unified_df['emb'] = emb_model.encode(unified_df['text'].values)

In [10]:
# Distinct countries present in the PWDB dataset, in sorted order.
# Sorting makes the row/column order of the similarity matrix (and of the
# heatmap axes) reproducible; iterating a set of strings yields a different
# order from one interpreter session to the next.
# Null entries are dropped: they cannot be sorted together with strings and
# an equality filter on a null value would select no rows anyway.
countries = sorted(pwdb_df.country.dropna().unique())

In [57]:
# Allocate a square, zero-filled matrix with one row and one column per country.
n = len(countries)
matrix_shape = (n, n)
sim_matrix = np.zeros(matrix_shape)

In [68]:
def prepare_df(unified_df: pd.DataFrame,
               full_df: pd.DataFrame,
               column_filter_name: str,
               column_filter_value: Any
               ) -> pd.DataFrame:
    """Select the rows of ``unified_df`` that correspond to a filter on ``full_df``.

    The filter is evaluated on ``full_df`` and the matching index labels are
    then used to pick rows from ``unified_df``; the two frames are therefore
    related purely through their index labels.

    Args:
        unified_df: frame the rows are taken from.
        full_df: frame the filter is evaluated on.
        column_filter_name: column of ``full_df`` to compare.
        column_filter_value: value the column must equal.

    Returns:
        A new DataFrame with the rows of ``unified_df`` whose index label
        belongs to a matching row of ``full_df`` (empty when nothing matches).
    """
    matches_filter = full_df[column_filter_name] == column_filter_value
    wanted_labels = full_df.loc[matches_filter].index
    row_selected = unified_df.index.isin(wanted_labels)
    return pd.DataFrame(unified_df[row_selected])

In [174]:
def top_k_mean(data: np.ndarray, top_k: int) -> float:
    """Return the mean of the ``top_k`` largest values in ``data``.

    When ``data`` holds fewer than ``top_k`` values, the missing slots count
    as zeros, so the sum of the available values is still divided by
    ``top_k``.

    Args:
        data: one-dimensional array of values; it is not modified.
        top_k: number of largest values to average.

    Returns:
        The mean of the selected (and, if needed, zero-padded) values.
    """
    ranked = sorted(data.tolist(), reverse=True)
    leaders = ranked[:top_k]
    # A negative shortfall means there are enough values, hence no padding.
    shortfall = max(top_k - len(data), 0)
    return mean(leaders + [0] * shortfall)


In [209]:
# Fill the symmetric country-by-country similarity matrix.
#
# Each country's document embeddings are gathered once up front instead of
# being re-filtered for every (i, j) pair inside the nested loop.
embeddings_by_country = [
    prepare_df(unified_df=unified_df,
               full_df=pwdb_df,
               column_filter_name='country',
               column_filter_value=country
               )['emb'].values.tolist()
    for country in countries
]

for i, emb_x in enumerate(embeddings_by_country):
    sim_matrix[i][i] = 0
    for j in range(i + 1, len(embeddings_by_country)):
        emb_y = embeddings_by_country[j]
        if not emb_x or not emb_y:
            # No documents for one of the countries: cosine_similarity would
            # raise on empty input. Use 0, which is also what top_k_mean
            # yields when there are no values to average.
            sim_matrix[i][j] = sim_matrix[j][i] = 0
            continue
        # Rows are documents of country i, columns are documents of country j.
        tmp_sim_matrix = cosine_similarity(emb_x, emb_y)
        # Every entry is a distinct cross-country document pair, so all of
        # them take part in the ranking. Restricting to the upper triangle is
        # only meaningful for a self-similarity matrix and would drop pairs
        # arbitrarily here.
        sim_mean = top_k_mean(tmp_sim_matrix.ravel(), 50)
        sim_matrix[i][j] = sim_matrix[j][i] = sim_mean

In [79]:
from sklearn.preprocessing import minmax_scale

In [203]:
# Rescale all entries to [0, 1] using the global minimum and maximum.
# minmax_scale works column by column on a 2-D input, which would give every
# country its own scale and make the matrix asymmetric; flattening first
# applies one common scale, and the result is reshaped back afterwards.
sim_matrix = minmax_scale(
    sim_matrix.ravel(), feature_range=(0, 1)
).reshape(sim_matrix.shape)

In [179]:
# Element-wise 1 / (1 + x): larger input values produce smaller outputs.
# NOTE(review): if sim_matrix holds similarities at this point, this inverts
# their ordering while the heatmap is still labelled "Semantic similarity" —
# confirm whether this cell is meant to be part of the final pipeline.
sim_matrix = 1/(1+sim_matrix)

In [130]:
# Overwrite the diagonal (each country against itself) in place with the
# matrix's current minimum. np.min is evaluated over the whole matrix,
# diagonal included, before the fill happens.
np.fill_diagonal(sim_matrix,np.min(sim_matrix))

In [210]:
# Render the country-by-country matrix as a square heatmap, with the country
# names on both axes and the x axis labels placed at the top.
side_px = 700
fig = px.imshow(
    sim_matrix,
    labels={"color": "Semantic similarity"},
    x=countries,
    y=countries,
    width=side_px,
    height=side_px,
)
fig.update_xaxes(side="top")
fig.show()
