## Import libraries

In [2]:
import os
import sys

# Root of the sem-covid checkout inside the notebook container.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Make the project importable. Append only when the entry is missing:
# rebuilding sys.path through set() removes duplicates but also scrambles the
# module lookup order, because set iteration order is not the insertion order.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Run the rest of the notebook from the project root.
os.chdir(PROJECT_ROOT)

from sem_covid.services.store_registry import store_registry
from sem_covid import config
from sem_covid.services.model_registry import embedding_registry

from typing import Any
import numpy as np
import pandas as pd

import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity
from numpy import mean
from more_itertools import unique_everseen
from typing import List

## Define constants

In [3]:
# Index name passed to the index store when loading the unified dataset.
UNIFIED_DATASET = 'ds_unified_datasets'

In [4]:
# Index store from the project's store registry; used below to fetch
# indexes as DataFrames.
es_store = store_registry.es_index_store()

## Load data

In [5]:
# PWDB records; its 'country' and 'subcategory' columns are used for filtering below.
pwdb_df = es_store.get_dataframe(index_name=config.PWDB_ELASTIC_SEARCH_INDEX_NAME)
# Unified dataset; restricted to the rows with Document_source == 'pwdb' in the next section.
unified_df = es_store.get_dataframe(index_name=UNIFIED_DATASET)
# Sentence-embedding model from the project's model registry.
emb_model = embedding_registry.sent2vec_universal_sent_encoding()

100% (1381 of 1381) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (4126 of 4126) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
INFO:absl:Using /tmp/tfhub_modules to cache modules.


## Compute embeddings based on textual fields

In [6]:
# Keep only the documents that originate from PWDB.
is_pwdb_document = unified_df['Document_source'] == 'pwdb'
unified_df = pd.DataFrame(unified_df[is_pwdb_document])

# Title and content are joined with a single space to form the text to embed.
title_and_content = unified_df[['Title', 'Content']]
unified_df['text'] = title_and_content.agg(' '.join, axis=1)

# One embedding per document, stored next to its text.
unified_df['emb'] = emb_model.encode(unified_df['text'].values)

In [56]:
# Split the comma-separated 'subcategory' strings, trim whitespace around each
# item, and flatten everything into a single array of subcategory names.
subcategory_lists = pwdb_df['subcategory'].str.split(',')
trimmed_lists = subcategory_lists.apply(lambda parts: [part.strip() for part in parts])
tmp = np.hstack(trimmed_lists.values.tolist())

In [62]:
# Inspect which columns are available after adding 'text' and 'emb'.
unified_df.columns

Index(['Date', 'Title', 'Content', 'Document_source', 'text', 'emb'], dtype='object')

In [75]:
# Index labels of the PWDB rows recorded for Spain.
spain_rows = pwdb_df.loc[pwdb_df['country'] == 'Spain']
country_filter = spain_rows.index.tolist()

In [76]:
# Look up the Spanish PWDB rows in the unified dataset by index label.
# NOTE(review): .loc with a list of labels raises KeyError when a label is
# absent from unified_df; prepare_df (defined below) uses isin instead.
tmp_df = unified_df.loc[country_filter]

In [81]:
# Exploratory check: wrap the (key, group) pairs produced by the groupby in a
# DataFrame, then take column 1 (the group frames) and row 0 (the first group).
pd.DataFrame(tmp_df.groupby(['Date']))[1][0]

Unnamed: 0_level_0,Date,Title,Content,Document_source,text,emb
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tika/ff1e9f1110f125c9c37841268b37cc9ab2af55e3a091bdf0996e6395ab093943,2020-03-31,Extension of protection for unemployment benefits,This extraordinary legislation supplements exi...,pwdb,Extension of protection for unemployment benef...,"[0.04423779994249344, -0.06124040484428406, -0..."


In [59]:
# Relative frequency of the five most common subcategory names collected in `tmp`.
t = pd.Series(tmp).value_counts(normalize=True).nlargest(5)

## Define auxiliary functions
- `prepare_df` - selects the rows of the unified dataset that match a filter on the PWDB dataset
- `top_k_mean` - computes the mean of the k greatest values

In [11]:
def prepare_df(unified_df: pd.DataFrame,
               pwdb_df: pd.DataFrame,
               column_filter_name: str,
               column_filter_value: Any
               ):
    """Select the rows of ``unified_df`` that correspond to matching PWDB rows.

    A PWDB row matches when its ``column_filter_name`` column equals
    ``column_filter_value``. The index labels of the matching rows are then
    used to pick rows of ``unified_df``; labels that do not occur in
    ``unified_df`` are ignored.

    :param unified_df: frame to select rows from, indexed by the same labels as ``pwdb_df``
    :param pwdb_df: frame the filter is evaluated on
    :param column_filter_name: name of the ``pwdb_df`` column to compare
    :param column_filter_value: value the column must equal
    :return: a new DataFrame with the selected ``unified_df`` rows
    """
    matching_rows = pwdb_df.loc[pwdb_df[column_filter_name] == column_filter_value]
    wanted_labels = matching_rows.index.values
    is_wanted = unified_df.index.isin(wanted_labels)
    return pd.DataFrame(unified_df[is_wanted])


In [12]:
def top_k_mean(data: np.array, top_k: int):
    """Return the mean of the ``top_k`` greatest values in ``data``.

    When ``data`` holds fewer than ``top_k`` values, the missing ones are
    counted as zeros, so the result is always an average over ``top_k`` terms.
    ``data`` itself is left unmodified.

    :param data: one-dimensional array of values
    :param top_k: number of greatest values to average
    :return: the mean of the selected (zero-padded) values
    """
    ranked = sorted(data.tolist(), reverse=True)
    # A non-positive repeat count yields an empty list, i.e. no padding.
    zero_padding = [0] * (top_k - len(data))
    return mean(ranked[:top_k] + zero_padding)



## Define a function that generates the similarity matrix between all countries in the PWDB dataset

In [13]:
def generate_countries_similarity_matrix(unified_df: pd.DataFrame,
                                         pwdb_df: pd.DataFrame,
                                         countries: List[str],
                                         top_k: int = 5):
    """Build a symmetric country-by-country semantic similarity matrix.

    For every pair of distinct countries, the cosine similarity between each
    document embedding of the first country and each document embedding of the
    second one is computed, and the pair's score is the mean of the ``top_k``
    greatest similarities (see ``top_k_mean``).

    :param unified_df: documents with an ``emb`` column holding one embedding per row
    :param pwdb_df: PWDB records with a ``country`` column, sharing index labels with ``unified_df``
    :param countries: countries to compare; row/column ``i`` of the result belongs to ``countries[i]``
    :param top_k: number of greatest pairwise similarities averaged per country pair
    :return: ``len(countries)`` x ``len(countries)`` array; the diagonal and every pair
        involving a country without documents are left at 0
    """
    n = len(countries)
    sim_matrix = np.zeros((n, n))

    # Filter each country's documents once instead of once per pair.
    embeddings = [
        prepare_df(unified_df=unified_df,
                   pwdb_df=pwdb_df,
                   column_filter_name='country',
                   column_filter_value=country)['emb'].values.tolist()
        for country in countries
    ]

    for i in range(n):
        if not embeddings[i]:
            # cosine_similarity rejects empty input; keep the zeros for this country.
            continue
        for j in range(i + 1, n):
            if not embeddings[j]:
                continue
            pair_similarities = cosine_similarity(embeddings[i], embeddings[j])
            # Rows are documents of country i and columns documents of country j,
            # so every cell is a distinct cross-country pair. Selecting only the
            # upper triangle (as is done for a self-similarity matrix) would drop
            # valid pairs depending on document order, hence the full matrix.
            sim_mean = top_k_mean(pair_similarities.ravel(), top_k)
            sim_matrix[i][j] = sim_matrix[j][i] = sim_mean

    # Return only after every pair has been processed; returning from inside
    # the outer loop would leave all rows but the first at zero.
    return sim_matrix

In [14]:
# Distinct countries, in order of first appearance in the PWDB data.
countries = list(unique_everseen(pwdb_df.country.values))
# Row/column i of the matrix corresponds to countries[i].
countries_similarity_matrix = generate_countries_similarity_matrix(unified_df=unified_df, pwdb_df=pwdb_df,
                                                                   countries=countries)

## Plot a heatmap of the similarity matrix between countries from the PWDB dataset

In [18]:
# Heatmap with the countries on both axes; the colour encodes the similarity score.
fig = px.imshow(
    countries_similarity_matrix,
    x=countries,
    y=countries,
    labels={"color": "Semantic similarity"},
    width=700,
    height=700,
)
# Put the column labels above the matrix.
fig.update_xaxes(side="top")
fig.show()

## Define a function that computes the similarity matrix between two countries across time periods

In [26]:
def generate_2_country_similarity_matrix(data_x: pd.DataFrame, data_y: pd.DataFrame,
                                         start_date: str, end_date: str, periods: int,
                                         top_k: int = 30):
    """Compare two document sets period by period.

    The range from ``start_date`` to ``end_date`` is cut at ``periods`` evenly
    spaced boundaries, giving ``periods - 1`` consecutive half-open intervals
    ``[start, end)``. Cell ``[i][j]`` of the result is the mean of the ``top_k``
    greatest cosine similarities between the ``data_y`` documents dated in
    interval ``i`` and the ``data_x`` documents dated in interval ``j``.

    The input frames are not modified.

    :param data_x: documents with ``Date`` and ``emb`` columns (matrix columns)
    :param data_y: documents with ``Date`` and ``emb`` columns (matrix rows)
    :param start_date: first boundary, in any form accepted by ``pd.date_range``
    :param end_date: last boundary; documents dated on it are not included
    :param periods: number of boundaries to generate
    :param top_k: number of greatest pairwise similarities averaged per cell
    :return: tuple ``(sim_matrix, time_periods)`` where ``time_periods`` is the list of
        ``(start, end)`` date pairs; cells with no documents on either side stay 0
    """
    # Work on local date series so the caller's 'Date' columns are left untouched.
    dates_x = pd.to_datetime(data_x['Date']).dt.date
    dates_y = pd.to_datetime(data_y['Date']).dt.date

    boundaries = [stamp.date() for stamp in pd.date_range(start=start_date,
                                                          end=end_date,
                                                          periods=periods)]
    time_periods = list(zip(boundaries, boundaries[1:]))

    def _embeddings_per_period(data: pd.DataFrame, dates: pd.Series) -> list:
        # One list of embeddings per interval, in the order of time_periods.
        return [data.loc[(dates >= start) & (dates < end), 'emb'].values.tolist()
                for start, end in time_periods]

    # Slice each frame once rather than re-slicing data_x for every data_y interval.
    embeddings_x = _embeddings_per_period(data_x, dates_x)
    embeddings_y = _embeddings_per_period(data_y, dates_y)

    n = len(time_periods)
    sim_matrix = np.zeros((n, n))
    for i, period_y in enumerate(embeddings_y):
        if not period_y:
            continue
        for j, period_x in enumerate(embeddings_x):
            if not period_x:
                continue
            pair_similarities = cosine_similarity(period_x, period_y)
            # Every cell of this rectangular matrix is a distinct cross-set pair,
            # so all of them take part in the top-k selection; an upper-triangle
            # selection would discard pairs depending on document order.
            sim_matrix[i][j] = top_k_mean(pair_similarities.ravel(), top_k)
    return sim_matrix, time_periods


## Select only the first 4 countries for the plots

In [28]:
# Keep only the first four countries to limit the number of pairwise plots.
# NOTE(review): this rebinds `countries`; cells above that pair the full list
# with countries_similarity_matrix will see the shortened list if re-run
# after this cell.
countries = countries[:4]
countries

['Croatia', 'European Union', 'Latvia', 'Hungary']

## Plot the similarity between each pair of countries across time periods

In [19]:
# Styling shared by every figure.
color_scheme = [(0, "orange"),
                (0.5, "yellow"),
                (1, "lime")]
tick_font = dict(family='Old Standard TT, serif',
                 size=8,
                 color='black')

# One heatmap per unordered pair of countries.
for i, country_x in enumerate(countries):
    df_x = prepare_df(unified_df=unified_df,
                      pwdb_df=pwdb_df,
                      column_filter_name='country',
                      column_filter_value=country_x)
    for country_y in countries[i + 1:]:
        df_y = prepare_df(unified_df=unified_df,
                          pwdb_df=pwdb_df,
                          column_filter_name='country',
                          column_filter_value=country_y)
        tmp_sim_matrix, tmp_periods = generate_2_country_similarity_matrix(
            df_x, df_y,
            start_date="2020-01-01",
            end_date="2021-07-1",
            periods=6)
        # Axis tick text: "<period start> <period end>".
        tmp_periods = [str(period_start) + ' ' + str(period_end)
                       for period_start, period_end in tmp_periods]
        fig = px.imshow(tmp_sim_matrix,
                        x=tmp_periods,
                        y=tmp_periods,
                        labels=dict(x=country_x, y=country_y, color="Semantic similarity"),
                        zmin=0,
                        zmax=1,
                        color_continuous_scale=color_scheme,
                        width=800,
                        height=700)
        fig.update_xaxes(side="top",
                         showticklabels=True,
                         tickmode='linear',
                         tickfont=tick_font)
        # Anchor y to x with ratio 1 so the cells are drawn square.
        fig.update_yaxes(scaleanchor="x",
                         scaleratio=1,
                         showticklabels=True,
                         tickmode='linear',
                         tickfont=tick_font)
        fig.show()

