## Import libraries

In [2]:
import sys

# Root of the sem-covid checkout; it is both added to the import path and
# used as the working directory for the rest of the notebook.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

sys.path.append(PROJECT_ROOT)
# Drop duplicate entries while KEEPING the original search order. A plain
# set() round-trip reorders sys.path arbitrarily, which can change which
# module wins when two entries provide the same name.
sys.path[:] = list(dict.fromkeys(sys.path))
import os

os.chdir(PROJECT_ROOT)
from sem_covid.services.store_registry import store_registry
from sem_covid import config
from sem_covid.services.model_registry import embedding_registry

from typing import Any
import numpy as np
import pandas as pd

import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity
from numpy import mean
from more_itertools import unique_everseen
from typing import List

In [3]:
# Read the index named by config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME
# from the Elasticsearch index store into a single DataFrame.
unified_df = (
    store_registry
    .es_index_store()
    .get_dataframe(index_name=config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME)
)

100% (6360 of 6360) |####################| Elapsed Time: 0:00:04 Time:  0:00:04


In [6]:
# Keep only the rows whose doc_source is 'ds_pwdb'.
pwdb_df = unified_df.loc[unified_df["doc_source"] == "ds_pwdb"]

In [10]:
# Display the embedding column of the PWDB subset.
pwdb_df["document_embeddings"]

_id
0       [-0.0538220145, -0.0548402928, -0.0093910526, ...
1       [0.0431224369, -0.0080667678, 0.0412131101, -0...
2       [-0.011736772, 0.0533053987, 0.017932903, -0.0...
3       [-0.0202411115, -0.0540658794, -0.0253679529, ...
4       [0.0207592584, 0.0149038536, 0.0191072822, -0....
                              ...                        
1374    [-0.0541599803, -0.0546001196, -0.0630599409, ...
1375    [0.0146439541, -0.0553405508, 0.0198072512, -0...
1376    [-0.0057761753, -0.0255548619, 0.0176457968, -...
1377    [0.0245512538, 0.0053861849, -0.0316515602, -0...
1378    [-0.0571072996, -0.0576960146, -0.0334699862, ...
Name: document_embeddings, Length: 1379, dtype: object

In [41]:
# Names of the dataset columns that the similarity helpers below read.
COUNTRY_COLUMN_NAME = "country"                 # country a document belongs to
DATE_COLUMN_NAME = "date"                       # document date
EMBEDDING_COLUMN_NAME = "document_embeddings"   # per-document embedding vector

In [8]:
def top_k_mean(data: np.ndarray, top_k: int) -> float:
    """Return the mean of the ``top_k`` largest values in ``data``.

    When ``data`` holds fewer than ``top_k`` values the missing slots are
    counted as zeros, i.e. the sum of the available values is still divided
    by ``top_k``.

    :param data: numeric values; any array-like (ndarray, list, tuple) is
        accepted and flattened. The input is never modified.
    :param top_k: how many of the largest values to average; must be >= 1.
    :return: the zero-padded mean of the ``top_k`` largest values.
    :raises ValueError: if ``top_k`` is smaller than 1 (the result would
        otherwise be NaN or an unrelated partial mean).
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")
    # np.sort returns a new array, so the caller's data stays untouched.
    values = np.asarray(data, dtype=float).ravel()
    largest_first = np.sort(values)[::-1][:top_k]
    # Zero-pad up to top_k so short inputs are penalised instead of being
    # averaged over fewer elements.
    padded = np.zeros(top_k)
    padded[:largest_first.size] = largest_first
    return padded.mean()

In [29]:
def generate_countries_similarity_matrix(pwdb_dataset: pd.DataFrame,
                                         countries: List[str],
                                         top_k: int = 10) -> np.ndarray:
    """Build a symmetric country-by-country semantic similarity matrix.

    For every unordered pair of countries, the cosine similarity between each
    document embedding of the first country and each document embedding of
    the second one is computed, and the pair's score is the ``top_k_mean`` of
    all those document-pair similarities.

    :param pwdb_dataset: frame holding the COUNTRY_COLUMN_NAME and
        EMBEDDING_COLUMN_NAME columns.
        NOTE(review): embeddings are assumed to be equal-length numeric
        vectors - confirm against the indexed data.
    :param countries: countries to compare; row/column ``i`` of the result
        corresponds to ``countries[i]``.
    :param top_k: number of highest document-pair similarities averaged per
        country pair (default 10, the value previously hard-coded).
    :return: ``len(countries) x len(countries)`` array. The diagonal, and any
        pair in which a country has no documents, is left at 0.
    """
    # Filter the dataset once per country rather than once per pair.
    embeddings_by_country = [
        pwdb_dataset.loc[pwdb_dataset[COUNTRY_COLUMN_NAME] == country,
                         EMBEDDING_COLUMN_NAME].values.tolist()
        for country in countries
    ]
    n = len(countries)
    sim_matrix = np.zeros((n, n))
    for i in range(n):
        if not embeddings_by_country[i]:
            continue  # cosine_similarity rejects empty input
        for j in range(i + 1, n):
            if not embeddings_by_country[j]:
                continue
            pairwise = cosine_similarity(embeddings_by_country[i],
                                         embeddings_by_country[j])
            # `pairwise` is a cross-similarity matrix (rows: country i,
            # columns: country j), so it is NOT symmetric and every cell is a
            # distinct document pair. All cells must be considered; taking
            # only the upper triangle would arbitrarily drop pairs.
            sim_matrix[i, j] = sim_matrix[j, i] = top_k_mean(pairwise.ravel(), top_k)
    return sim_matrix

In [32]:
# Distinct countries in order of first appearance, then their pairwise
# similarity matrix.
countries = list(unique_everseen(pwdb_df["country"].values))
countries_similarity_matrix = generate_countries_similarity_matrix(pwdb_df, countries)

In [33]:
# Heatmap of the country-by-country similarity matrix, with the country names
# on both axes and the x axis drawn along the top edge.
fig = px.imshow(
    countries_similarity_matrix,
    x=countries,
    y=countries,
    labels={"color": "Semantic similarity"},
    width=700,
    height=700,
)
fig.update_xaxes(side="top")
fig.show()

In [42]:
def _embeddings_per_period(dataset: pd.DataFrame, time_periods: list) -> list:
    """Return, for each (start, end) period, the embeddings of the rows dated in [start, end)."""
    # Parse into a detached Series so the caller's frame is never modified.
    dates = pd.to_datetime(dataset[DATE_COLUMN_NAME]).dt.date
    return [
        dataset.loc[(dates >= start) & (dates < end), EMBEDDING_COLUMN_NAME].values.tolist()
        for start, end in time_periods
    ]


def generate_2_country_similarity_matrix(dataset_x: pd.DataFrame, dataset_y: pd.DataFrame,
                                         start_date: str, end_date: str, periods: int,
                                         top_k: int = 30):
    """Compare two datasets period by period using document-embedding similarity.

    The span ``[start_date, end_date]`` is cut by ``periods`` evenly spaced
    boundaries into ``periods - 1`` consecutive half-open intervals
    ``[start, end)``. For each pair of intervals the cosine similarity between
    every document of ``dataset_x`` in one interval and every document of
    ``dataset_y`` in the other is computed and summarised with ``top_k_mean``.

    Neither input frame is modified.

    :param dataset_x: frame whose periods index the COLUMNS of the result.
    :param dataset_y: frame whose periods index the ROWS of the result.
    :param start_date: first boundary, anything ``pd.date_range`` accepts.
    :param end_date: last boundary; because intervals are half-open, rows
        dated exactly ``end_date`` are not included in the last interval.
    :param periods: number of boundaries (not intervals) to generate.
    :param top_k: number of highest document-pair similarities averaged per
        cell (default 30, the value previously hard-coded).
    :return: ``(sim_matrix, time_periods)`` where ``sim_matrix[i][j]`` compares
        ``dataset_y`` in ``time_periods[i]`` with ``dataset_x`` in
        ``time_periods[j]`` (0 when either side has no documents) and
        ``time_periods`` is a list of ``(start, end)`` ``datetime.date`` pairs.
    """
    boundaries = [stamp.date() for stamp in pd.date_range(start=start_date,
                                                          end=end_date,
                                                          periods=periods)]
    time_periods = list(zip(boundaries, boundaries[1:]))

    # Slice each dataset once per period instead of once per matrix cell.
    embeddings_x = _embeddings_per_period(dataset_x, time_periods)
    embeddings_y = _embeddings_per_period(dataset_y, time_periods)

    n = len(time_periods)
    sim_matrix = np.zeros((n, n))
    for i, period_y in enumerate(embeddings_y):
        if not period_y:
            continue  # no dataset_y documents in this period: row stays 0
        for j, period_x in enumerate(embeddings_x):
            if not period_x:
                continue
            pairwise = cosine_similarity(period_x, period_y)
            # `pairwise` is a cross-similarity matrix between two different
            # document sets, so every cell is a distinct pair; all of them are
            # used rather than only the upper triangle.
            sim_matrix[i, j] = top_k_mean(pairwise.ravel(), top_k)
    return sim_matrix, time_periods

In [39]:
# Narrow the country list to its first four entries; the bare name on the
# last line displays the result.
countries = countries[0:4]
countries


['Slovenia', 'Italy', 'Finland', 'Portugal']

In [44]:

# One heatmap per unordered pair of the selected countries: the period-by-period
# similarity matrix between the two countries over 2020-01-01 .. 2021-07-1.

# Colour scale shared by every figure.
color_scheme = [(0, "orange"),
                (0.5, "yellow"),
                (1, "lime")]

for x_pos, country_x in enumerate(countries):
    df_x = pd.DataFrame(pwdb_df[pwdb_df[COUNTRY_COLUMN_NAME] == country_x])
    for country_y in countries[x_pos + 1:]:
        df_y = pd.DataFrame(pwdb_df[pwdb_df[COUNTRY_COLUMN_NAME] == country_y])
        tmp_sim_matrix, tmp_periods = generate_2_country_similarity_matrix(
            df_x, df_y, start_date="2020-01-01", end_date="2021-07-1", periods=6)
        # Turn each (start, end) pair into a single "start end" tick label.
        tmp_periods = [f"{period_start} {period_end}" for period_start, period_end in tmp_periods]
        fig = px.imshow(
            tmp_sim_matrix,
            labels={"x": country_x, "y": country_y, "color": "Semantic similarity"},
            x=tmp_periods,
            y=tmp_periods,
            width=800,
            height=700,
            zmin=0,
            zmax=1,
            color_continuous_scale=color_scheme,
        )
        fig.update_xaxes(
            side="top",
            showticklabels=True,
            tickmode='linear',
            tickfont={"family": 'Old Standard TT, serif', "size": 8, "color": 'black'},
        )
        # Anchor y to x with ratio 1 so the cells are drawn square.
        fig.update_yaxes(
            scaleanchor="x",
            scaleratio=1,
            showticklabels=True,
            tickmode='linear',
            tickfont={"family": 'Old Standard TT, serif', "size": 8, "color": 'black'},
        )
        fig.show()

In [47]:
# Distinct values found in the pwdb_category column.
set(pwdb_df["pwdb_category"].values)

{'Employment protection and retention',
 'Ensuring business continuity and support for essential services',
 'Income protection beyond short-time work',
 'Measures to prevent social hardship',
 'Promoting the economic, labour market and social recovery',
 'Protection of workers, adaptation of workplace',
 'Reorientation of business activities',
 'Supporting businesses to get back to normal',
 'Supporting businesses to stay afloat'}

In [None]:
# NOTE(review): this cell only builds the per-country frames for every pair of
# countries and never uses them - it looks like an unfinished draft.
countries = list(unique_everseen(pwdb_df["country"].values))
for x_pos, country_x in enumerate(countries):
    df_x = pd.DataFrame(pwdb_df[pwdb_df[COUNTRY_COLUMN_NAME] == country_x])
    for country_y in countries[x_pos + 1:]:
        df_y = pd.DataFrame(pwdb_df[pwdb_df[COUNTRY_COLUMN_NAME] == country_y])

In [65]:
# Rebuild the full list of distinct countries, in order of first appearance.
countries = list(unique_everseen(pwdb_df["country"].values))

In [86]:
def foo_calc(pwdb_dataset: pd.DataFrame,
             countries: List[str],
             filter_field: str,
             top_k: int
             ):
    """Return, per country, the ``top_k`` most frequent values of ``filter_field``.

    For each country the rows matching it in COUNTRY_COLUMN_NAME are selected,
    list-like cells of ``filter_field`` are exploded into one value per row,
    and the relative frequency of every value is computed.

    :param pwdb_dataset: frame holding COUNTRY_COLUMN_NAME and ``filter_field``.
    :param countries: countries to summarise.
    :param filter_field: name of the column whose values are counted.
    :param top_k: number of most frequent values kept per country.
    :return: dict mapping each country to a Series of its ``top_k`` largest
        normalised value counts.
    """
    def _top_shares(country: str) -> pd.Series:
        country_rows = pd.DataFrame(pwdb_dataset[pwdb_dataset[COUNTRY_COLUMN_NAME] == country])
        shares = country_rows[filter_field].explode().value_counts(normalize=True)
        return shares.nlargest(top_k)

    return {country: _top_shares(country) for country in countries}

In [133]:
# One row per country (first ten), one column per category that made it into
# some country's top five; cells are NaN where a category is not in that
# country's top five.
plot_df = pd.DataFrame(foo_calc(pwdb_df, countries[:10], 'pwdb_category', 5)).transpose()

# Columns without any NaN are present for every country ...
common_columns = plot_df.dropna(axis="columns").columns
# ... the remaining ones are missing for at least one country.
differ_columns = plot_df.columns[~plot_df.columns.isin(common_columns)].tolist()

In [134]:
# Grouped bars per country for the columns listed in differ_columns, with a
# horizontal legend placed underneath the plot area.
fig = px.bar(
    plot_df,
    x=plot_df.index,
    y=differ_columns,
    barmode='group',
    height=800,
    width=800,
)
fig.update_layout(
    legend={
        "orientation": "h",
        "yanchor": "top",
        "y": -0.10,
        "xanchor": "left",
        "x": 0,
    }
)
fig.show()

In [249]:
# Share of each pwdb_category per calendar day, drawn as stacked bars.
# NOTE(review): despite its name, spain_df is a copy of the whole pwdb_df -
# no country filter is applied here.
spain_df = pwdb_df.copy()
spain_df["date"] = pd.to_datetime(spain_df["date"]).dt.date

dates = spain_df["date"].unique()

# Normalised category counts for every distinct date.
result_df = {
    day: spain_df.loc[spain_df["date"] == day, 'pwdb_category'].explode().value_counts(normalize=True)
    for day in dates
}
# Rows are dates (chronological), columns are categories.
time_plot_df = pd.DataFrame(result_df).transpose().sort_index()

fig = px.bar(
    time_plot_df,
    x=time_plot_df.index,
    y=time_plot_df.columns,
    range_x=['2020-03-01', '2020-5-20'],
    height=600,
    width=800,
)
fig.update_layout(
    legend={
        "orientation": "h",
        "yanchor": "top",
        "y": -0.20,
        "xanchor": "left",
        "x": 0,
    }
)
# fig.update_traces(connectgaps=True,
#                    fill='tozeroy')
fig.show()

In [214]:
# Display the pwdb_type_of_measure column of the PWDB subset.
pwdb_df["pwdb_type_of_measure"]

_id
0       Legislations or other statutory regulations
1       Legislations or other statutory regulations
2       Legislations or other statutory regulations
3       Legislations or other statutory regulations
4       Legislations or other statutory regulations
                           ...                     
1374    Legislations or other statutory regulations
1375                Bipartite collective agreements
1376    Legislations or other statutory regulations
1377    Legislations or other statutory regulations
1378    Legislations or other statutory regulations
Name: pwdb_type_of_measure, Length: 1379, dtype: object