## Import libraries

In [2]:
import sys
import os

# Location of the sem-covid checkout: added to the import path and used as working directory.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

sys.path.append(PROJECT_ROOT)
# Drop duplicate entries while keeping first-seen order. Going through a set would
# shuffle sys.path, and the order of its entries decides which module wins on import.
sys.path[:] = list(dict.fromkeys(sys.path))

os.chdir(PROJECT_ROOT)
from sem_covid.services.store_registry import store_registry
from sem_covid import config
from sem_covid.services.model_registry import embedding_registry

from typing import Any
import numpy as np
import pandas as pd

import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity
from numpy import mean
from more_itertools import unique_everseen
from typing import List

In [3]:
# Load the index named by config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME from the ES index store into a DataFrame.
unified_df = store_registry.es_index_store().get_dataframe(index_name=config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME)

100% (6360 of 6360) |####################| Elapsed Time: 0:00:04 Time:  0:00:04


In [6]:
# Keep only the rows whose doc_source is 'ds_pwdb'.
pwdb_df = unified_df[unified_df.doc_source == 'ds_pwdb']

In [10]:
# Inspect the document_embeddings_use column (one embedding vector per document).
pwdb_df.document_embeddings_use

_id
0       [-0.0538220145, -0.0548402928, -0.0093910526, ...
1       [0.0431224369, -0.0080667678, 0.0412131101, -0...
2       [-0.011736772, 0.0533053987, 0.017932903, -0.0...
3       [-0.0202411115, -0.0540658794, -0.0253679529, ...
4       [0.0207592584, 0.0149038536, 0.0191072822, -0....
                              ...                        
1374    [-0.0541599803, -0.0546001196, -0.0630599409, ...
1375    [0.0146439541, -0.0553405508, 0.0198072512, -0...
1376    [-0.0057761753, -0.0255548619, 0.0176457968, -...
1377    [0.0245512538, 0.0053861849, -0.0316515602, -0...
1378    [-0.0571072996, -0.0576960146, -0.0334699862, ...
Name: document_embeddings, Length: 1379, dtype: object

In [41]:
# Names of the dataset columns that the analysis functions in this notebook read.
DATE_COLUMN_NAME = 'date'  # column parsed with pd.to_datetime for time-window filtering
EMBEDDING_COLUMN_NAME = 'document_embeddings_use'  # column holding one embedding vector per document
COUNTRY_COLUMN_NAME = 'country'  # column used to split documents by country

In [8]:
def top_k_mean(data: np.array, top_k: int):
    """
    Average the ``top_k`` largest values of ``data``.

    When ``data`` holds fewer than ``top_k`` values the missing slots are
    filled with zeros, so the sum is still divided by ``top_k``.

    :param data: one-dimensional array of numeric values; it is not modified
    :param top_k: how many of the largest values take part in the average
    :return: the mean of the (zero-padded) ``top_k`` largest values
    """
    ranked = sorted(data.tolist(), reverse=True)
    zero_padding = [0] * (top_k - len(data))
    return mean(ranked[:top_k] + zero_padding)

In [29]:
def generate_countries_similarity_matrix(pwdb_dataset: pd.DataFrame,
                                         countries: List[str],
                                         top_k: int = 10):
    """
    Build a symmetric country-by-country semantic similarity matrix.

    For every pair of distinct countries, the cosine similarity between each
    document of the first and each document of the second is computed, and the
    pair is scored with ``top_k_mean`` over all of those values.

    :param pwdb_dataset: frame containing the COUNTRY_COLUMN_NAME and
        EMBEDDING_COLUMN_NAME columns
    :param countries: country names; their order fixes the row/column order
    :param top_k: number of highest document-pair similarities averaged per
        country pair (defaults to 10)
    :return: ``len(countries) x len(countries)`` array, symmetric, with a zero
        diagonal; pairs where a country has no documents stay 0
    """
    n = len(countries)
    sim_matrix = np.zeros((n, n))
    # Pull each country's embeddings once instead of re-filtering the frame for every pair.
    embeddings = [
        pwdb_dataset.loc[pwdb_dataset[COUNTRY_COLUMN_NAME] == country,
                         EMBEDDING_COLUMN_NAME].values.tolist()
        for country in countries
    ]
    for i in range(n):
        if not embeddings[i]:
            # cosine_similarity rejects empty input; leave this row/column at 0.
            continue
        for j in range(i + 1, n):
            if not embeddings[j]:
                continue
            pair_sims = cosine_similarity(embeddings[i], embeddings[j])
            # Every entry is a distinct (document of i, document of j) pair, so all of
            # them compete for the top-k. An upper-triangle mask only makes sense for a
            # set compared with itself and would discard valid pairs here.
            sim_matrix[i][j] = sim_matrix[j][i] = top_k_mean(pair_sims.ravel(), top_k)
    return sim_matrix

In [32]:
# Distinct countries in first-seen order; this order also labels the matrix rows and columns.
countries = list(unique_everseen(pwdb_df.country.values))
countries_similarity_matrix = generate_countries_similarity_matrix(pwdb_dataset=pwdb_df,
                                                                   countries=countries)

In [33]:
# Heatmap of the country-by-country similarity matrix, with country names on both axes.
fig = px.imshow(countries_similarity_matrix,
                labels=dict(color="Semantic similarity"),
                x=countries,
                y=countries,
                width=700,
                height=700
                )
# Put the x-axis labels above the heatmap.
fig.update_xaxes(side="top")
fig.show()

In [42]:
def generate_2_country_similarity_matrix(dataset_x: pd.DataFrame, dataset_y: pd.DataFrame,
                                         start_date: str, end_date: str, periods: int,
                                         top_k: int = 30):
    """
    Compare two document sets time window by time window.

    ``periods`` evenly spaced boundaries between ``start_date`` and ``end_date``
    define ``periods - 1`` consecutive half-open windows ``[start, end)``.
    Cell ``[i][j]`` of the result is ``top_k_mean`` over the cosine similarities
    between the ``dataset_y`` documents dated in window ``i`` and the
    ``dataset_x`` documents dated in window ``j``.

    Neither input frame is modified.

    :param dataset_x: documents plotted along the columns; needs the
        DATE_COLUMN_NAME and EMBEDDING_COLUMN_NAME columns
    :param dataset_y: documents plotted along the rows; same columns required
    :param start_date: first window boundary, anything ``pd.date_range`` accepts
    :param end_date: last window boundary (documents dated exactly on it fall
        outside the last window, because windows are half-open)
    :param periods: number of boundaries, i.e. number of windows plus one
    :param top_k: number of highest document-pair similarities averaged per
        cell (defaults to 30)
    :return: tuple ``(sim_matrix, time_periods)`` where ``time_periods`` is the
        list of ``(start, end)`` date pairs; cells where either side has no
        documents stay 0
    """
    # Work on derived date series so the callers' frames keep their original date column.
    dates_x = pd.to_datetime(dataset_x[DATE_COLUMN_NAME]).dt.date
    dates_y = pd.to_datetime(dataset_y[DATE_COLUMN_NAME]).dt.date

    boundaries = [moment.date() for moment in pd.date_range(start=start_date,
                                                            end=end_date,
                                                            periods=periods).to_pydatetime()]
    time_periods = list(zip(boundaries, boundaries[1:]))

    def _embeddings_per_period(dataset: pd.DataFrame, dates: pd.Series) -> list:
        # One list of embedding vectors per time window, in window order.
        return [
            dataset.loc[(dates >= start) & (dates < end), EMBEDDING_COLUMN_NAME].values.tolist()
            for start, end in time_periods
        ]

    # Filter each dataset once per window instead of once per window pair.
    embeddings_x = _embeddings_per_period(dataset_x, dates_x)
    embeddings_y = _embeddings_per_period(dataset_y, dates_y)

    n = len(time_periods)
    sim_matrix = np.zeros((n, n))
    for i, docs_y in enumerate(embeddings_y):
        if not docs_y:
            continue
        for j, docs_x in enumerate(embeddings_x):
            if not docs_x:
                continue
            pair_sims = cosine_similarity(docs_x, docs_y)
            # The two sides are different document sets, so every entry is a distinct
            # pair and all of them are candidates for the top-k.
            sim_matrix[i][j] = top_k_mean(pair_sims.ravel(), top_k)
    return sim_matrix, time_periods

In [39]:
# Keep only the first four countries; this overwrites the full list held in `countries`.
countries = countries[:4]
countries


['Slovenia', 'Italy', 'Finland', 'Portugal']

In [44]:

# Tick label font shared by both axes of every figure drawn below.
tick_font = dict(family='Old Standard TT, serif', size=8, color='black')

# One period-by-period similarity heatmap per unordered pair of countries.
for i, country_x in enumerate(countries):
    df_x = pd.DataFrame(pwdb_df[pwdb_df[COUNTRY_COLUMN_NAME] == country_x])
    for j in range(i + 1, len(countries)):
        country_y = countries[j]
        df_y = pd.DataFrame(pwdb_df[pwdb_df[COUNTRY_COLUMN_NAME] == country_y])
        tmp_sim_matrix, tmp_periods = generate_2_country_similarity_matrix(
            df_x, df_y, start_date="2020-01-01", end_date="2021-07-1", periods=6)
        # Turn each (start, end) pair into a single "start end" axis label.
        tmp_periods = [f"{period_start} {period_end}" for period_start, period_end in tmp_periods]
        color_scheme = [(0, "orange"), (0.5, "yellow"), (1, "lime")]
        fig = px.imshow(tmp_sim_matrix,
                        labels=dict(x=country_x, y=country_y, color="Semantic similarity"),
                        x=tmp_periods,
                        y=tmp_periods,
                        width=800,
                        height=700,
                        zmin=0,
                        zmax=1,
                        color_continuous_scale=color_scheme)
        fig.update_xaxes(side="top",
                         showticklabels=True,
                         tickmode='linear',
                         tickfont=dict(tick_font))
        # Anchor y to x with ratio 1 so the heatmap cells are drawn square.
        fig.update_yaxes(scaleanchor="x",
                         scaleratio=1,
                         showticklabels=True,
                         tickmode='linear',
                         tickfont=dict(tick_font))
        fig.show()

In [47]:
# Distinct values present in the pwdb_category column.
set(pwdb_df.pwdb_category.values)

{'Employment protection and retention',
 'Ensuring business continuity and support for essential services',
 'Income protection beyond short-time work',
 'Measures to prevent social hardship',
 'Promoting the economic, labour market and social recovery',
 'Protection of workers, adaptation of workplace',
 'Reorientation of business activities',
 'Supporting businesses to get back to normal',
 'Supporting businesses to stay afloat'}

In [None]:
# Distinct countries in first-seen order.
# The nested pair loop that followed only built two filtered frames per pair and
# never used them, so it was removed.
countries = list(unique_everseen(pwdb_df.country.values))

In [65]:
# Rebuild the complete country list in first-seen order (an earlier cell cut `countries` down to four entries).
countries = list(unique_everseen(pwdb_df.country.values))

In [86]:
def foo_calc(pwdb_dataset: pd.DataFrame,
             countries: List[str],
             filter_field: str,
             top_k: int
             ):
    """
    Find, for each country, the most frequent values of one field.

    Cells of ``filter_field`` are exploded first, so every element of a
    list-valued cell is counted on its own. Frequencies are relative
    (they are normalized by ``value_counts``).

    :param pwdb_dataset: frame containing COUNTRY_COLUMN_NAME and ``filter_field``
    :param countries: countries to report on
    :param filter_field: name of the column whose values are counted
    :param top_k: number of most frequent values kept per country
    :return: dict mapping each country to a Series of its ``top_k`` largest
        relative frequencies, indexed by field value
    """

    def _top_shares(country: str) -> pd.Series:
        # Relative frequency of every field value among this country's rows.
        country_rows = pd.DataFrame(pwdb_dataset[pwdb_dataset[COUNTRY_COLUMN_NAME] == country])
        shares = country_rows[filter_field].explode().value_counts(normalize=True)
        return shares.nlargest(top_k)

    return {country: _top_shares(country) for country in countries}

In [257]:
# Top-5 pwdb_category shares for the first two countries; after .T there is one row per country.
plot_df = pd.DataFrame(foo_calc(pwdb_df, countries[:2], 'pwdb_category', 5)).T
# Columns without any NaN, i.e. categories that made the top 5 of every selected country.
common_columns = plot_df.dropna(axis=1).columns

# The remaining columns: categories missing from at least one country's top 5.
differ_columns = [column for column in plot_df.columns if column not in common_columns]

In [258]:
# Grouped bar chart: one group per row of plot_df, one bar per column in common_columns.
fig = px.bar(plot_df, x=plot_df.index, y=common_columns,
             barmode='group',
             height=800, width=800, )
# Horizontal legend placed below the plot area (negative y puts it under the axis).
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="top",
    y=-0.10,
    xanchor="left",
    x=0
))
fig.show()

In [254]:
# NOTE(review): despite its name, spain_df is a copy of every PWDB row; no country
# filter is applied in this cell. Confirm whether a Spain-only filter was intended.
spain_df = pd.DataFrame(pwdb_df.copy())
spain_df["date"] = pd.to_datetime(spain_df["date"]).dt.date

dates = spain_df.date.unique()

# For every distinct date: relative frequency of each pwdb_category among that day's rows.
result_df = {}
for date in dates:
    rows_on_date = spain_df[spain_df["date"] == date]
    result_df[date] = rows_on_date['pwdb_category'].explode().value_counts(normalize=True)

# One row per date, one column per category, in chronological order.
time_plot_df = pd.DataFrame(result_df).T.sort_index()

fig = px.line(time_plot_df,
              x=time_plot_df.index,
              y=time_plot_df.columns,
              range_x=['2020-03-01', '2020-5-20'],
              height=600,
              width=800)
# Horizontal legend placed below the plot area.
fig.update_layout(legend={"orientation": "h",
                          "yanchor": "top",
                          "y": -0.20,
                          "xanchor": "left",
                          "x": 0})
# Bridge dates on which a category is absent and shade the area under each line.
fig.update_traces(connectgaps=True, fill='tozeroy')
fig.show()

In [259]:
# Inspect the document_embeddings_use column once more before the cross-dataset comparison.
pwdb_df.document_embeddings_use

_id
0       [-0.0538220145, -0.0548402928, -0.0093910526, ...
1       [0.0431224369, -0.0080667678, 0.0412131101, -0...
2       [-0.011736772, 0.0533053987, 0.017932903, -0.0...
3       [-0.0202411115, -0.0540658794, -0.0253679529, ...
4       [0.0207592584, 0.0149038536, 0.0191072822, -0....
                              ...                        
1374    [-0.0541599803, -0.0546001196, -0.0630599409, ...
1375    [0.0146439541, -0.0553405508, 0.0198072512, -0...
1376    [-0.0057761753, -0.0255548619, 0.0176457968, -...
1377    [0.0245512538, 0.0053861849, -0.0316515602, -0...
1378    [-0.0571072996, -0.0576960146, -0.0334699862, ...
Name: document_embeddings, Length: 1379, dtype: object

In [261]:
# Keep only the rows whose doc_source is 'ds_eu_cellar'.
eu_cellar_df = unified_df[unified_df.doc_source == 'ds_eu_cellar']

In [331]:
def get_dataset_from_unified(unified_dataset: pd.DataFrame,
                             dataset_name: str):
    """
    Extract one source dataset from the unified frame.

    :param unified_dataset: frame with a ``doc_source`` column
    :param dataset_name: value of ``doc_source`` to select
    :return: an independent copy of the matching rows, original index kept
    """
    source_mask = unified_dataset.doc_source == dataset_name
    selected_rows = unified_dataset[source_mask].copy()
    return pd.DataFrame(selected_rows)

In [359]:
def plot_sim_histogram(unified_dataset: pd.DataFrame,
                       dataset_name_x: str,
                       dataset_name_y: str,
                       bins_step:float
                       ):
    """
    Plot the distribution of cosine similarities between two source datasets.

    Every document of ``dataset_name_x`` is compared with every document of
    ``dataset_name_y``; the similarities are binned over the full range
    ``[-1, 1]`` and each bar shows the share of document pairs in that bin.
    When both names are equal, each document's similarity to itself is part
    of the distribution.

    :param unified_dataset: frame holding all sources, with the ``doc_source``
        and EMBEDDING_COLUMN_NAME columns
    :param dataset_name_x: ``doc_source`` value of the first dataset
    :param dataset_name_y: ``doc_source`` value of the second dataset
    :param bins_step: requested bin width; the range ``[-1, 1]`` is split into
        ``round(2 / bins_step)`` equal bins, so the effective width only
        differs from the request when it does not divide the range evenly
    :raises ValueError: if ``bins_step`` is not positive
    """
    if bins_step <= 0:
        raise ValueError("bins_step must be a positive number")
    dataset_x = get_dataset_from_unified(unified_dataset, dataset_name_x)
    dataset_y = get_dataset_from_unified(unified_dataset, dataset_name_y)
    similarities = cosine_similarity(dataset_x[EMBEDDING_COLUMN_NAME].values.tolist(),
                                     dataset_y[EMBEDDING_COLUMN_NAME].values.tolist())
    # Floating-point rounding can push a cosine marginally outside [-1, 1];
    # clip so those values land in the edge bins instead of being dropped.
    similarities = np.clip(similarities, -1.0, 1.0)
    # Build the edges with linspace so that +1 is always the last edge. An
    # arange(-1, 1, step) grid stops one step short of 1 and silently loses
    # the most similar pairs.
    n_bins = max(1, int(round(2.0 / bins_step)))
    bin_edges = np.linspace(-1.0, 1.0, n_bins + 1)
    counts, bins = np.histogram(similarities, bins=bin_edges)
    # Normalize to the share of all document pairs.
    counts = counts / similarities.size
    # Bars are positioned at the bin centres.
    bins = 0.5 * (bins[:-1] + bins[1:])
    fig = px.bar(x=bins, y=counts,
                 labels={'x': f'similarity distribution between {dataset_name_x} and {dataset_name_y} ',
                         'y': 'count'})
    fig.show()

In [360]:
# Similarity distribution of the 'ds_pwdb' documents compared with themselves, requested bin width 0.05.
plot_sim_histogram(unified_df,'ds_pwdb', 'ds_pwdb',bins_step=0.05)

In [383]:
# One entry per (document, funding source); explode keeps the document's index label on each entry.
tmp_df = pwdb_df.pwdb_funding.explode()
# Look up the documents that list 'European Funds' among their funding sources.
pwdb_df.loc[tmp_df[tmp_df == 'European Funds'].index]

Unnamed: 0_level_0,title,content,date,doc_source,country,pwdb_category,pwdb_target_group_l1,pwdb_funding,pwdb_type_of_measure,pwdb_actors,...,eu_cellar_subject_matter_labels,eu_cellar_resource_type_labels,eu_cellar_directory_code_labels,eu_cellar_author_labels,pwdb_target_group_l2,ireland_keyword,ireland_department_data,ireland_campaign,ireland_page_type,eu_timeline_topic
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Temporary layoff scheme and reimbursement of r...,Temporary layoff scheme and reimbursement of r...,2020-03-13,ds_pwdb,Slovenia,Employment protection and retention,[],"[European Funds, National funds]",Legislations or other statutory regulations,"[National government, Social insurance]",...,[],[],[],[],[],[],[],[],[],[]
8,COVID-19: Assistance for frontline workers in ...,COVID-19: Assistance for frontline workers in ...,2020-08-01,ds_pwdb,Bulgaria,Ensuring business continuity and support for e...,"[businesses, workers]","[European Funds, National funds]",Tripartite agreements,"[National government, Trade unions, Other soci...",...,[],[],[],[],"[Particular professions, Sector specific set o...",[],[],[],[],[]
10,Financial support line for micro and small tou...,Financial support line for micro and small tou...,2021-01-12,ds_pwdb,Portugal,Supporting businesses to stay afloat,[businesses],[European Funds],Legislations or other statutory regulations,[National government],...,[],[],[],[],"[Sector specific set of companies, SMEs, One p...",[],[],[],[],[]
21,State sponsorship scheme for businesses and se...,State sponsorship scheme for businesses and se...,2021-01-01,ds_pwdb,Cyprus,Supporting businesses to stay afloat,[],"[European Funds, National funds]",Legislations or other statutory regulations,"[National government, Trade unions, Employers'...",...,[],[],[],[],[],[],[],[],[],[]
22,Plan for Economic Recovery and Social Protection,Plan for Economic Recovery and Social Protecti...,2020-07-21,ds_pwdb,Spain,"Promoting the economic, labour market and soci...",[],"[European Funds, National funds, Regional funds]",Other initiatives or policies,"[Social partners jointly, Local / regional gov...",...,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1341,Measure: 'Employment for you': subsidised empl...,Measure: 'Employment for you': subsidised empl...,2020-06-26,ds_pwdb,Bulgaria,"Promoting the economic, labour market and soci...",[workers],"[European Funds, National funds]",Tripartite agreements,"[National government, Company / Companies]",...,[],[],[],[],[Unemployed],[],[],[],[],[]
1343,Set of measures to sustain the sectors most hi...,Set of measures to sustain the sectors most hi...,2020-05-01,ds_pwdb,Slovenia,"Promoting the economic, labour market and soci...",[businesses],"[European Funds, National funds]",Legislations or other statutory regulations,"[National government, Social partners jointly]",...,[],[],[],[],[Sector specific set of companies],[],[],[],[],[]
1345,Monthly basic income for self-employed,Monthly basic income for self-employed The leg...,2020-03-13,ds_pwdb,Slovenia,Income protection beyond short-time work,[workers],"[European Funds, National funds]",Legislations or other statutory regulations,[National government],...,[],[],[],[],[Self-employed],[],[],[],[],[]
1360,Technology-COVID call within OPPIK,Technology-COVID call within OPPIK The Technol...,2020-04-15,ds_pwdb,Czechia,Reorientation of business activities,[businesses],"[Companies, European Funds, National funds]",Legislations or other statutory regulations,"[National government, Company / Companies]",...,[],[],[],[],[SMEs],[],[],[],[],[]


In [385]:
# Distinct level-1 target groups in first-seen order; missing values are dropped after exploding.
list(unique_everseen(pwdb_df.pwdb_target_group_l1.explode().dropna().values))

['businesses', 'workers', 'citizens']