# This notebook is a POC:
### The scope is to validate the hypothesis that a pattern exists in the similarity of documents issued by EU member states.
### To achieve this goal, the following steps are proposed:
- to calculate document embeddings for each document.
- to calculate the cosine similarity of the document embedding between 2 countries.
- to arrange the similarity obtained in a plot with two time axes.
- where the X axis represents the time for country X.
- where the Y axis represents the time for country Y.
- each point in the graph represents the similarity of two documents issued by countries X and Y.
- the similarity of the documents will be indicated by the color of the dot.

## Import libraries

In [6]:
import os
import sys

# Root of the project checkout: added to the import path so the `sem_covid`
# package can be imported, and used as the working directory.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

sys.path.append(PROJECT_ROOT)
# Drop duplicate entries while keeping the search order. Round-tripping through
# a set would reorder sys.path arbitrarily and change which modules win when
# the same name exists in several locations.
sys.path = list(dict.fromkeys(sys.path))

os.chdir(PROJECT_ROOT)
from sem_covid.services.store_registry import store_registry
from sem_covid import config
from sem_covid.services.model_registry import EmbeddingModelRegistry

from sem_covid.services.model_registry import EmbeddingModelRegistryABC
from typing import Any
import numpy as np
from scipy.spatial import distance
import pandas as pd

import plotly.express as px
from plotly.graph_objs import Layout
from plotly.graph_objs.layout import XAxis, YAxis
import datetime



## Define constants

In [3]:
# Index names passed as `index_name` to `es_store.get_dataframe` when the
# datasets are loaded.
DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME = 'fs_doc_emb_tfidf'
UNIFIED_DATASET = 'ds_unified_datasets'

## Load datasets.

In [4]:
# Store handle through which every dataframe below is fetched.
es_store = store_registry.es_index_store()
# PWDB frame: filtered on its 'country' column to pick the documents of a country.
full_pwdb_df = es_store.get_dataframe(index_name=config.PWDB_ELASTIC_SEARCH_INDEX_NAME)
# Unified dataset: provides the 'Title', 'Content' and 'Date' columns used in the analysis.
df = es_store.get_dataframe(index_name=UNIFIED_DATASET)
# NOTE(review): `doc_emb` is loaded here but not referenced again in the visible
# cells; the embeddings used for the plots are computed with `emb_model` instead.
doc_emb = es_store.get_dataframe(index_name=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME)
# Model whose `encode` method turns the document texts into embeddings.
emb_model = EmbeddingModelRegistry().sent2vec_universal_sent_encoding()

## Declaration of auxiliary functions.

In [7]:
def prepare_df(unified_df: pd.DataFrame,
               emb_model: Any,
               full_df: pd.DataFrame,
               column_filter_name: str,
               column_filter_value: Any
               ) -> pd.DataFrame:
    """Select the documents matching a filter and attach a text embedding to each.

    Rows of ``full_df`` whose ``column_filter_name`` value equals
    ``column_filter_value`` are looked up by index label in ``unified_df``.

    Args:
        unified_df: Frame with 'Title' and 'Content' columns; it is not modified.
        emb_model: Any object exposing ``encode(texts)`` that returns one
            embedding per input text. Annotated as ``Any`` because only
            ``encode`` is required.
            NOTE(review): the notebook passes the result of a registry method
            here, not the registry itself - confirm the intended type.
        full_df: Frame that holds the column to filter on and shares index
            labels with ``unified_df``.
        column_filter_name: Name of the column in ``full_df`` to filter on.
        column_filter_value: Value that column must equal.

    Returns:
        A copy of the selected ``unified_df`` rows with two extra columns:
        ``text`` ('Title' and 'Content' joined by one space, missing values
        treated as empty strings) and ``emb`` (one embedding per row).
    """
    search_index = full_df[full_df[column_filter_name] == column_filter_value].index.values
    # Explicit copy so that adding columns never writes through to unified_df.
    result_df = unified_df[unified_df.index.isin(search_index)].copy()

    # Missing titles/contents become empty strings; joining None/NaN would raise.
    text_parts = result_df[['Title', 'Content']].fillna('').astype(str)
    result_df['text'] = text_parts['Title'] + ' ' + text_parts['Content']

    if result_df.empty:
        # Nothing to embed: skip the encoder but keep the output schema stable.
        result_df['emb'] = pd.Series([], index=result_df.index, dtype=object)
        return result_df

    embeddings = emb_model.encode(result_df['text'].values)
    # list() splits a 2-D array into one vector per row, so each cell of the
    # object column holds exactly one embedding.
    result_df['emb'] = pd.Series(list(embeddings), index=result_df.index, dtype=object)
    return result_df


def cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``.

    Args:
        u: First vector (any array-like accepted by NumPy).
        v: Second vector, same length as ``u``.

    Returns:
        The dot product divided by the product of the two norms, or ``0.0``
        when either vector has zero norm (the angle is undefined there, and
        the plain division would yield NaN with a runtime warning).
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product


def euclidean_similarity(a, b):
    """Map the Euclidean distance between ``a`` and ``b`` to a similarity score.

    Identical vectors score 1; the score decreases towards 0 as the distance grows.
    """
    gap = distance.euclidean(a, b)
    return 1 / (gap + 1)



In [35]:
def plot_similarity_over_time(data: pd.DataFrame,
                              start_date: datetime.date,
                              end_date: datetime.date):
    """Show a scatter plot of pairwise similarities on two time axes.

    The first column of ``data`` is drawn on the x axis, the second on the
    y axis, and the third sets the colour of each point. Both axes are
    limited to the window from ``start_date`` to ``end_date`` and the figure
    is rendered at 600x600.
    """
    column_names = data.columns.values
    x_name, y_name, colour_name = column_names[0], column_names[1], column_names[2]
    window = (start_date, end_date)

    # Both axes use exactly the same range and styling, so describe it once.
    axis_style = dict(
        range=list(window),
        showgrid=True,
        zeroline=True,
        showline=True,
        gridcolor='#bdbdbd',
        gridwidth=2,
        zerolinecolor='#969696',
        zerolinewidth=4,
        linecolor='#636363',
        linewidth=6,
    )

    fig = px.scatter(data,
                     x=x_name,
                     y=y_name,
                     color=colour_name,
                     range_x=list(window),
                     range_y=list(window))
    fig.update_layout(Layout(xaxis=XAxis(**axis_style),
                             yaxis=YAxis(**axis_style),
                             height=600,
                             width=600))
    fig.show()

## For the analysis, the EU countries well represented in the PWDB dataset were selected.

In [37]:
# Countries compared pairwise; each name is matched against the 'country'
# column of the PWDB frame when the documents are selected.
COUNTRIES = ['Spain', 'Italy', 'Greece', 'Germany', 'Austria', 'Portugal']

## Display the similarity plot based on documents issued in certain time periods.


In [44]:
# Prepare (filter + embed) each country's documents exactly once. The encoder
# call inside prepare_df is the expensive step, and every country takes part
# in several pairs, so recomputing it per pair repeats the same work.
country_frames = {
    country: prepare_df(unified_df=df,
                        emb_model=emb_model,
                        full_df=full_pwdb_df,
                        column_filter_name='country',
                        column_filter_value=country)
    for country in COUNTRIES
}

# All plots share the same time window, so it is defined once outside the loops.
start_date = datetime.date(2020, 1, 1)
end_date = datetime.date(2021, 1, 1)

# Visit every unordered pair of distinct countries once: each country is
# paired only with the countries that come after it in the list.
for position, country_x in enumerate(COUNTRIES):
    df_x = country_frames[country_x]
    for country_y in COUNTRIES[position + 1:]:
        df_y = country_frames[country_y]
        # One row per (document of X, document of Y) combination:
        # the two issue dates and the cosine similarity of the embeddings.
        test_df = pd.DataFrame(
            [(date_x, date_y, cosine_similarity(emb_x, emb_y))
             for date_x, emb_x in zip(df_x['Date'], df_x['emb'])
             for date_y, emb_y in zip(df_y['Date'], df_y['emb'])],
            columns=[f'date_{country_x}', f'date_{country_y}', 'sim'])
        plot_similarity_over_time(test_df, start_date, end_date)

# Conclusions
### This experiment surfaced a set of problems that prevent a pattern of similarity from being identified.
### The problems identified are:
- several documents were issued on the same date, which does not provide us with a single point of intersection.
- countries did not issue documents evenly over time; instead, several documents were issued on the same dates.

### As solutions to the identified problems:
- average pooling can be applied over periods of time, in order to obtain the average similarity per period.
- the average similarity between countries can be computed without taking time into account.