## Import libraries

In [135]:
import os
import sys

import pandas as pd

# Root of the sem-covid checkout inside the Jupyter container.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Make the project importable. Appending only when the entry is missing keeps
# re-runs from piling up duplicates without rebuilding sys.path through a set,
# which would scramble the module search order.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# The working directory is switched to the project root before the project
# imports, as in the original cell order.
os.chdir(PROJECT_ROOT)

from sem_covid.services.store_registry import store_registry
from sem_covid import config
from sem_covid.services.model_registry import EmbeddingModelRegistry

## Define constants

In [5]:
# Index name of the document-embedding feature store; passed as index_name to
# es_store.get_dataframe when `doc_emb` is loaded.
DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME = 'fs_doc_emb_tfidf'
# Index name of the unified dataset; passed as index_name to
# es_store.get_dataframe when `df` is loaded.
UNIFIED_DATASET = 'ds_unified_datasets'

In [2]:
# Index-store handle from the project's store registry; every
# get_dataframe(index_name=...) call in this notebook goes through it.
es_store = store_registry.es_index_store()

In [154]:
# Complete PWDB index as a DataFrame; other cells use its `country` column and
# its index to pick out the documents of a single country.
full_pwdb_df = es_store.get_dataframe(index_name=config.PWDB_ELASTIC_SEARCH_INDEX_NAME)

100% (1368 of 1368) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [141]:
# Free-text query; it is encoded with `emb_model` into `emb_query` and compared
# against the per-document embeddings.
query = 'the impact of covid-19–related restrictions on social and daily activities of parents, people with disabilities, and older adults.'

In [173]:
from sem_covid.services.model_registry import EmbeddingModelRegistryABC
from typing import Any


def prepare_df(unified_df: pd.DataFrame,
               emb_model: "EmbeddingModelRegistryABC",
               full_df: pd.DataFrame,
               column_filter_name: str,
               column_filter_value: Any
               ) -> pd.DataFrame:
    """Select the documents matching a filter and attach text and embeddings.

    Rows of ``full_df`` whose ``column_filter_name`` equals
    ``column_filter_value`` are looked up by index in ``unified_df``. For those
    rows a ``text`` column (``Title`` and ``Content`` joined by one space) and
    an ``emb`` column (one embedding per row) are added.

    Args:
        unified_df: Frame with at least ``Title`` and ``Content`` columns,
            sharing index labels with ``full_df``.
        emb_model: Object exposing ``encode(texts)`` that returns one embedding
            per input text.
        full_df: Frame that carries the column to filter on.
        column_filter_name: Name of the column in ``full_df`` to compare.
        column_filter_value: Value the column must equal.

    Returns:
        A new DataFrame (``unified_df`` is not modified) with the selected
        rows plus the ``text`` and ``emb`` columns.
    """
    selected_ids = full_df.index[full_df[column_filter_name] == column_filter_value]
    # Explicit copy: the new columns must not touch (or warn about) unified_df.
    result_df = unified_df.loc[unified_df.index.isin(selected_ids)].copy()

    # Missing titles/contents become empty strings; ' '.join over the raw
    # values raises TypeError as soon as one of them is NaN/None.
    title = result_df['Title'].fillna('').astype(str)
    content = result_df['Content'].fillna('').astype(str)
    result_df['text'] = title + ' ' + content

    # list(...) yields one object per row, so the assignment also works when
    # encode returns a 2-D array rather than a list of vectors.
    result_df['emb'] = list(emb_model.encode(result_df['text'].values))
    return result_df

In [174]:
# Arguments shared by both country selections; only the country value differs.
_country_kwargs = dict(unified_df=df,
                       emb_model=emb_model,
                       full_df=full_pwdb_df,
                       column_filter_name='country')

# Documents (with text and embeddings) for each of the two countries compared.
spain_df = prepare_df(column_filter_value='Spain', **_country_kwargs)
italy_df = prepare_df(column_filter_value='Italy', **_country_kwargs)

In [188]:
# Pull the two needed columns out once per frame. iterrows() builds a Series
# for every row, and the nested form rebuilt all Italy rows for each Spain row.
spain_rows = list(zip(spain_df['Date'], spain_df['emb']))
italy_rows = list(zip(italy_df['Date'], italy_df['emb']))

# One (spain date, italy date, cosine similarity) tuple per document pair,
# Spain-major order as before.
test_result = [(date_es, date_it, cosine_similarity(emb_es, emb_it))
               for date_es, emb_es in spain_rows
               for date_it, emb_it in italy_rows]

In [189]:
# One row per (Spain document, Italy document) pair: both dates plus the score.
# NOTE(review): the name `test_df` is rebound elsewhere in this notebook to a
# synthetic random sample with different columns.
test_df = pd.DataFrame(test_result,columns=['date_spain','date_italy','sim'])

In [132]:
# Rows of the unified dataset whose Document_source is 'pwdb', wrapped in a
# fresh DataFrame object because other cells add columns to it.
pwdb_df = pd.DataFrame(df[df.Document_source == 'pwdb'])

In [133]:
# Title and body joined by one space. Missing values are treated as empty
# strings; ' '.join over the raw values raises TypeError on NaN/None.
pwdb_df['text'] = (pwdb_df['Title'].fillna('').astype(str)
                   + ' '
                   + pwdb_df['Content'].fillna('').astype(str))

In [137]:
# Sentence-embedding model from the project's model registry. Presumably the
# TF-Hub Universal Sentence Encoder, going by the method name and the TF-Hub
# log this cell prints — confirm.
emb_model = EmbeddingModelRegistry().sent2vec_universal_sent_encoding()

INFO:absl:Using /tmp/tfhub_modules to cache modules.
2021-09-02 16:37:50.720480: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-09-02 16:37:50.720506: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-09-02 16:37:50.720530: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (c004c8213d64): /proc/driver/nvidia/version does not exist
2021-09-02 16:37:50.720663: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-02 16:37:52.254642: I tensorflow/compiler/mlir/ml

In [139]:
# Embed every PWDB document text in one call. The allocator warning printed by
# this cell indicates the step is memory-heavy.
pwdb_df['emb'] = emb_model.encode(pwdb_df.text.values)

2021-09-02 16:38:16.427247: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1300638720 exceeds 10% of free system memory.


In [142]:
# encode is called with a one-element batch, so the single result is unpacked
# with [0].
emb_query = emb_model.encode([query])[0]

In [140]:
# Preview of the per-document embedding column.
pwdb_df['emb']

_id
tika/2ec585e04df9d361a951e7d26ce5d3ab06e4d17612e36ee84696c1451c972c4b    [-0.05382201448082924, -0.054840292781591415, ...
tika/66bc32e9135b8e8d1babac8dafd6c64119a519761e0e55555105a66c759e8130    [-0.05767642334103584, -0.06128883734345436, -...
tika/aaf8d11193277c5adc773cb6d57ef672483c6feb3f23f8316df3680f526e47fc    [-0.059150487184524536, -0.060041725635528564,...
tika/74734d4700b8900d64e8692f8e59635b222c9035d133b90715d68ba2599ab1da    [-0.059596702456474304, -0.06233103573322296, ...
tika/e53e7ed3ea23384e272cce94831e5c2e4b5a09dd8d1c05c762e5c99cdf1c6a7f    [-0.06001204624772072, -0.04874153062701225, 0...
                                                                                               ...                        
tika/f86bbfb9edadfcfaaa9fa1313a507970b36fb2db067e5b02f5719344dc750d61    [-0.057859156280756, -0.06392540037631989, 0.0...
tika/71fc1cad3a3a8b9de4da7080acc3a8251fa8ff8c5b7514f082f18c1cf69ab140    [0.024551253765821457, 0.005386184900999069, -...
tika/05dff4e

In [146]:
import numpy as np


def cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``.

    Args:
        u: 1-D sequence or array of numbers.
        v: 1-D sequence or array of the same length as ``u``.

    Returns:
        ``dot(u, v) / (|u| * |v|)``, or ``0.0`` when either vector has zero
        length.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        # A zero vector has no direction. Report "no similarity" rather than
        # the nan (plus RuntimeWarning) that 0/0 would produce.
        return 0.0
    return np.dot(u, v) / norm_product


In [163]:
from scipy.spatial import distance


def euclidean_similarity(a, b):
    """Turn the Euclidean distance between ``a`` and ``b`` into a similarity.

    Identical vectors score 1; the score shrinks towards 0 as the distance
    grows (``1 / (1 + distance)``).
    """
    gap = distance.euclidean(a, b)
    return 1 / (gap + 1)

In [164]:
# Similarity of every document embedding to the query embedding; the query
# vector is handed to euclidean_similarity as its second positional argument.
pwdb_df['sim'] = pwdb_df['emb'].apply(euclidean_similarity, args=(emb_query,))

In [191]:
# Column check after the 'text', 'emb' and 'sim' columns have been added.
pwdb_df.columns

Index(['Date', 'Title', 'Content', 'Document_source', 'text', 'emb', 'sim'], dtype='object')

In [158]:
# Index labels of the PWDB rows whose country is 'Spain'.
search_index = full_pwdb_df[full_pwdb_df.country == 'Spain'].index.values

In [199]:
# PWDB documents restricted to the labels in `search_index`.
t_df = pwdb_df[pwdb_df.index.isin(search_index)]

In [197]:
# Replace the 'Date' column with parsed datetimes. This overwrites the column
# in place, so frames sliced from pwdb_df before this cell ran keep the
# unparsed values.
pwdb_df['Date'] = pd.to_datetime(pwdb_df['Date'])

In [201]:
# Builds a GroupBy object only; no aggregation is applied, so the cell just
# displays the object's repr.
t_df.groupby('Date')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f1ec38209a0>

In [202]:
from plotly.graph_objs import Layout
from plotly.graph_objs.layout import XAxis, YAxis

# Imported here as well so the cell does not depend on later cells having
# already bound `datetime` (the module) and `px`.
import datetime

import plotly.express as px

# Calendar window shown on the x axis.
start_date = datetime.date(2020, 1, 1)
end_date = datetime.date(2021, 1, 1)
date_window = [start_date, end_date]
# 'sim' is produced by euclidean_similarity = 1 / (1 + distance), so it lies in
# (0, 1]. The y axis therefore needs a numeric range, not the date window.
sim_range = [0, 1]

# Grid/line styling shared by both axes.
axis_style = dict(
    showgrid=True,
    zeroline=True,
    showline=True,
    gridcolor='#bdbdbd',
    gridwidth=2,
    zerolinecolor='#969696',
    zerolinewidth=4,
    linecolor='#636363',
    linewidth=6,
)

# Similarity to the query over time for the documents selected by search_index.
fig = px.scatter(pwdb_df[pwdb_df.index.isin(search_index)], x='Date', y='sim', color='sim',
                 range_x=date_window,
                 range_y=sim_range)
layout = Layout(
    xaxis=XAxis(range=date_window, **axis_style),
    yaxis=YAxis(range=sim_range, **axis_style),
    height=600,
    width=600,
)
fig.update_layout(layout)
fig.show()


In [166]:
# Index name of a stored similarity matrix. Judging by the identifier only:
# PWDB x PWDB, TF-IDF embeddings, cosine similarity — confirm.
sim_matrix_name = 'sm_ds_pwdb_x_ds_pwdb_tfidfembeddingmodel_cosine_similarity'

In [167]:
# Load the stored similarity matrix. The progress output reports 1,658,944
# records and about 25 s, so this is the slowest load in the notebook.
sm_matrix = es_store.get_dataframe(index_name=sim_matrix_name)

100% (1658944 of 1658944) |##############| Elapsed Time: 0:00:25 Time:  0:00:25


In [170]:
# First five rows of the similarity matrix.
sm_matrix.head(5)

Unnamed: 0_level_0,ds_pwdb,cosine_similarity
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
199513,a3f5f6fc94065eb836615cdeff18a02bec206d39c862b5...,0.955423
200011,b59a4f705e6901a234e9dfb327ffca16a139f10a22a28c...,0.789576
199514,4edffa9e4df06447a6bb15a6798fb124966f87e04fe075...,0.910267
200012,2ff0f7615f65fbd064b5bcd1050e757580f2596eafa28f...,0.906806
199516,ae4977c3d12f46a920ff942f32349d4d03120fab44c530...,0.971484


In [6]:
# Unified dataset as a DataFrame.
# NOTE(review): cells placed above this one (prepare_df calls, pwdb_df) already
# read `df`; on a fresh top-to-bottom run it is not yet defined there.
df = es_store.get_dataframe(index_name=UNIFIED_DATASET)

100% (4126 of 4126) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [14]:
# Document-embedding feature store as a DataFrame.
# NOTE(review): the identical load appears a second time near the end of the
# notebook.
doc_emb = es_store.get_dataframe(index_name=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME)

100% (2490 of 2490) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [77]:
import random
import datetime


def random_date(start_date: datetime.date = datetime.date(2019, 1, 1),
                end_date: datetime.date = datetime.date(2021, 1, 1)) -> datetime.date:
    """Return a uniformly random date in ``[start_date, end_date)``.

    Called without arguments it behaves as before: a date between 2019-01-01
    (inclusive) and 2021-01-01 (exclusive), drawn from the global ``random``
    generator.

    Args:
        start_date: First date that can be returned.
        end_date: Upper bound; never returned itself.

    Returns:
        ``start_date`` plus a random whole number of days.

    Raises:
        ValueError: If ``end_date`` is not after ``start_date`` (empty range).
    """
    # Imported locally so the function keeps working after the notebook's
    # `from datetime import datetime` cell rebinds the global name `datetime`
    # to the class, which has no `timedelta` attribute.
    from datetime import timedelta

    span_days = (end_date - start_date).days
    return start_date + timedelta(days=random.randrange(span_days))

In [69]:
# Smoke test of random_date.
# NOTE(review): the saved output also shows "731", which the current function
# body does not print — the output looks stale.
random_date()

731


datetime.date(2020, 5, 13)

In [90]:
import pandas as pd

# Fixed seed so the synthetic sample, and the plots drawn from it, are the same
# on every run.
SAMPLE_SEED = 42
random.seed(SAMPLE_SEED)

sample_size = 100
# Two independent, chronologically sorted draws of random dates.
date_x = sorted(random_date() for _ in range(sample_size))
date_y = sorted(random_date() for _ in range(sample_size))

# Every (date_x, date_y) combination paired with a random similarity in [0, 1).
test_df = pd.DataFrame([(dx, dy, random.random())
                        for dx in date_x for dy in date_y
                        ], columns=['date_x', 'date_y', 'similarity'])

In [85]:
# Display of the synthetic sample frame.
test_df

Unnamed: 0,date_x,date_y,similarity
0,2019-05-26,2019-01-22,0.471705
1,2019-05-26,2019-03-15,0.400265
2,2019-05-26,2019-05-24,0.825433
3,2019-05-26,2019-08-28,0.581902
4,2019-05-26,2020-01-25,0.716074
...,...,...,...
95,2020-11-25,2020-04-13,0.877905
96,2020-11-25,2020-04-30,0.671117
97,2020-11-25,2020-06-03,0.440572
98,2020-11-25,2020-09-09,0.961443


In [87]:
import plotly.express as px

# 3-D scatter of the sample: the two date columns on x and y, similarity on z
# and as the colour scale.
fig = px.scatter_3d(test_df, x='date_x', y='date_y', z='similarity', color='similarity')
fig.show()

In [105]:
from plotly.graph_objs import Layout
from plotly.graph_objs.layout import XAxis, YAxis

# Calendar window applied to both axes (x and y are both dates in this plot).
start_date = datetime.date(2019, 1, 1)
end_date = datetime.date(2021, 1, 1)
date_window = [start_date, end_date]

# Range and grid/line styling are the same for x and y, so declare them once.
axis_style = dict(
    range=date_window,
    showgrid=True,
    zeroline=True,
    showline=True,
    gridcolor='#bdbdbd',
    gridwidth=2,
    zerolinecolor='#969696',
    zerolinewidth=4,
    linecolor='#636363',
    linewidth=6,
)

# 2-D view of the sample: date against date, coloured by similarity.
fig = px.scatter(test_df, x='date_x', y='date_y', color='similarity',
                 range_x=date_window,
                 range_y=date_window)
layout = Layout(xaxis=XAxis(**axis_style),
                yaxis=YAxis(**axis_style),
                height=600,
                width=600)
fig.update_layout(layout)
fig.show()



In [18]:
# Index labels of the embedding feature store (hash-like strings in the output).
doc_emb.index.values

array(['003ea2a9313db86a3e0b9cd272d4dd861ed92cfab3a5d120725211cc113b1f93',
       '004a77d6db5f3be729b4b3d8b7eb0ccb564defb037a0edcba96eb5c164d87a37',
       '00abfb8547a6cfc95a4591e489431fd7803cbb8f51db632db1e462096e914c75',
       ...,
       '77d7e3c52aaf78bdfb1a1667641db1293bbff862440c547fc3f6e8ab4fbd0d4e',
       'be8e21b382b63dd838087d7864e4d49a4807d9cc5d71aaa82577383ece92c581',
       'c434820f8f8c0b1447ba2f6f62aed177b9597532d7fcd156c6f556532dd7d11e'],
      dtype=object)

In [20]:
# Index labels of `df` (set1) and of the embedding feature store (set2), as
# sets so that their overlap can be computed.
set1 = set(df.index.values)
set2 = set(doc_emb.index.values)

In [23]:
# Labels present in both frames.
set1.intersection(set2)

{'e05b2722522b275e87cff18bb2cbb38a6648d08c3524343b769d50893ab5f4d9',
 '9a0674489638f34055d7f103731b7d4f4ca1c716c91b0731294bd3785c042026',
 '6d32c0115b65d0a184dc30c7aa8f1a16862cee7ed04dabae99a25aeaa809ba7e',
 'acb785060c5c8e8f09ce9024267047f7a47c13e70a34cd4467dc9b7027f9b07c',
 'c70a6b037776d04b1cdde3656c08e40f8d2738f05c6249f00009af51a72b1c0b',
 '8518fe3bffd37a7083b5f0a80dc0151afbd21c4965a7daf83f67b242bcfaddc3',
 '7f52ed4d80a57b9b4d498a2c677117a56147a7d85bc56a7fc61b69c5858c3f42',
 '635a75bf3be44401a899f12cb8377ccdf0e5f34146e2ae577534cb6fef293dd6',
 'bd728a7a5aee52c9db6ede00b3dd78d34ddac29d1d2aeec8ae0c8f005ee02db6',
 '9f4d58fb1b700b8bb7f2576667894a3a255b4d5593b253dc70196360d81591ef',
 'e811c111e6aa2b9ead0196132a53accef97cedd65f3ae64e1eddd1c1d96ef191',
 '2094248a1a106cc1f170ec23b2f1aef2a0b7bf0847783d9d3a077d40e6938865',
 '280ae582f17ad680a474c9fea81cad05824fefc0feaa4ec2a17abcedd23e4018',
 '16d3353567d1d20355e9b452c1ded68d053c14c6a117e793525eb2df4775855a',
 '7885321da9490c42391c8cb00e928df9

In [24]:
# Index names of the four source datasets taken from the project config, in the
# order they are loaded: PWDB, EU Cellar, EU timeline, Ireland timeline.
configs = [config.PWDB_ELASTIC_SEARCH_INDEX_NAME,
           config.EU_CELLAR_ELASTIC_SEARCH_INDEX_NAME,
           config.EU_TIMELINE_ELASTIC_SEARCH_INDEX_NAME,
           config.IRELAND_TIMELINE_ELASTIC_SEARCH_INDEX_NAME
           ]

In [25]:
# One DataFrame per source index, in the same order as `configs`.
df_list = [es_store.get_dataframe(index_name=index) for index in configs]

100% (1288 of 1288) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (2818 of 2818) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (171 of 171) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1859 of 1859) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [28]:
# Print the column names of every source frame.
# NOTE(review): the loop variable rebinds the notebook-level name `df`; after
# this cell `df` is the last frame in df_list, not the unified dataset. Cells
# that read `df` afterwards may depend on that — confirm before renaming.
for df in df_list:
    print(df.columns)

Index(['identifier', 'title', 'title_national_language', 'country',
       'start_date', 'end_date', 'date_type', 'type_of_measure',
       'status_of_regulation', 'category', 'subcategory', 'creation_date',
       'background_info_description', 'content_of_measure_description',
       'use_of_measure_description', 'actors', 'target_groups', 'funding',
       'involvement_of_social_partners_description',
       'social_partner_involvement_form', 'social_partner_role',
       'is_sector_specific', 'private_or_public_sector',
       'is_occupation_specific', 'sectors', 'occupations', 'sources'],
      dtype='object')
Index(['work', 'title', 'cdm_types', 'cdm_type_labels', 'resource_types',
       'resource_type_labels', 'eurovoc_concepts', 'eurovoc_concept_labels',
       'subject_matters', 'subject_matter_labels', 'directory_codes',
       'directory_codes_labels', 'celex_numbers', 'legal_elis', 'id_documents',
       'same_as_uris', 'authors', 'author_labels', 'full_ojs', 'oj_sectors',

In [27]:
# start_date column of the first loaded frame (the PWDB index, per the order of
# `configs`); the output shows MM/DD/YYYY strings with dtype object.
df_list[0].start_date

_id
adc5c75937bc7f7198f534d08b85bd50c9521bfd3f319a090932b5d0bae54de0    10/20/2020
2372d71eb9ad6e6a70982e02bbe802db004ed49d91b2264c0a2e8e41571002cc    05/06/2020
8735e268191e9e5cbd3d2a44ca53d297e31746b5f1e24b941db6225a25848353    08/25/2020
18bcd22116c46919e03a3345f793c3859855227ac942e69dd13cbfcd588e1044    03/01/2020
b94d8aa95fbdeb1bb832b01fbe5d6e9bf9fc36fceb14f7ba370a963f472fe35b    01/01/2021
                                                                       ...    
cb014a456b14c3621dd318a12e611f70c2a9636be9fe181072bd4bf5917a40fa    03/18/2020
d233b17dc2b98f14269c2b22be78d93ec5ccf2a0013b86f09175c69353c5800b    05/20/2020
77d7e3c52aaf78bdfb1a1667641db1293bbff862440c547fc3f6e8ab4fbd0d4e    03/17/2020
be8e21b382b63dd838087d7864e4d49a4807d9cc5d71aaa82577383ece92c581    07/01/2020
c434820f8f8c0b1447ba2f6f62aed177b9597532d7fcd156c6f556532dd7d11e    09/03/2020
Name: start_date, Length: 1288, dtype: object

In [17]:
import time
from datetime import datetime

In [21]:
# Raw updated_date strings (e.g. '17 February 2021' in the output).
# NOTE(review): which frame `df` is at this point depends on execution order;
# it is rebound both by the unified-dataset load and by the
# `for df in df_list` loop — confirm.
df.updated_date

_id
003ea2a9313db86a3e0b9cd272d4dd861ed92cfab3a5d120725211cc113b1f93    17 February 2021
004a77d6db5f3be729b4b3d8b7eb0ccb564defb037a0edcba96eb5c164d87a37       22 April 2021
00ab798bb7ed96ce3b93b9a3488522a58853bcaaf1bfa1bcf05fd8ffd8af0ce1    18 December 2020
00abfb8547a6cfc95a4591e489431fd7803cbb8f51db632db1e462096e914c75        5 March 2021
01361189adf68b9a6d91beffd9f0b8259d5764ba865aab9dae3982545deeeb8f    16 November 2020
                                                                          ...       
fddf8d1e3ce04ef12941ac7a83a2e85806c84d83496f41b0cc6d98bb0d8910e8     30 January 2021
fde72094e795e2f9a055d1c3fc581985e028ddcaeb436c1d5056f109c6e853d0       15 March 2021
fefa88e8bcb8f19c1794526db7e241a1201a7aa6f05fedd25c09cee4b35c46f0         18 May 2020
ff474a2f55ba0a59240e4e986870ba12be9f2238bec4f4f8ca2c9c712172ed43     8 February 2021
ff4ea2ef933287c4125d898245e878ae9fcb1b356b3c60de288dd97020bdd3a5     23 January 2021
Name: updated_date, Length: 846, dtype: object

In [22]:
# Parse 'DD Month YYYY' strings in one vectorised call and show them in
# chronological order. pd.to_datetime does not depend on whether the global
# name `datetime` is currently the module or the class (the notebook binds
# both), and it maps missing values to NaT instead of raising.
pd.to_datetime(df.updated_date, format='%d %B %Y').sort_values()

_id
bda5b336f5422961710173eb8d3410f9a7de84730384e5a0a9e4bca7944d8ef2   2020-02-06
3081ea5944895377e13ba7099afd23af5e5e6758353487c34b5b66a7208eb320   2020-02-11
07e10eabe926db01330fe75c8e24204d2b2b0f7b452b22a9819bd942f589068a   2020-02-11
f16d3a88f070b5bffcd6468060b7cd65aec942fdb7bed206554a35e7397f59f3   2020-02-18
48d020cbb50ca48a690b248341563ef320baa2dc0c733c8f46d4f9e99e0a0109   2020-02-26
                                                                      ...    
2ef0d1006a3ae21968f4805e4427a93f33dea7bd576dfa2b1d51c721dbbce336   2021-07-30
840f37b71ae8997f9708ee15e84424323edb8133fd13df26fa16b5fdd444c15a   2021-08-03
7b90e6440a9e2bd26a05f6f56f49dd7c80814e43480854be1debe461c63bdbb7   2021-08-04
ef6430aadbb96ea6654b296f6ba51a6d450a398bc6e5e7d9262f60a4fc3a6a2a   2021-08-05
f6adde50065d7eb9bbc6ee2585c90c5e90810aba4d966ceb3282bf0961aef0a3   2021-08-10
Name: updated_date, Length: 846, dtype: datetime64[ns]

In [10]:
# Document-embedding feature store as a DataFrame.
# NOTE(review): same load as the earlier `doc_emb = ...` cell; one of the two
# is redundant.
doc_emb = es_store.get_dataframe(index_name=DOCUMENT_EMBEDDINGS_FEATURE_STORE_NAME)

100% (2490 of 2490) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [16]:
# Columns available in the embedding feature store.
doc_emb.columns

Index(['text', 'source', 'segment_type', 'segment_id', 'prepare_method',
       'textual_columns', 'embedding', 'embedding_method'],
      dtype='object')