In [1]:
import sys
sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))
import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')


import faiss
import pickle
import pandas as pd

from sem_covid.entrypoints.notebooks.legal_radar.services.split_documents_pipeline import DOCUMENT_ID_SOURCE
from sem_covid.services.store_registry import store_registry
from sem_covid.services.model_registry import embedding_registry
from sem_covid import config
import numpy as np
from more_itertools import unique_everseen

In [63]:
sample_questions = pd.read_csv('sem_covid/entrypoints/notebooks/legal_radar/docs/sample_questions_v4.csv').dropna()
source_documents = sample_questions['Source Document'].to_list()
celex_number = sample_questions['Celex No'].to_list()
work_reference = sample_questions['Work Reference'].to_list()

In [56]:
sample_questions.columns

Index(['No', 'Questions/Text Extracts', 'Source Document', 'Article No',
       'Celex No', 'Work Reference', 'ELI'],
      dtype='object')

In [2]:
FAISS_BUCKET_NAME = 'faiss-index'
FAISS_INDEX_FINREG_NAME = 'faiss_index_finreg.pkl'
FIN_REG_SPLITTED_ES_INDEX = 'ds_finreg_splitted'
DATES_DOCUMENT = 'dates_document'
HTML_LINKS = 'htmls_to_download'
DEFAULT_SEARCH = """The Semantic Interoperability Community develops solutions to help European public administrations perform seamless and meaningful cross-border and cross-domain data exchanges."""
TEXT_PIECE = 'text_piece'

In [3]:
def load_documents():
    """Read the data from ES."""
    es_store = store_registry.es_index_store()
    df = es_store.get_dataframe(index_name=config.EU_FINREG_CELLAR_ELASTIC_SEARCH_INDEX_NAME)
    df[DATES_DOCUMENT] = pd.to_datetime(df[DATES_DOCUMENT]).dt.date
    return df

def load_splitted_documents():
    """Read the data from ES."""
    es_store = store_registry.es_index_store()
    return es_store.get_dataframe(index_name=FIN_REG_SPLITTED_ES_INDEX)

def load_emb_model():
    return embedding_registry.sent2vec_universal_sent_encoding()

def load_faiss_index():
    """Load and deserialize the Faiss index."""
    minio_store = store_registry.minio_object_store(minio_bucket=FAISS_BUCKET_NAME)
    data = pickle.loads(minio_store.get_object(object_name=FAISS_INDEX_FINREG_NAME))
    return faiss.deserialize_index(data)

def main(user_input: str):
    documents = load_documents()
    splitted_documents = load_splitted_documents()
    model = load_emb_model()
    faiss_index = load_faiss_index()
    num_results = 100
    embeddings = model.encode(sentences=[user_input])
    D, I = faiss_index.search(np.array(embeddings).astype("float32"), k=num_results)
    document_parts = splitted_documents.iloc[I.flatten().tolist()]
    document_parts['similarity'] = D.flatten().tolist()
    documents_id = list(unique_everseen(
    splitted_documents.iloc[I.flatten().tolist()][DOCUMENT_ID_SOURCE].values))

    return documents.loc[documents_id]

In [16]:
explorer = main('REGULATION (EU) No 648/2012 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL of 4 July 2012 on OTC derivatives, central counterparties and trade repositories')

100% (5779 of 5779) |####################| Elapsed Time: 0:00:03 Time:  0:00:03
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [59]:
explorer

Unnamed: 0_level_0,work,title,cdm_types,cdm_type_labels,resource_types,resource_type_labels,eurovoc_concepts,eurovoc_concept_labels,subject_matters,subject_matter_labels,...,pdfs_to_download,htmls_to_download,dossiers,related_works,work_sequences,core,metadata,content_path,content,language
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2225a7e0331142c1ff8c370ab0ff8d7f07bbb85fcbd5f6f15e17dfdf4decbeba,http://publications.europa.eu/resource/cellar/...,Regulation (EU) 2021/168 of the European Parli...,[http://publications.europa.eu/ontology/cdm#le...,,[http://publications.europa.eu/resource/author...,[Regulation],"[http://eurovoc.europa.eu/1354, http://eurovoc...","[benchmarking, consumer protection, financial ...",[http://publications.europa.eu/resource/author...,[Freedom of establishment],...,[http://publications.europa.eu/resource/cellar...,[http://publications.europa.eu/resource/cellar...,,,,True,True,[res/ad21a7733098ab518075bf26d392ef36881a5610c...,L_2021049EN.01000601.xml 12.2.2021 EN Official...,
1711ad72d00bfc03927e7446cb4773a04fd2a5b5288fcb48102265a657219d81,http://publications.europa.eu/resource/cellar/...,Regulation (EU) 2019/2099 of the European Parl...,[http://publications.europa.eu/ontology/cdm#le...,,[http://publications.europa.eu/resource/author...,[Regulation],"[http://eurovoc.europa.eu/1459, http://eurovoc...","[European Securities and Markets Authority, Eu...",[http://publications.europa.eu/resource/author...,"[Free movement of capital, Internal market - P...",...,[http://publications.europa.eu/resource/cellar...,[http://publications.europa.eu/resource/cellar...,,,,True,True,[res/d62f90a9795a021c790c3443e50173141acb86ac9...,L_2019322EN.01000101.xml 12.12.2019 EN Officia...,
833c1dd78f9a649f9b8b77128e4783df14ea224930697bb4de6c76662cb55884,http://publications.europa.eu/resource/cellar/...,Commission Delegated Regulation (EU) 2020/1732...,[http://publications.europa.eu/ontology/cdm#le...,,[http://publications.europa.eu/resource/author...,[Delegated regulation],"[http://eurovoc.europa.eu/1488, http://eurovoc...","[European Securities and Markets Authority, ba...",[http://publications.europa.eu/resource/author...,"[Freedom of establishment, Internal market - P...",...,[http://publications.europa.eu/resource/cellar...,[http://publications.europa.eu/resource/cellar...,,,,True,True,[res/82fec7b7c11313f3ddf488dd25d4e9ef91129772d...,L_2020390EN.01000101.xml 20.11.2020 EN Officia...,
1b5190ed21009069932abbd6a5879fb9a3b94437f55578b1c5cab889fcbbaf2f,http://publications.europa.eu/resource/cellar/...,Commission Delegated Regulation (EU) 2021/962 ...,[http://publications.europa.eu/ontology/cdm#le...,,[http://publications.europa.eu/resource/author...,[Delegated regulation],"[http://eurovoc.europa.eu/3751, http://eurovoc...","[financial derivative, financial legislation, ...",[http://publications.europa.eu/resource/author...,"[Free movement of capital, Freedom of establis...",...,[http://publications.europa.eu/resource/cellar...,[http://publications.europa.eu/resource/cellar...,,,,True,True,[res/757e7b519b130776c91830ba34ec2e2f2df2a20cc...,L_2021213EN.01000101.xml 16.6.2021 EN Official...,
4b2303f43b73ea0945a644e815573f15814615d207681459a7e80c576a3ff053,http://publications.europa.eu/resource/cellar/...,Directive 2014/59/EU of the European Parliamen...,[http://publications.europa.eu/ontology/cdm#di...,,[http://publications.europa.eu/resource/author...,[Directive],"[http://eurovoc.europa.eu/1485, http://eurovoc...","[aid for restructuring, company in difficultie...",[http://publications.europa.eu/resource/author...,[Economic policy],...,[http://publications.europa.eu/resource/cellar...,[http://publications.europa.eu/resource/cellar...,,,,True,True,[res/c2cdc915102721e4211b4d6d7b228ef2cc6b20d68...,L_2014173EN.01019001.xml 12.6.2014 EN Official...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
932a4b13cf8205003ad9ebbdc03d845c111594bbf834841fbeca29131b9c3fc2,http://publications.europa.eu/resource/cellar/...,Regulation (EU) 2021/1056 of the European Parl...,[http://publications.europa.eu/ontology/cdm#le...,,[http://publications.europa.eu/resource/author...,[Regulation],"[http://eurovoc.europa.eu/1052, http://eurovoc...","[Community support framework, adaptation to cl...",[http://publications.europa.eu/resource/author...,"[Economic, social and territorial cohesion, En...",...,[http://publications.europa.eu/resource/cellar...,[http://publications.europa.eu/resource/cellar...,,,,True,True,[res/0ecc04a48a56bf6072c0dcfc92358222add1e236c...,L_2021231EN.01000101.xml 30.6.2021 EN Official...,
e129e270ac52d1df145b5bb148669ab549e6e86602ac49314a8e0854944f6d06,http://publications.europa.eu/resource/cellar/...,Council Regulation (EU) 2016/369 of 15 March 2...,[http://publications.europa.eu/ontology/cdm#le...,,[http://publications.europa.eu/resource/author...,[Regulation],"[http://eurovoc.europa.eu/1005, http://eurovoc...","[EU aid, EU financing, aid to disaster victims...",[http://publications.europa.eu/resource/author...,"[Financial provisions, Humanitarian aid, Immig...",...,[http://publications.europa.eu/resource/cellar...,[http://publications.europa.eu/resource/cellar...,,,,True,True,[res/30b93b5252cba1f2c5da9c69236716e5b4a0e34d8...,L_2016070EN.01000101.xml 16.3.2016 EN Official...,
9c59c922147534917d4f21266e733e404f80aca6b382f563d1876797c3bf2701,http://publications.europa.eu/resource/cellar/...,Regulation (EU) 2019/2033 of the European Parl...,[http://publications.europa.eu/ontology/cdm#le...,,[http://publications.europa.eu/resource/author...,[Regulation],"[http://eurovoc.europa.eu/189, http://eurovoc....","[capital market, disclosure of information, fi...",[http://publications.europa.eu/resource/author...,"[Financial provisions, Freedom of establishmen...",...,[http://publications.europa.eu/resource/cellar...,[http://publications.europa.eu/resource/cellar...,,,,True,True,[res/51c0b8d25fdc0ba731032b34ec8782e5a48ca3bfa...,L_2019314EN.01000101.xml 5.12.2019 EN Official...,
6d2890bbcb0d611c17dcbee8a697195615ea0c39c3dfb6e0a2d2f283e6e498d1,http://publications.europa.eu/resource/cellar/...,Directive (EU) 2019/2034 of the European Parli...,[http://publications.europa.eu/ontology/cdm#le...,,[http://publications.europa.eu/resource/author...,[Directive],"[http://eurovoc.europa.eu/189, http://eurovoc....","[capital market, disclosure of information, fi...",[http://publications.europa.eu/resource/author...,"[Approximation of laws, Financial provisions, ...",...,[http://publications.europa.eu/resource/cellar...,[http://publications.europa.eu/resource/cellar...,,,,True,True,[res/0955ce43b7565b433e3868b7ab55e5ba0ac30a742...,L_2019314EN.01006401.xml 5.12.2019 EN Official...,


_id
2225a7e0331142c1ff8c370ab0ff8d7f07bbb85fcbd5f6f15e17dfdf4decbeba    http://publications.europa.eu/resource/cellar/...
1711ad72d00bfc03927e7446cb4773a04fd2a5b5288fcb48102265a657219d81    http://publications.europa.eu/resource/cellar/...
833c1dd78f9a649f9b8b77128e4783df14ea224930697bb4de6c76662cb55884    http://publications.europa.eu/resource/cellar/...
1b5190ed21009069932abbd6a5879fb9a3b94437f55578b1c5cab889fcbbaf2f    http://publications.europa.eu/resource/cellar/...
4b2303f43b73ea0945a644e815573f15814615d207681459a7e80c576a3ff053    http://publications.europa.eu/resource/cellar/...
                                                                                          ...                        
932a4b13cf8205003ad9ebbdc03d845c111594bbf834841fbeca29131b9c3fc2    http://publications.europa.eu/resource/cellar/...
e129e270ac52d1df145b5bb148669ab549e6e86602ac49314a8e0854944f6d06    http://publications.europa.eu/resource/cellar/...
9c59c922147534917d4f21266e733e404f80aca6b382f563d187

In [66]:
id_documents = explorer['id_documents'].reset_index(drop=True).to_list()
work = explorer['work'].reset_index(drop=True).to_list()

In [52]:
celex_id = [document_id[0][6:] for document_id in id_documents]

In [None]:
for reference in work:
    print('http://publications.europa.eu/resource/cellar/665d41b1-2085-4584-af8a-45c605d62b94' in reference)

In [71]:
for id in celex_id:
    print('32013R1286' in id)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [73]:
from sem_covid.services.data_registry import Dataset

In [80]:
df1 = Dataset.EU_ACTION_TIMELINE.fetch()

100% (210 of 210) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


In [82]:
df1['detail_content']

_id
6f7b5c48577b676dfe245c235f95d908de88065835d3af3502227eb167af578c                                 Today the Commission
7048376c33c84aacd6cd6afc3b30c9d0c66daf2cf1e889437bd7a35d9dbab3be    The European Commission has disbursed 13 billi...
7245825eaf114273bfb8056794db8636c14d08eb5bccf245139547b1b34c256c    Today, the Commission presents a package of gu...
72d78db8a2d763bd09c7673886a53b9eb89916fb64eb939b4d851b9bf13bf418    The European Commission has set up an EU Human...
72ec8adbd87ac5a7d6b0aa0f0efce3eee0b0e2c09ef33c9b9b6a3159ce1dde77    Today, the President of the European Commissio...
                                                                                          ...                        
423ca5cb89149ce41a92b141d5739b6559052af9cec0ee95918f7fcc0072ff7a    The Commission has today adopted a banking pac...
4260d4c621ffcbe1d78a579093b37e172aea0913fb09bef2fe2b4bfec55d91be    The Commission has today proposed modification...
43509bc1f4de25504deb7e0b95069e2813b9b518fc68216cc29f