In [2]:
import os
import sys

# NOTE(review): absolute, machine-specific location of the project checkout;
# consider reading it from an environment variable so the notebook runs elsewhere.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Append only when missing. Rebuilding sys.path through set() would also remove
# duplicates, but it discards the search order, so which module wins an import
# could change from one kernel start to the next.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Relative paths used further down (e.g. the sample-questions CSV) are resolved
# against the project root.
os.chdir(PROJECT_ROOT)

import pickle
from more_itertools import unique_everseen

import faiss
import pandas as pd
import numpy as np

from sem_covid.entrypoints.notebooks.legal_radar.services.split_documents_pipeline import DOCUMENT_ID_SOURCE
from sem_covid.services.store_registry import store_registry
from sem_covid.services.model_registry import embedding_registry
from sem_covid import config
from sem_covid.services.sc_wrangling.feature_selector import reduce_array_column

In [3]:
# NOTE(review): not referenced by any visible cell. Presumably slider settings
# (min year, max year, default range, step) copied from a UI — confirm or remove.
filter_year = [1900, 2021, (1900, 2021), 1]

In [4]:
# MinIO bucket and object name read by load_faiss_index().
FAISS_BUCKET_NAME = 'faiss-index'
FAISS_INDEX_FINREG_NAME = 'faiss_index_finreg.pkl'
# Elasticsearch index read by load_splitted_documents().
FIN_REG_SPLITTED_ES_INDEX = 'ds_finreg_splitted'
# Column of the documents frame that load_documents() converts to plain dates.
DATES_DOCUMENT = 'dates_document'
# NOTE(review): not referenced in the visible cells; presumably a column name of the documents frame — confirm.
HTML_LINKS = 'htmls_to_download'
# NOTE(review): not referenced in the visible cells; presumably a default query for a search UI — confirm.
DEFAULT_SEARCH = """The Semantic Interoperability Community develops solutions to help European public administrations perform seamless and meaningful cross-border and cross-domain data exchanges."""
# NOTE(review): defined, but the visible cells use the literal 'text_piece' / `.text_piece` instead.
TEXT_PIECE = 'text_piece'

In [5]:
def load_documents():
    """Fetch the fin-reg documents from Elasticsearch, with the document dates as plain `date` objects."""
    store = store_registry.es_index_store()
    documents_df = store.get_dataframe(index_name=config.EU_FINREG_CELLAR_ELASTIC_SEARCH_INDEX_NAME)
    # Parse first, then strip the time component so only the calendar date remains.
    parsed_dates = pd.to_datetime(documents_df[DATES_DOCUMENT])
    documents_df[DATES_DOCUMENT] = parsed_dates.dt.date
    return documents_df


def load_splitted_documents():
    """Fetch the split document slices from the `FIN_REG_SPLITTED_ES_INDEX` Elasticsearch index."""
    return store_registry.es_index_store().get_dataframe(index_name=FIN_REG_SPLITTED_ES_INDEX)


def load_emb_model():
    """Return the sentence-embedding model registered as `sent2vec_universal_sent_encoding`."""
    embedding_model = embedding_registry.sent2vec_universal_sent_encoding()
    return embedding_model


def load_faiss_index():
    """Fetch the pickled Faiss index from MinIO and rebuild the index object."""
    object_store = store_registry.minio_object_store(minio_bucket=FAISS_BUCKET_NAME)
    pickled_index = object_store.get_object(object_name=FAISS_INDEX_FINREG_NAME)
    # NOTE(review): unpickling runs arbitrary code if the stored object has been
    # tampered with — only load from a bucket you trust.
    serialized_index = pickle.loads(pickled_index)
    return faiss.deserialize_index(serialized_index)


# Load everything once at module level; the search function and the scratch cells
# below read these four names as globals.
documents = load_documents()
splitted_documents = load_splitted_documents()
model = load_emb_model()
faiss_index = load_faiss_index()


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:25 Time:  0:00:25
INFO:absl:Using /tmp/tfhub_modules to cache modules.


In [739]:
def semantic_search(user_input: str, num_results: int = 100):
    """
    Find the text slices closest to `user_input` and group them by source document.

    Reads the module-level `model`, `faiss_index`, `splitted_documents` and `documents`.

    Args:
        user_input: free-text query, embedded with `model.encode`.
        num_results: number of nearest slices requested from the Faiss index.

    Returns:
        A list with one DataFrame per source document, in order of the document's
        first appearance in the Faiss ranking. Each frame contains that document's
        matching rows of `splitted_documents` plus a `similarity` column holding the
        value returned by `faiss_index.search`.
        NOTE(review): the recorded outputs list `similarity` in ascending order, so it
        looks like a distance (smaller = closer) — confirm against the index type.

    Raises:
        KeyError: if a retrieved slice points to a document id missing from `documents`.

    NOTE(review): `SemanticSearchEvaluation.execute_semantic_search` in this notebook
    treats the return value as a single DataFrame with `celex_numbers` / `content`
    columns, which does not match this list — reconcile before re-running the evaluation.
    """
    query_embedding = np.array(model.encode(sentences=[user_input])).astype("float32")
    distances, positions = faiss_index.search(query_embedding, k=num_results)
    distances, positions = distances.flatten(), positions.flatten()

    # Faiss pads the labels with -1 when it finds fewer than k neighbours; passing
    # -1 to iloc would silently select the last row, so drop those entries.
    found = positions >= 0

    # Copy the selected rows so adding `similarity` never touches `splitted_documents`.
    document_parts = splitted_documents.iloc[positions[found].tolist()].copy()
    document_parts['similarity'] = distances[found].tolist()

    documents_id = list(unique_everseen(document_parts[DOCUMENT_ID_SOURCE].values))
    frame = documents.loc[documents_id]

    # One frame per document, holding only that document's slices.
    return [document_parts[document_parts[DOCUMENT_ID_SOURCE] == index] for index in frame.index]

In [6]:
# Scratch run of the search steps at module level, so that the intermediate
# frames (`document_parts`, `frame`) can be inspected by the cells below.
user_input = "Hello World!"
num_results = 100

embeddings = model.encode(sentences=[user_input])
query_vectors = np.array(embeddings).astype("float32")
D, I = faiss_index.search(query_vectors, k=num_results)

# Row positions of the retrieved slices, in ranking order.
hit_positions = I.flatten().tolist()
document_parts = pd.DataFrame(splitted_documents.iloc[hit_positions])
document_parts['similarity'] = pd.Series(D.flatten().tolist()).values

# Distinct source documents, kept in order of first appearance in the ranking.
documents_id = list(unique_everseen(document_parts[DOCUMENT_ID_SOURCE].values))
frame = documents.loc[documents_id]

In [22]:
# For every retrieved document, collect the text of its matching slices and the
# similarity value of each slice.
documents_slices = {}
for document_id in frame.index:
    matching_parts = document_parts.loc[document_parts[DOCUMENT_ID_SOURCE] == document_id]
    documents_slices[document_id] = {
        'document_slices': matching_parts['text_piece'].values,
        'document_slices_similarity': matching_parts['similarity'].values,
    }

In [28]:
# Transpose so that each document id becomes a row with the two slice columns.
tmp_df = pd.DataFrame(documents_slices).T

In [35]:
# Round every similarity value to four decimals for display.
tmp_df.document_slices_similarity = tmp_df.document_slices_similarity.apply(
    lambda similarities: np.array([round(value, 4) for value in similarities]))

In [49]:
# Attach the per-document slice columns to the document rows (joined on the index).
result_df  = frame.join(tmp_df)

In [53]:
# Show each document's text next to its matching slices and their similarity values.
result_df[['content', 'document_slices', 'document_slices_similarity']]

Unnamed: 0_level_0,content,document_slices,document_slices_similarity
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
69d03cfd8f7c89d99cb194d405a516d8d9e204e514165a4f504435832ba026ab,23.12.2016 EN Official Journal of the European...,[The obligation to transpose the provisions wh...,[1.5111]
26544e68ef92c113faea37d761bb4505026e1e549b7f36d62ecd5c733bb84b9f,3.3.2015 EN Official Journal of the European U...,[(22) In order to allow for timely programming...,"[1.5199, 1.5644]"
01d0a59e47da00c8c01280dda46a7a1eb6a5406fe3ca0a2edd3a8d46ab5deadc,2.12.2020 EN Official Journal of the European ...,[That committee shall be a committee within th...,[1.5328]
109a6c3c02c83073c6c9215f7d31ee57a85276aec657cd4ff7da743f09bf7f64,2.7.2019 EN Official Journal of the European U...,[Yes Yes 3 Installation name M Free Yes Yes Ye...,"[1.5432, 1.5559, 1.5571, 1.5801]"
d049ef633bc0680ba095d35f8bec7d4f2f552e2b3dc1443429cf66acf4781a0b,28.12.2020 EN Official Journal of the European...,[The thematic objective referred to in the fir...,"[1.5448, 1.6106]"
...,...,...,...
7a83ba354a82be4c790295a74e1e67935efec6806ff71febacb96dc8baf8b5c3,29.10.2008 EN Official Journal of the European...,"[Where reference is made to this paragraph, Ar...",[1.6136]
34646feed14bb1b91dd1a94e45a26576e5b955b9644f721b8b402b086d2fa79d,5.8.2020 EN Official Journal of the European U...,[2. Notwithstanding paragraph 1 of this Articl...,[1.6142]
88efb39c5ccf58e7447708afabbe380c2ee77c70687e5f4384193a55c293dd49,12.6.2014 EN Official Journal of the European ...,[Where a new benchmark is developed after 3 Ja...,[1.6146]
edccd1d0813f3171edd3d29c81afb4c25e863139c7a376bad3e3ab1c2a8191d8,7.6.2019 EN Official Journal of the European U...,[Where an institution has a subsidiary which i...,[1.6149]


In [386]:
# NOTE(review): this queries Elasticsearch again (about 25 s per the recorded
# progress output) although the same index was already loaded into `splitted_documents`.
doc = load_splitted_documents()

100% (69866 of 69866) |##################| Elapsed Time: 0:00:25 Time:  0:00:25


In [390]:
# Inspect the text of the split slices.
doc['text_piece']

_id
b1b019e1b3e8acaada8bf3a6f1375559f7c055503a998e18929c083874f20845    Council Regulation (EC) No 695/2005 of 26 Apri...
f0a9df6bcc52a8c3141143e6f5c930f1229b0cdd86c410f81ba5ac36382c422c    THE COUNCIL OF THE EUROPEAN UNION, Having rega...
918fae7f7068d4543bd02df2a3cc71074c6e7c00c828c5ad03e2bb00229eadf5    (5) Application of this mechanism should be te...
5100524aeb02b779514c85d9c16996f642350643699b1312bee1ea5cf550d224    It shall apply to expenditure incurred from 1 ...
ab3f9e3bb321a7a03419439e620abc5628effc8928589da662a04967269a90be    Council Regulation (EC) No 3234/94 of 20 Decem...
                                                                                          ...                        
bbfe3907ed0ab3745e63d568a501cc492bb9a6a152e0fb492741f06e559a6f26    OSOBA/ (SL) V JAVNI LISTINI, KI JI JE PRILOZEN...
4c083494d2d65d1bf113d32aba6f69169a8af7c91dfd48375dd803334bd2ece2    FORMULAR AR BIFOGAT NAMNS INGEN UPPGIFT OM DOM...
62bef0c8780712d7d68fa61626b5e9ba2b5e494b579a2e88a8c1

In [385]:
# NOTE(review): `sample_questions` is defined in the cell *below* this one, so this
# cell fails on Restart & Run All — move it after the CSV is loaded.
sample_questions['Questions/Text Extracts'][0]

'‘clearing’ means the process of establishing positions, including the calculation of net obligations, and ensuring that financial instruments, cash, or both, are available to secure the exposures arising from those positions;'

In [356]:
# Reference questions for the evaluation (query text, source document, CELEX number,
# work reference/ELI URLs). Rows with any missing value are dropped.
# The relative path is resolved against the working directory set in the first cell.
sample_questions = pd.read_csv('sem_covid/entrypoints/notebooks/legal_radar/docs/sample_questions_v4.csv').dropna()

In [430]:
# def execution_verification(dataset: pd.DataFrame, source_documents_column: str, celex_number_column: str) -> pd.DataFrame:
# """
#     This function helps us to compare the celex id from our source and legal radar semantic search.
#     Inserting source document text and his celex number, checks if each document is found in the first
#     10 results.
# Args:
#     dataset: dataframe with required source document and his celex number
#     source_documents_column: dataframe's source document column name
#     celex_number_column: dataframe's celex number column name

# Returns: the name of inserted document and how much results that are having their celex number.
# """
#     results = []

#     for (source_document, celex_number) in zip(dataset[source_documents_column], dataset[celex_number_column]):
#         executed_search_engine = main(source_document)
#         reduced_execution = reduce_array_column(executed_search_engine, 'celex_numbers').reset_index(drop=True)
#         sum_celex_number_found = reduced_execution['celex_numbers'].isin([celex_number])
#         index = reduced_execution[sum_celex_number_found].index.to_list()

#         results.append(
#             {
#                 'position_in_documents': reduced_execution['content'].loc[index].index.to_list(),
#                 'position_in_slices': reduced_execution['text_piece'].loc[index].index.to_list(),
#                 'sum_of_values': sum_celex_number_found.sum(),
#                 "similarity_percentage": reduced_execution['similarity'].apply(lambda x: 1 / (1 + x)).loc[index].to_list()
#             }
#         )
#     result = pd.DataFrame(results).applymap(lambda x: x[0] if isinstance(x, list) else x)

#     result['in_top_5_slices'] = result['position_in_slices'] <= 5
#     result['in_top_10_slices'] = result['position_in_slices'] <= 10
#     result['in_top_5_documents'] = result['position_in_documents'] <= 5
#     result['in_top_10_documents'] = result['position_in_documents'] <= 10
#     result['in_q3'] = result['similarity_percentage'] >= 0.75

#     return pd.concat([dataset, result], axis=1)


# def execute_semantic_search(dataset: pd.DataFrame, input_query: str, celex_number: str) -> list:
#         results = []

#         for (query_text, celex_number) in zip(dataset[input_query], dataset[celex_number]):
#             executed_search_engine = semantic_search(query_text)
#             reduced_execution = reduce_array_column(executed_search_engine, 'celex_numbers').reset_index(drop=True)
#             sum_celex_number_found = reduced_execution['celex_numbers'].isin([celex_number])
#             index = reduced_execution[sum_celex_number_found].index.to_list()

#             results.append({
#                     'position_in_documents': reduced_execution['content'].loc[index].index.to_list(),
#                     'position_in_slices': reduced_execution['text_piece'].loc[index].index.to_list(),
#                     'sum_of_values': sum_celex_number_found.sum(),
#                     "similarity_percentage": reduced_execution['similarity'].apply(lambda x: 1 / (1 + x)).loc[index].to_list()})

#         return results


# def view_semantic_search_results(semantic_search_results) -> pd.DataFrame:
#         result = pd.DataFrame(semantic_search_results).applymap(lambda x: x[0] if isinstance(x, list) else x)
#         result = result.assign(in_top_5_slices=result['position_in_slices'] <= 5,
#                                in_top_10_slices=result['position_in_slices'] <= 10,
#                                in_top_5_documents=result['position_in_documents'] <= 5,
#                                in_top_10_documents=result['position_in_documents'] <= 10,
#                                in_q3=result['similarity_percentage'] >= 0.75)

#         return pd.concat([dataset, result], axis=1)


class SemanticSearchEvaluation:
    """
    Evaluation harness for the Legal Radar semantic search.

    For every row of a reference dataset (query text + expected CELEX number) it runs
    the module-level `semantic_search` and records where the expected CELEX number
    shows up in the ranked results.

    NOTE(review): `execute_semantic_search` treats the value returned by
    `semantic_search` as a single DataFrame exposing `celex_numbers`, `content`,
    `text_piece` and `similarity` columns — confirm the search function in scope
    still returns that shape.
    """

    # Keys of the per-query records; used to build an empty frame when there are no queries.
    RESULT_COLUMNS = ['position_in_documents', 'position_in_slices', 'sum_of_values', 'similarity_percentage']

    def __init__(self, dataset: pd.DataFrame, input_query: str, celex_number: str) -> None:
        """
            dataset: dataframe holding the queries and the CELEX number expected for each query
            input_query: name of the dataset column with the query text
            celex_number: name of the dataset column with the expected CELEX number
        """
        self.dataset = dataset
        self.input_query = input_query
        self.celex_number = celex_number

    @staticmethod
    def _any_in_top(positions, top_n: int) -> bool:
        """Return True when at least one 0-based position lies within the first `top_n` results."""
        # Positions are 0-based, so the first `top_n` results are 0 .. top_n - 1.
        return bool((np.asarray(positions) < top_n).any())

    def execute_semantic_search(self) -> list:
        """
            Run the semantic search for every query of the dataset.

            Returns one dict per dataset row with:
              - position_in_documents / position_in_slices: 0-based positions of the search
                results whose CELEX number equals the expected one
              - sum_of_values: how many results carry the expected CELEX number
              - similarity_percentage: 1 / (1 + similarity) of those results
        """
        results = []

        for (query_text, celex_number) in zip(self.dataset[self.input_query], self.dataset[self.celex_number]):
            executed_search_engine = semantic_search(query_text)
            # One row per (result, celex number) pair, renumbered 0..n-1 in ranking order.
            reduced_execution = reduce_array_column(executed_search_engine, 'celex_numbers').reset_index(drop=True)
            celex_number_found = reduced_execution['celex_numbers'].isin([celex_number])
            index = reduced_execution[celex_number_found].index.to_list()

            # NOTE(review): both position lists are read from the same row index, so they are
            # always identical; a true document rank would need de-duplication by document.
            results.append({
                'position_in_documents': reduced_execution['content'].loc[index].index.to_list(),
                'position_in_slices': reduced_execution['text_piece'].loc[index].index.to_list(),
                'sum_of_values': celex_number_found.sum(),
                "similarity_percentage": reduced_execution['similarity'].loc[index].apply(
                    lambda x: 1 / (1 + x)).to_list()})

        return results

    def view_semantic_search_results(self) -> pd.DataFrame:
        """
            Combine the dataset with the search results and add boolean summary columns:
            whether the expected CELEX number was found within the first 5 / 10 results
            (slices and documents) and whether any similarity percentage reaches 0.75
            (the `in_q3` column — a fixed threshold, not a computed quartile).

            The returned frame keeps the index of the input dataset.
        """
        records = self.execute_semantic_search()
        # An empty record list would produce a frame without columns; keep the schema explicit.
        result = pd.DataFrame(records) if records else pd.DataFrame(columns=self.RESULT_COLUMNS)
        result = result.assign(
            in_top_5_slices=result['position_in_slices'].apply(lambda x: self._any_in_top(x, 5)),
            in_top_10_slices=result['position_in_slices'].apply(lambda x: self._any_in_top(x, 10)),
            in_top_5_documents=result['position_in_documents'].apply(lambda x: self._any_in_top(x, 5)),
            in_top_10_documents=result['position_in_documents'].apply(lambda x: self._any_in_top(x, 10)),
            in_q3=result['similarity_percentage'].apply(lambda x: bool((np.asarray(x) >= 0.75).any())))

        # `result` is numbered 0..n-1 while the dataset may carry any index (e.g. after
        # dropna or row selection). Align the rows by position, then restore the dataset's index.
        combined = pd.concat([self.dataset.reset_index(drop=True), result], axis=1)
        combined.index = self.dataset.index
        return combined


In [431]:
# Evaluate the search on the first three reference questions only.
evaluation_sample = sample_questions.iloc[[0, 1, 2]]
evaluator = SemanticSearchEvaluation(evaluation_sample, 'Questions/Text Extracts', 'Celex No')
execution = evaluator.view_semantic_search_results()

100% (5791 of 5791) |####################| Elapsed Time: 0:00:03 Time:  0:00:03
100% (69866 of 69866) |##################| Elapsed Time: 0:00:26 Time:  0:00:26
100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:25 Time:  0:00:25
100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:25 Time:  0:00:25


In [433]:
# Show the reference questions side by side with their evaluation columns.
execution

Unnamed: 0,No,Questions/Text Extracts,Source Document,Article No,Celex No,Work Reference,ELI,position_in_documents,position_in_slices,sum_of_values,similarity_percentage,in_top_5_slices,in_top_10_slices,in_top_5_documents,in_top_10_documents,in_q3
0,1,‘clearing’ means the process of establishing p...,REGULATION (EU) No 648/2012 OF THE EUROPEAN PA...,2(3),32012R0648,http://publications.europa.eu/resource/cellar/...,http://data.europa.eu/eli/reg/2012/648/oj,[27],[27],1,[0.4546839568469639],False,False,False,False,False
1,2,Incentives to promote the use of CCPs have not...,REGULATION (EU) No 648/2012 OF THE EUROPEAN PA...,Recital 13,32012R0648,http://publications.europa.eu/resource/cellar/...,http://data.europa.eu/eli/reg/2012/648/oj,[0],[0],1,[0.641506835303643],True,True,True,True,False
2,3,Ensuring that the clearing obligation reduces ...,REGULATION (EU) No 648/2012 OF THE EUROPEAN PA...,Recital 15,32012R0648,http://publications.europa.eu/resource/cellar/...,http://data.europa.eu/eli/reg/2012/648/oj,[0],[0],1,[0.659670742152862],True,True,True,True,False


In [520]:
# Query text of the first reference question (label 0 of the filtered frame).
sample_questions['Questions/Text Extracts'][0]

'‘clearing’ means the process of establishing positions, including the calculation of net obligations, and ensuring that financial instruments, cash, or both, are available to secure the exposures arising from those positions;'

In [576]:
# Scratch experiment: `DataFrame.values.tolist()` yields one list per row.
# NOTE(review): `a` and `df` are throwaway names; `a` is reused further down for
# the search result — drop this cell from the final notebook.
a = {'a': [1, 34], 'b': [2, 123], }
df = pd.DataFrame(a)
df.values.tolist()
# df['c'] = df.values.tolist()
# df

[[1, 2], [34, 123]]

In [583]:
# NOTE(review): the recorded output shows a column `c` that no visible cell
# creates — it comes from hidden kernel state and will not reproduce on a fresh run.
df

Unnamed: 0,a,b,c
0,1,2,1
1,34,123,34


In [740]:
# Run the search for the first reference question.
# NOTE(review): `[0]` is a label lookup; after `dropna()` label 0 is only present
# if the first CSV row survived — `.iloc[0]` would select by position instead.
a = semantic_search(sample_questions['Questions/Text Extracts'][0])


In [743]:
# First element of the list returned by semantic_search.
a[0]

Unnamed: 0,text_piece,document_id_source,text_piece_embedding,similarity


In [700]:
# NOTE(review): this was executed (In [700]) before the cell that now assigns `a`
# (In [740]). The semantic_search defined in this notebook returns a list, and
# a list cannot be indexed with 'similarity' — stale hidden state.
a['similarity'].to_list()

[1.0244039297103882,
 1.0740917921066284,
 1.1240408420562744,
 1.1341547966003418,
 1.143677830696106,
 1.150281548500061,
 1.1680783033370972,
 1.1894234418869019]

In [701]:
# NOTE(review): like the cell above, this relies on an earlier `a` that was a
# DataFrame; with the list returned by the semantic_search defined in this notebook it fails.
a['text_piece'].to_list()

["ANNEX I CALCULATING CAPITAL REQUIREMENTS FOR POSITION RISK GENERAL PROVISIONS Netting 1. The excess of an institution's long (short) positions over its short (long) positions in the same equity, debt and convertible issues and identical financial futures, options, warrants and covered warrants shall be its net position in each of those different instruments. In calculating the net position the competent authorities shall allow positions in derivative instruments to be treated, as laid down in points 4 to 7, as positions in the underlying (or notional) security or securities. Institutions' holdings of their own debt instruments shall be disregarded in calculating specific risk under point 14. 2. No netting shall be allowed between a convertible and an offsetting position in the instrument underlying it, unless the competent authorities adopt an approach under which the likelihood of a particular convertible's being converted is taken into account or have a capital requirement to cover