In [2]:
import os
import sys

# Absolute location of the sem-covid checkout this notebook runs against.
# NOTE(review): hard-coded path; adjust it when running outside this environment.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

sys.path.append(PROJECT_ROOT)
# Remove duplicate entries while keeping the original lookup order.
# Rebuilding the list through set() would shuffle the entries and could change
# which copy of a module gets imported first.
sys.path = list(dict.fromkeys(sys.path))

# Relative paths used further down are resolved against the project root.
os.chdir(PROJECT_ROOT)

import faiss
import pickle
import pandas as pd
import numpy as np

from sem_covid.entrypoints.notebooks.legal_radar.services.split_documents_pipeline import DOCUMENT_ID_SOURCE
from sem_covid.services.store_registry import store_registry
from sem_covid.services.model_registry import embedding_registry
from sem_covid import config
from sem_covid.services.sc_wrangling.feature_selector import reduce_array_column

In [3]:
# MinIO bucket that load_faiss_index() reads the serialized index from.
FAISS_BUCKET_NAME = 'faiss-index'
# Object name of the pickled Faiss index inside that bucket.
FAISS_INDEX_FINREG_NAME = 'faiss_index_finreg.pkl'
# Elasticsearch index read by load_splitted_documents().
FIN_REG_SPLITTED_ES_INDEX = 'ds_finreg_splitted'
# Column that load_documents() converts to plain date values.
DATES_DOCUMENT = 'dates_document'
# NOTE(review): not referenced by the cells visible in this notebook.
HTML_LINKS = 'htmls_to_download'
# NOTE(review): sample query text; not referenced by the cells visible in this notebook.
DEFAULT_SEARCH = """The Semantic Interoperability Community develops solutions to help European public administrations perform seamless and meaningful cross-border and cross-domain data exchanges."""
# Presumably the column holding a document fragment in the split-document frame - TODO confirm.
TEXT_PIECE = 'text_piece'

In [369]:
def load_documents():
    """Fetch the documents dataframe from Elasticsearch.

    Reads the index named by ``config.EU_FINREG_CELLAR_ELASTIC_SEARCH_INDEX_NAME``
    and replaces the ``DATES_DOCUMENT`` column with plain ``date`` values.
    """
    documents = store_registry.es_index_store().get_dataframe(
        index_name=config.EU_FINREG_CELLAR_ELASTIC_SEARCH_INDEX_NAME
    )
    parsed_dates = pd.to_datetime(documents[DATES_DOCUMENT])
    documents[DATES_DOCUMENT] = parsed_dates.dt.date
    return documents

def load_splitted_documents():
    """Fetch the ``FIN_REG_SPLITTED_ES_INDEX`` Elasticsearch index as a dataframe."""
    splitted_documents = store_registry.es_index_store().get_dataframe(
        index_name=FIN_REG_SPLITTED_ES_INDEX
    )
    return splitted_documents

def load_emb_model():
    """Return the sentence-embedding model supplied by the embedding registry."""
    emb_model = embedding_registry.sent2vec_universal_sent_encoding()
    return emb_model

def load_faiss_index():
    """Fetch the pickled Faiss index from MinIO and rebuild the index object."""
    bucket = store_registry.minio_object_store(minio_bucket=FAISS_BUCKET_NAME)
    pickled_index = bucket.get_object(object_name=FAISS_INDEX_FINREG_NAME)
    # NOTE(review): pickle.loads can execute arbitrary code embedded in its input,
    # so this is only safe while the bucket content is trusted.
    serialized_index = pickle.loads(pickled_index)
    return faiss.deserialize_index(serialized_index)


def semantic_search(user_input: str, num_results: int = 100) -> pd.DataFrame:
    """Search the Faiss index for the document parts closest to ``user_input``.

    Args:
        user_input: free-text query; it is embedded with the model returned by
            ``load_emb_model`` before the index is searched.
        num_results: number of document parts requested from the Faiss index
            (``k``). Defaults to 100, the value that used to be hard-coded.

    Returns:
        One row per distinct source document, in the order Faiss returned its
        first matching part. Besides the document columns the frame carries
        ``similarity`` (the raw score returned by ``faiss_index.search``) and
        the text of the matching part.

    Raises:
        KeyError: if a document id found in the split-document frame is missing
            from the index of the documents frame.
    """
    # NOTE(review): every call reloads both dataframes, the model and the index;
    # callers that search in a loop pay that cost once per query.
    documents = load_documents()
    splitted_documents = load_splitted_documents()
    model = load_emb_model()
    faiss_index = load_faiss_index()

    query_embedding = np.array(model.encode(sentences=[user_input])).astype("float32")
    scores, positions = faiss_index.search(query_embedding, k=num_results)
    scores, positions = scores.flatten(), positions.flatten()

    # Faiss pads the labels with -1 when it finds fewer than k neighbours.
    # Passed on to iloc, -1 would silently pick the last row of the frame.
    found = positions >= 0
    scores, positions = scores[found], positions[found]

    # .copy() detaches the selection so adding a column does not trigger
    # pandas' chained-assignment warning.
    document_parts = splitted_documents.iloc[positions.tolist()].copy()
    document_parts['similarity'] = scores.tolist()
    # Keep only the first (best ranked) part of every source document.
    document_parts = document_parts.drop_duplicates(DOCUMENT_ID_SOURCE).reset_index(drop=True)

    documents_id = document_parts[DOCUMENT_ID_SOURCE].values
    result_documents = documents.loc[documents_id].reset_index(drop=True)
    # Both frames now share a 0..n-1 index, so these assignments line up row by row.
    result_documents['similarity'] = document_parts['similarity']
    result_documents[TEXT_PIECE] = document_parts[TEXT_PIECE]
    return result_documents

In [356]:
# Sample questions with their source document, celex number and url reference;
# rows that contain a missing value are dropped.
sample_questions = pd.read_csv(
    'sem_covid/entrypoints/notebooks/legal_radar/docs/sample_questions_v4.csv'
).dropna()

In [380]:
# def execution_verification(dataset: pd.DataFrame, source_documents_column: str, celex_number_column: str) -> pd.DataFrame:
    # """
    #     This function helps us to compare the celex id from our source and legal radar semantic search.
    #     Inserting source document text and his celex number, checks if each document is found in the first
    #     10 results.
    # Args:
    #     dataset: dataframe with required source document and his celex number
    #     source_documents_column: dataframe's source document column name
    #     celex_number_column: dataframe's celex number column name

    # Returns: the name of inserted document and how much results that are having their celex number.
    # """
#     results = []

#     for (source_document, celex_number) in zip(dataset[source_documents_column], dataset[celex_number_column]):
#         executed_search_engine = main(source_document)
#         reduced_execution = reduce_array_column(executed_search_engine, 'celex_numbers').reset_index(drop=True)
#         sum_celex_number_found = reduced_execution['celex_numbers'].isin([celex_number])
#         index = reduced_execution[sum_celex_number_found].index.to_list()

#         results.append(
#             {
#                 'position_in_documents': reduced_execution['content'].loc[index].index.to_list(),
#                 'position_in_slices': reduced_execution['text_piece'].loc[index].index.to_list(),
#                 'sum_of_values': sum_celex_number_found.sum(),
#                 "similarity_percentage": reduced_execution['similarity'].apply(lambda x: 1 / (1 + x)).loc[index].to_list()
#             }
#         )
#     result = pd.DataFrame(results).applymap(lambda x: x[0] if isinstance(x, list) else x)
    
#     result['in_top_5_slices'] = result['position_in_slices'] <= 5
#     result['in_top_10_slices'] = result['position_in_slices'] <= 10
#     result['in_top_5_documents'] = result['position_in_documents'] <= 5
#     result['in_top_10_documents'] = result['position_in_documents'] <= 10
#     result['in_q3'] = result['similarity_percentage'] >= 0.75

#     return pd.concat([dataset, result], axis=1)


# def execute_semantic_search(dataset: pd.DataFrame, input_query: str, celex_number: str) -> list:
#         results = []
        
#         for (query_text, celex_number) in zip(dataset[input_query], dataset[celex_number]):
#             executed_search_engine = semantic_search(query_text)
#             reduced_execution = reduce_array_column(executed_search_engine, 'celex_numbers').reset_index(drop=True)
#             sum_celex_number_found = reduced_execution['celex_numbers'].isin([celex_number])
#             index = reduced_execution[sum_celex_number_found].index.to_list()

#             results.append({
#                     'position_in_documents': reduced_execution['content'].loc[index].index.to_list(),
#                     'position_in_slices': reduced_execution['text_piece'].loc[index].index.to_list(),
#                     'sum_of_values': sum_celex_number_found.sum(),
#                     "similarity_percentage": reduced_execution['similarity'].apply(lambda x: 1 / (1 + x)).loc[index].to_list()})
            
#         return results
    

# def view_semantic_search_results(semantic_search_results) -> pd.DataFrame:
#         result = pd.DataFrame(semantic_search_results).applymap(lambda x: x[0] if isinstance(x, list) else x)
#         result = result.assign(in_top_5_slices=result['position_in_slices'] <= 5,
#                                in_top_10_slices=result['position_in_slices'] <= 10,
#                                in_top_5_documents=result['position_in_documents'] <= 5,
#                                in_top_10_documents=result['position_in_documents'] <= 10,
#                                in_q3=result['similarity_percentage'] >= 0.75)
        
#         return pd.concat([dataset, result], axis=1)


class SemanticSearchEvaluation:
    """Test evaluation for Legal Radar - Semantic Search.

    Runs ``semantic_search`` for every query of a dataset and reports where the
    expected celex number appears in the returned results.
    """

    # Minimum similarity percentage for a hit to be flagged in the ``in_q3`` column.
    Q3_SIMILARITY_THRESHOLD = 0.75

    def __init__(self, dataset: pd.DataFrame, input_query: str, celex_number: str) -> None:
        """
        Args:
            dataset: dataframe holding the query texts and their expected celex numbers
            input_query: name of the dataset column with the query texts
            celex_number: name of the dataset column with the expected celex numbers
        """
        self.dataset = dataset
        self.input_query = input_query
        self.celex_number = celex_number

    @staticmethod
    def _first_or_nan(value):
        """Collapse a list to its first element (NaN when empty); pass other values through."""
        if isinstance(value, list):
            return value[0] if value else np.nan
        return value

    def execute_semantic_search(self) -> list:
        """
        Run the semantic search once per dataset row and collect the matches.

        Returns:
            One dict per dataset row with the keys ``position_in_documents`` and
            ``position_in_slices`` (0-based row positions of the results carrying
            the expected celex number), ``sum_of_values`` (number of such
            results) and ``similarity_percentage`` (``1 / (1 + similarity)`` of
            each of them). The list-valued entries are empty when the celex
            number is not among the results.
        """
        results = []
        queries = self.dataset[self.input_query]
        expected_numbers = self.dataset[self.celex_number]

        for query_text, celex_number in zip(queries, expected_numbers):
            found = reduce_array_column(semantic_search(query_text), 'celex_numbers').reset_index(drop=True)
            is_match = found['celex_numbers'].isin([celex_number])
            positions = found[is_match].index.to_list()

            # NOTE(review): both position entries come from the same row index, so
            # they are always identical; telling document rank apart from slice
            # rank would need a separate computation.
            results.append({
                'position_in_documents': list(positions),
                'position_in_slices': list(positions),
                'sum_of_values': is_match.sum(),
                'similarity_percentage': (1 / (1 + found.loc[positions, 'similarity'])).to_list(),
            })

        return results

    def view_semantic_search_results(self) -> pd.DataFrame:
        """
        Build the evaluation table for the dataset.

        Every list produced by ``execute_semantic_search`` is reduced to its
        first element (NaN when the expected celex number was not found). Boolean
        columns then say whether that first match is among the first 5 / 10
        results and whether its similarity percentage reaches
        ``Q3_SIMILARITY_THRESHOLD``. Queries without a match get ``False`` in
        every flag column.

        Returns:
            The original dataset with the evaluation columns appended, one row
            per dataset row.
        """
        records = self.execute_semantic_search()
        flattened = [
            {key: self._first_or_nan(value) for key, value in record.items()}
            for record in records
        ]
        result = pd.DataFrame(flattened)
        # Records are produced in dataset order; re-using the dataset index keeps
        # concat from pairing rows by label and padding the mismatches with NaN.
        result.index = self.dataset.index

        # Positions are 0-based, hence "top 5" means positions 0..4.
        result = result.assign(
            in_top_5_slices=result['position_in_slices'] < 5,
            in_top_10_slices=result['position_in_slices'] < 10,
            in_top_5_documents=result['position_in_documents'] < 5,
            in_top_10_documents=result['position_in_documents'] < 10,
            in_q3=result['similarity_percentage'] >= self.Q3_SIMILARITY_THRESHOLD,
        )

        return pd.concat([self.dataset, result], axis=1)


In [375]:
# Evaluate the first three sample questions. view_semantic_search_results is a
# method, so it has to be called on the evaluation object.
evaluation = SemanticSearchEvaluation(sample_questions.iloc[[0, 1, 2]], 'Questions/Text Extracts', 'Celex No')
execution = evaluation.view_semantic_search_results()

100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


In [376]:
execution

Unnamed: 0,No,Questions/Text Extracts,Source Document,Article No,Celex No,Work Reference,ELI,position_in_documents,position_in_slices,sum_of_values,similarity_percentage,in_top_5_slices,in_top_10_slices,in_top_5_documents,in_top_10_documents,in_q3
0,1,‘clearing’ means the process of establishing p...,REGULATION (EU) No 648/2012 OF THE EUROPEAN PA...,2(3),32012R0648,http://publications.europa.eu/resource/cellar/...,http://data.europa.eu/eli/reg/2012/648/oj,27,27,1,0.454684,False,False,False,False,False
1,2,Incentives to promote the use of CCPs have not...,REGULATION (EU) No 648/2012 OF THE EUROPEAN PA...,Recital 13,32012R0648,http://publications.europa.eu/resource/cellar/...,http://data.europa.eu/eli/reg/2012/648/oj,0,0,1,0.641507,True,True,True,True,False
2,3,Ensuring that the clearing obligation reduces ...,REGULATION (EU) No 648/2012 OF THE EUROPEAN PA...,Recital 15,32012R0648,http://publications.europa.eu/resource/cellar/...,http://data.europa.eu/eli/reg/2012/648/oj,0,0,1,0.659671,True,True,True,True,False


In [341]:
# execution_verification only survives as commented-out code in this notebook,
# so the same evaluation is run through SemanticSearchEvaluation instead.
result = SemanticSearchEvaluation(
    sample_questions.iloc[[0, 1, 2]], 'Questions/Text Extracts', 'Celex No'
).view_semantic_search_results()

100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


How the evaluation function works:
    * It takes a dataset with a column of source-document texts and a column of their celex numbers,
    and passes each text as a query to the search function.

    * Using reduce_array_column(), it unpacks the list-valued celex number column of the search results
    and then checks, for every returned row, whether its celex number equals the celex number
    of the input document.

    * For the rows whose celex number matches, it records how many there are, their positions in the
    result list, and their similarity percentage, computed as 1 / (1 + similarity).

The dataframe below is the result of running the test function on the sample_questions_v4 CSV file,
i.e. on its source documents and their celex numbers.

For each input document, the search found results carrying the same celex number; the in_q3 column
shows whether their similarity percentage reaches the Q3 threshold (0.75).


In [342]:
result

Unnamed: 0,No,Questions/Text Extracts,Source Document,Article No,Celex No,Work Reference,ELI,position_in_documents,position_in_slices,sum_of_values,similarity_percentage,in_top_5_slices,in_top_10_slices,in_top_5_documents,in_top_10_documents,in_q3
0,1,‘clearing’ means the process of establishing p...,REGULATION (EU) No 648/2012 OF THE EUROPEAN PA...,2(3),32012R0648,http://publications.europa.eu/resource/cellar/...,http://data.europa.eu/eli/reg/2012/648/oj,27,27,1,0.454684,False,False,False,False,False
1,2,Incentives to promote the use of CCPs have not...,REGULATION (EU) No 648/2012 OF THE EUROPEAN PA...,Recital 13,32012R0648,http://publications.europa.eu/resource/cellar/...,http://data.europa.eu/eli/reg/2012/648/oj,0,0,1,0.641507,True,True,True,True,False
2,3,Ensuring that the clearing obligation reduces ...,REGULATION (EU) No 648/2012 OF THE EUROPEAN PA...,Recital 15,32012R0648,http://publications.europa.eu/resource/cellar/...,http://data.europa.eu/eli/reg/2012/648/oj,0,0,1,0.659671,True,True,True,True,False
