In [160]:
import sys
sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))
import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import faiss
import pickle
import pandas as pd
import numpy as np

from sem_covid.entrypoints.notebooks.legal_radar.services.split_documents_pipeline import DOCUMENT_ID_SOURCE
from sem_covid.services.store_registry import store_registry
from sem_covid.services.model_registry import embedding_registry
from sem_covid import config
from sem_covid.services.sc_wrangling.feature_selector import reduce_array_column

In [132]:
FAISS_BUCKET_NAME = 'faiss-index'
FAISS_INDEX_FINREG_NAME = 'faiss_index_finreg.pkl'
FIN_REG_SPLITTED_ES_INDEX = 'ds_finreg_splitted'
DATES_DOCUMENT = 'dates_document'
HTML_LINKS = 'htmls_to_download'
DEFAULT_SEARCH = """The Semantic Interoperability Community develops solutions to help European public administrations perform seamless and meaningful cross-border and cross-domain data exchanges."""
TEXT_PIECE = 'text_piece'

In [219]:
def load_documents():
    """Read the data from ES."""
    es_store = store_registry.es_index_store()
    df = es_store.get_dataframe(index_name=config.EU_FINREG_CELLAR_ELASTIC_SEARCH_INDEX_NAME)
    df[DATES_DOCUMENT] = pd.to_datetime(df[DATES_DOCUMENT]).dt.date
    return df

def load_splitted_documents():
    """Read the data from ES."""
    es_store = store_registry.es_index_store()
    return es_store.get_dataframe(index_name=FIN_REG_SPLITTED_ES_INDEX)

def load_emb_model():
    return embedding_registry.sent2vec_universal_sent_encoding()

def load_faiss_index():
    """Load and deserialize the Faiss index."""
    minio_store = store_registry.minio_object_store(minio_bucket=FAISS_BUCKET_NAME)
    data = pickle.loads(minio_store.get_object(object_name=FAISS_INDEX_FINREG_NAME))
    return faiss.deserialize_index(data)


def main(user_input: str):
    documents = load_documents()
    splitted_documents = load_splitted_documents()
    model = load_emb_model()
    faiss_index = load_faiss_index()
    num_results = 100
    embeddings = model.encode(sentences=[user_input])
    D, I = faiss_index.search(np.array(embeddings).astype("float32"), k=num_results)
    document_parts = pd.DataFrame(splitted_documents.iloc[I.flatten().tolist()])
    document_parts['similarity'] = pd.Series(D.flatten().tolist()).values
    document_parts = document_parts.drop_duplicates(DOCUMENT_ID_SOURCE).reset_index(drop=True)
    documents_id = document_parts[DOCUMENT_ID_SOURCE].values
    result_documents = pd.DataFrame(documents.loc[documents_id]).reset_index(drop=True)
    result_documents['similarity'] = document_parts['similarity']
    return result_documents

In [4]:
# documents, celex number and url reference from sample question file
sample_questions = pd.read_csv('sem_covid/entrypoints/notebooks/legal_radar/docs/sample_questions_v4.csv').dropna()

In [247]:
def execution_verification(dataset: pd.DataFrame, source_documents_column: str, celex_number_column: str):
    """
        This function helps us to compare the celex id from our source and legal radar semantic search.
        Inserting source document text and his celex number, checks if each document is found in the first
        10 results.
    Args:
        dataset: dataframe with required source document and his celex number
        source_documents_column: dataframe's source document column name
        celex_number_column: dataframe's celex number column name

    Returns: the name of inserted document and how much results that are having their celex number.
    """
    results = []

    for (source_document, celex_number) in zip(set(dataset[source_documents_column]), set(dataset[celex_number_column])):
        executed_search_engine = main(source_document)
        reduced_execution = reduce_array_column(executed_search_engine, 'celex_numbers').reset_index(drop=True)
        sum_celex_number_found = reduced_execution['celex_numbers'].isin([celex_number])
        index = reduced_execution[sum_celex_number_found].index.to_list()
        results.append(
            {
                'index': index,
                'values': sum_celex_number_found.sum(),
                "similarity_percentage": reduced_execution['similarity'].loc[index].to_list()
            }
        )

    return pd.DataFrame(results)

In [248]:
result = execution_verification(sample_questions, 'Source Document', 'Celex No')

100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:25 Time:  0:00:25
100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:25 Time:  0:00:25
100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:25 Time:  0:00:25
100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (5791 of 5791) |###################

In [249]:
result

Unnamed: 0,index,values,similarity_percentage
0,[],0,[]
1,[],0,[]
2,[],0,[]
3,[],0,[]
4,[],0,[]
5,[],0,[]
6,[15],1,[1.0138442516326904]
7,[2],1,[0.9245582818984985]
8,[],0,[]
9,[37],1,[1.1886646747589111]
