In [3]:
import sys
import os

# Root of the sem-covid checkout inside the notebook container. It is put on
# the import path and made the working directory so that the relative file
# paths used elsewhere in this notebook resolve.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

sys.path.append(PROJECT_ROOT)
# De-duplicate while PRESERVING order: going through set() would shuffle the
# module search order (string hashing is randomized per process), which can
# change which module wins when two entries provide the same name.
sys.path = list(dict.fromkeys(sys.path))

os.chdir(PROJECT_ROOT)

import faiss
import pickle
import pandas as pd
import numpy as np
from more_itertools import unique_everseen

from sem_covid.entrypoints.notebooks.legal_radar.services.split_documents_pipeline import DOCUMENT_ID_SOURCE
from sem_covid.services.store_registry import store_registry
from sem_covid.services.model_registry import embedding_registry
from sem_covid import config
from sem_covid.services.sc_wrangling.feature_selector import reduce_array_column

In [4]:
# Object-store location of the serialized FAISS index: the bucket name and the
# object name inside that bucket.
FAISS_BUCKET_NAME = "faiss-index"
FAISS_INDEX_FINREG_NAME = "faiss_index_finreg.pkl"

# Elasticsearch index that holds the splitted (per text piece) documents.
FIN_REG_SPLITTED_ES_INDEX = "ds_finreg_splitted"

# Column of the documents dataframe that holds the document date.
DATES_DOCUMENT = "dates_document"
# NOTE(review): the two names below are not referenced in the visible cells;
# presumably they are dataframe column names - confirm before relying on them.
HTML_LINKS = "htmls_to_download"
TEXT_PIECE = "text_piece"

# NOTE(review): not referenced in the visible cells; presumably a default
# query text for the semantic search - confirm.
DEFAULT_SEARCH = (
    "The Semantic Interoperability Community develops solutions to help "
    "European public administrations perform seamless and meaningful "
    "cross-border and cross-domain data exchanges."
)

In [7]:
def load_documents():
    """Fetch the FinReg documents from Elasticsearch as a dataframe.

    The index read is ``config.EU_FINREG_CELLAR_ELASTIC_SEARCH_INDEX_NAME``.
    The ``DATES_DOCUMENT`` column is parsed with ``pd.to_datetime`` and then
    reduced to plain ``datetime.date`` values (time of day is dropped).

    Returns:
        The dataframe returned by the ES store, with the date column converted.
    """
    store = store_registry.es_index_store()
    finreg_documents = store.get_dataframe(
        index_name=config.EU_FINREG_CELLAR_ELASTIC_SEARCH_INDEX_NAME
    )
    parsed_dates = pd.to_datetime(finreg_documents[DATES_DOCUMENT])
    finreg_documents[DATES_DOCUMENT] = parsed_dates.dt.date
    return finreg_documents

def load_splitted_documents():
    """Fetch the splitted documents from Elasticsearch as a dataframe.

    Reads the ``FIN_REG_SPLITTED_ES_INDEX`` index and returns it unmodified.
    """
    store = store_registry.es_index_store()
    splitted = store.get_dataframe(index_name=FIN_REG_SPLITTED_ES_INDEX)
    return splitted

def load_emb_model():
    """Return the model the embedding registry exposes as
    ``sent2vec_universal_sent_encoding``."""
    embedding_model = embedding_registry.sent2vec_universal_sent_encoding()
    return embedding_model

def load_faiss_index():
    """Download the pickled FAISS index from the object store and rebuild it.

    The object ``FAISS_INDEX_FINREG_NAME`` is read from the bucket
    ``FAISS_BUCKET_NAME``, unpickled, and the result is handed to
    ``faiss.deserialize_index``.

    Returns:
        The index object produced by ``faiss.deserialize_index``.
    """
    bucket_store = store_registry.minio_object_store(minio_bucket=FAISS_BUCKET_NAME)
    pickled_payload = bucket_store.get_object(object_name=FAISS_INDEX_FINREG_NAME)
    # NOTE(review): pickle.loads can execute arbitrary code embedded in the
    # payload; this is only safe while the bucket content is fully trusted.
    serialized_index = pickle.loads(pickled_payload)
    return faiss.deserialize_index(serialized_index)

def main(user_input: str, num_results: int = 100):
    """Run a semantic search for ``user_input`` and return the matching documents.

    The query text is embedded, the FAISS index is asked for the
    ``num_results`` nearest text pieces, and the pieces are mapped back to
    their source documents through the ``DOCUMENT_ID_SOURCE`` column.

    Args:
        user_input: free text to search for.
        num_results: number of nearest text pieces requested from FAISS
            (defaults to 100, the value that used to be hard-coded).

    Returns:
        Rows of the documents dataframe selected by label with the unique
        source-document ids, in order of first appearance in the FAISS
        result (duplicates removed with ``unique_everseen``).
        Assumes the documents dataframe is indexed by those ids - TODO confirm.

    NOTE(review): documents, splitted documents, the model and the index are
    re-loaded on every call, which dominates the run time when this function
    is called in a loop.
    """
    documents = load_documents()
    splitted_documents = load_splitted_documents()
    model = load_emb_model()
    faiss_index = load_faiss_index()

    query_vector = np.array(model.encode(sentences=[user_input])).astype("float32")
    _distances, neighbours = faiss_index.search(query_vector, k=num_results)

    # FAISS pads the result with -1 when it finds fewer than k neighbours;
    # passed to .iloc that would silently select the LAST row, so drop them.
    hit_positions = [int(position) for position in neighbours.flatten() if position >= 0]

    # Select the hits once and only read from the selection: writing a column
    # onto an .iloc slice is what raised pandas' SettingWithCopyWarning, and
    # that column was never used afterwards.
    document_parts = splitted_documents.iloc[hit_positions]
    documents_id = list(unique_everseen(document_parts[DOCUMENT_ID_SOURCE].values))

    return documents.loc[documents_id]

In [8]:
# Load the sample questions file (path is relative to the working directory).
# Rows containing any missing value are discarded by dropna().
sample_questions = (
    pd.read_csv('sem_covid/entrypoints/notebooks/legal_radar/docs/sample_questions_v4.csv')
    .dropna()
)

In [None]:
def execution_verification(dataset: pd.DataFrame, source_documents_column: str, celex_number_column: str,
                           top_n: int = 10) -> None:
    """
    Check whether the semantic search finds each sample document by its celex number.

    For every row of ``dataset`` the source document text is used as the search
    query, the search result is passed through ``reduce_array_column`` on the
    'celex_numbers' column, and the first ``top_n`` values of that column are
    compared with the expected celex number. One line is printed per row.

    Args:
        dataset: dataframe with the source document text and its celex number
        source_documents_column: dataframe's source document column name
        celex_number_column: dataframe's celex number column name
        top_n: how many leading entries of the reduced result are inspected
            (defaults to 10)

    Returns:
        None. The outcome is printed, not returned.

    NOTE(review): the slice is taken on the reduced frame; presumably
    ``reduce_array_column`` yields one row per celex number, in which case
    ``top_n`` counts celex numbers and not documents - confirm.
    """
    for source_document, celex_number in zip(dataset[source_documents_column], dataset[celex_number_column]):
        search_results = main(source_document)
        reduced_results = reduce_array_column(search_results, 'celex_numbers')
        top_celex_numbers = reduced_results['celex_numbers'][:top_n]
        sum_celex_number_found = top_celex_numbers.isin([celex_number]).sum()

        print(f' --> RESULT: Out of top {top_n} found documents, {sum_celex_number_found} has the same celex number')

In [None]:
# Run the check for every sample question: the 'Source Document' text is the
# query and 'Celex No' is the celex number expected among the top results.
# The recorded output shows roughly 35 s of data loading per question.
execution_verification(sample_questions, 'Source Document', 'Celex No')

100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:30 Time:  0:00:30
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


 --> RESULT: Out of 10 docuemnts, 0 has the same celex number


100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:30 Time:  0:00:30


 --> RESULT: Out of 10 docuemnts, 0 has the same celex number


100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:30 Time:  0:00:30


 --> RESULT: Out of 10 docuemnts, 0 has the same celex number


100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:30 Time:  0:00:30


 --> RESULT: Out of 10 docuemnts, 0 has the same celex number


100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:31 Time:  0:00:31


 --> RESULT: Out of 10 docuemnts, 0 has the same celex number


100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:31 Time:  0:00:31


 --> RESULT: Out of 10 docuemnts, 0 has the same celex number


100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:30 Time:  0:00:30


 --> RESULT: Out of 10 docuemnts, 0 has the same celex number


100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (69866 of 69866) |##################| Elapsed Time: 0:00:30 Time:  0:00:30


 --> RESULT: Out of 10 docuemnts, 0 has the same celex number


100% (5791 of 5791) |####################| Elapsed Time: 0:00:05 Time:  0:00:05
  9% (6765 of 69866) |#                  | Elapsed Time: 0:00:03 ETA:   0:00:29