In [1]:
import sys
sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))
import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

import faiss
import pickle
import pandas as pd
import numpy as np
from more_itertools import unique_everseen

from sem_covid.entrypoints.notebooks.legal_radar.services.split_documents_pipeline import DOCUMENT_ID_SOURCE
from sem_covid.services.store_registry import store_registry
from sem_covid.services.model_registry import embedding_registry
from sem_covid import config
from sem_covid.services.sc_wrangling.feature_selector import reduce_array_column

In [2]:
FAISS_BUCKET_NAME = 'faiss-index'
FAISS_INDEX_FINREG_NAME = 'faiss_index_finreg.pkl'
FIN_REG_SPLITTED_ES_INDEX = 'ds_finreg_splitted'
DATES_DOCUMENT = 'dates_document'
HTML_LINKS = 'htmls_to_download'
DEFAULT_SEARCH = """The Semantic Interoperability Community develops solutions to help European public administrations perform seamless and meaningful cross-border and cross-domain data exchanges."""
TEXT_PIECE = 'text_piece'

In [3]:
def load_documents():
    """Read the data from ES."""
    es_store = store_registry.es_index_store()
    df = es_store.get_dataframe(index_name=config.EU_FINREG_CELLAR_ELASTIC_SEARCH_INDEX_NAME)
    df[DATES_DOCUMENT] = pd.to_datetime(df[DATES_DOCUMENT]).dt.date
    return df

def load_splitted_documents():
    """Read the data from ES."""
    es_store = store_registry.es_index_store()
    return es_store.get_dataframe(index_name=FIN_REG_SPLITTED_ES_INDEX)

def load_emb_model():
    return embedding_registry.sent2vec_universal_sent_encoding()

def load_faiss_index():
    """Load and deserialize the Faiss index."""
    minio_store = store_registry.minio_object_store(minio_bucket=FAISS_BUCKET_NAME)
    data = pickle.loads(minio_store.get_object(object_name=FAISS_INDEX_FINREG_NAME))
    return faiss.deserialize_index(data)

def main(user_input: str):
    documents = load_documents()
    splitted_documents = load_splitted_documents()
    model = load_emb_model()
    faiss_index = load_faiss_index()
    num_results = 100
    embeddings = model.encode(sentences=[user_input])
    D, I = faiss_index.search(np.array(embeddings).astype("float32"), k=num_results)
    document_parts = splitted_documents.iloc[I.flatten().tolist()]
    document_parts['similarity'] = D.flatten().tolist()
    documents_id = list(unique_everseen(
    splitted_documents.iloc[I.flatten().tolist()][DOCUMENT_ID_SOURCE].values))

    return documents.loc[documents_id]

In [4]:
# documents, celex number and url reference from sample question file
sample_questions = pd.read_csv('sem_covid/entrypoints/notebooks/legal_radar/docs/sample_questions_v4.csv').dropna()

In [62]:
# pozitia gasite (daca a fost gasit)
# df -> pozitia si simlaritatea
def execution_verification(dataset: pd.DataFrame, source_documents_column: str, celex_number_column: str) -> None:
    """
        This function helps us to compare the celex id from our source and legal radar semantic search.
        Inserting source document text and his celex number, checks if each document is found in the first
        10 results.
    Args:
        dataset: dataframe with required source document and his celex number
        source_documents_column: dataframe's source document column name
        celex_number_column: dataframe's celex number column name

    Returns: the name of inserted document and how much results out of 10 are having their celex number.
    """
    for (source_document, celex_number) in zip(set(dataset[source_documents_column]), set(dataset[celex_number_column])):
        executed_search_engine = main(source_document)
        reduced_execution = reduce_array_column(executed_search_engine, 'celex_numbers')
        sum_celex_number_found = reduced_execution['celex_numbers'].isin([celex_number])
        reduced_execution[sum_celex_number_found].reset_index(drop=True).index.to_list()

        print(f" RESULT: From {len(reduced_execution['celex_numbers'])} results was found:\n "
              f"{sum_celex_number_found.sum()} value/s\n "
              f"Index: {reduced_execution[sum_celex_number_found].reset_index(drop=True).index.to_list()}" )

In [63]:
execution_verification(sample_questions, 'Source Document', 'Celex No')

100% (5791 of 5791) |####################| Elapsed Time: 0:00:03 Time:  0:00:03
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


 RESULT: From 70 results was found:
 0 value/s
 Index: []


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


 RESULT: From 49 results was found:
 0 value/s
 Index: []


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:23 Time:  0:00:23


 RESULT: From 70 results was found:
 0 value/s
 Index: []


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


 RESULT: From 62 results was found:
 0 value/s
 Index: []


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


 RESULT: From 65 results was found:
 0 value/s
 Index: []


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


 RESULT: From 77 results was found:
 0 value/s
 Index: []


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


 RESULT: From 70 results was found:
 1 value/s
 Index: [0]


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


 RESULT: From 77 results was found:
 1 value/s
 Index: [0]


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


 RESULT: From 59 results was found:
 0 value/s
 Index: []


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


 RESULT: From 73 results was found:
 1 value/s
 Index: [0]


100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24


 RESULT: From 59 results was found:
 0 value/s
 Index: []


In [73]:
# for (source_document, celex_number) in zip(sample_questions['Source Document'][12], sample_questions['Celex No'][12]):
executed_search_engine = main('Directive 2002/47/EC of the European Parliament and of the Council of 6 June 2002 on financial collateral arrangements',)
reduced_execution = reduce_array_column(executed_search_engine, 'celex_numbers')
sum_celex_number_found = reduced_execution['celex_numbers'].isin(['32012R0648']).sum()

100% (5791 of 5791) |####################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (69866 of 69866) |##################| Elapsed Time: 0:00:24 Time:  0:00:24
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [54]:
reduced_execution.reset_index(drop=True, inplace=True)

In [55]:
true_values = reduced_execution['celex_numbers'].isin(['32019R1238'])

In [56]:
reduced_execution[true_values].reset_index(drop=True).index.to_list()
true_values.sum()

1

In [57]:
len(reduced_execution['celex_numbers'])

59

In [67]:
print(f" RESULT: From {len(reduced_execution['celex_numbers'])} results was found:\n "
      f"{true_values.sum()} value/s\n "
      f"Index: {reduced_execution[true_values].index.to_list()}" )

  print(f" RESULT: From {len(reduced_execution['celex_numbers'])} results was found:\n "


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [74]:

sum_celex_number_found

0