## Import libraries

In [1]:
import sys

sys.path.append("/home/jovyan/work/sem-covid/")
sys.path = list(set(sys.path))
import os

os.getcwd()
os.chdir('/home/jovyan/work/sem-covid/')

from sem_covid import config
from sem_covid.services.store_registry import store_registry, StoreRegistryABC
import hashlib
from sem_covid.services.model_registry import EmbeddingModelRegistry, EmbeddingModelRegistryABC
import spacy
from more_itertools import windowed
from typing import List
import pandas as pd
import concurrent

2021-08-24 14:44:53.948766: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-24 14:44:53.948791: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
TEXTUAL_COLUMNS = ['title', 'content']
FIN_REG_SPLITTED_ES_INDEX = 'ds_finreg_splitted'

In [3]:
TEXTUAL_DATA = 'text_data'
TEXT_PIECE = 'text_piece'
DOCUMENT_ID_SOURCE = 'document_id_source'
TEXT_PIECE_EMBEDDING = 'text_piece_embedding'

nlp = spacy.load('en_core_web_sm')

In [36]:
class WindowedSplitDocumentsPipeline:

    def __init__(self, dataset_es_index_name: str,
                 result_es_index_name: str,
                 textual_columns: List[str],
                 split_window_size: int,
                 split_window_step: int,
                 store_registry: StoreRegistryABC,
                 embedding_model_registry: EmbeddingModelRegistryABC):
        self.dataset_es_index_name = dataset_es_index_name
        self.result_es_index_name = result_es_index_name
        self.store_registry = store_registry
        self.embedding_model_registry = embedding_model_registry
        self.textual_columns = textual_columns
        self.split_window_size = split_window_size
        self.split_window_step = split_window_step
        self.dataset = None
        self.result_dataset = None

    def load_dataset(self):
        es_store = self.store_registry.es_index_store()
        self.dataset = es_store.get_dataframe(self.dataset_es_index_name)
        self.dataset = self.dataset[self.textual_columns]
        self.dataset.dropna(inplace=True)

    def prepare_textual_data(self):
        for textual_column in self.textual_columns:
            self.dataset = self.dataset[
                self.dataset[textual_column].apply(lambda x: len(x) > 1)]
        self.dataset[TEXTUAL_DATA] = self.dataset[self.textual_columns].agg(lambda texts:
                                                                            ". ".join(texts),
                                                                            axis=1)

    def split_documents(self):
        def split_documents_worker(index,value,window_size,window_step):
            sentences = [sent.text for sent in nlp(value).sents]
            windowed_texts = list(
                windowed(sentences,
                         n=window_size,
                         fillvalue='',
                         step=window_step)
            )
            return [(index, ' '.join(windowed_text))
                    for windowed_text in windowed_texts]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(split_documents_worker,
                                       index,
                                       value[:1000000],
                                       self.split_window_size,
                                       self.split_window_step
                                       )
                       for index, value in self.dataset[TEXTUAL_DATA][:1].items()
                       #[WARNING]!!Delete limit of dataset in production!
                       ]
            self.result_dataset = pd.DataFrame([result
                                                for future in futures for result in future.result()],
                                               columns=[DOCUMENT_ID_SOURCE,TEXT_PIECE])

    def compute_embeddings(self):
        emb_model = self.embedding_model_registry.sent2vec_universal_sent_encoding()
        self.result_dataset[TEXT_PIECE_EMBEDDING] = emb_model.encode(self.result_dataset[TEXT_PIECE].values)

    def store_splitted_documents(self):
        self.result_dataset.reset_index(drop=True, inplace=True)
        es_store = self.store_registry.es_index_store()
        es_store.put_dataframe(index_name=self.result_es_index_name,
                               content=self.result_dataset)

    def execute(self):
        self.load_dataset()
        self.prepare_textual_data()
        self.split_documents()
        self.compute_embeddings()
        self.store_splitted_documents()

In [37]:
windowed_split_documents_pipeline = WindowedSplitDocumentsPipeline(
    dataset_es_index_name=config.EU_FINREG_CELLAR_ELASTIC_SEARCH_INDEX_NAME,
    result_es_index_name=FIN_REG_SPLITTED_ES_INDEX,
    textual_columns=TEXTUAL_COLUMNS,
    split_window_size=10,
    split_window_step=5,
    store_registry=store_registry,
    embedding_model_registry=EmbeddingModelRegistry())

In [45]:
windowed_split_documents_pipeline.execute()

100% (5757 of 5757) |####################| Elapsed Time: 0:00:01 Time:  0:00:01
N/A% (0 of 28) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--

In [124]:
import faiss
import numpy as np

In [None]:
import pickle


class FaissIndexingPipeline:

    def __init__(self, es_index_name: str,
                 embedding_column_name: str,
                 result_bucket_name: str,
                 result_faiss_index_name: str,
                 store_registry: StoreRegistryABC):
        self.es_index_name = es_index_name
        self.store_registry = store_registry
        self.embedding_column_name = embedding_column_name
        self.result_bucket_name = result_bucket_name
        self.result_faiss_index_name = result_faiss_index_name
        self.dataset = None
        self.embeddings = None
        self.faiss_index = None

    def load_dataset(self):
        es_store = self.store_registry.es_index_store()
        self.dataset = es_store.get_dataframe(index_name=self.es_index_name)

    def prepare_embeddings(self):
        self.embeddings = self.dataset[self.embedding_column_name].values
        self.embeddings = np.array([np.array(embedding).astype('float32')
                                    for embedding in self.embeddings]).astype("float32")

    def embeddings_indexing(self):
        self.faiss_index = faiss.IndexFlatL2(self.embeddings.shape[1])
        self.faiss_index = faiss.IndexIDMap(self.faiss_index)
        self.faiss_index.add_with_ids(self.embeddings, self.dataset.index.values)

    def store_faiss_index(self):
        minio_store = store_registry.minio_object_store(self.result_bucket_name)
        minio_store.put_object(object_name=self.result_faiss_index_name,
                               content=pickle.dumps(faiss.serialize_index(self.faiss_index))
                               )

    def execute(self):
        self.load_dataset()
        self.prepare_embeddings()
        self.embeddings_indexing()
        self.store_faiss_index()

In [None]:
faiss_indexing_pipeline = FaissIndexingPipeline(es_index_name=FIN_REG_SPLITTED_ES_INDEX,
                                                embedding_column_name=TEXT_PIECE_EMBEDDING,
                                                result_bucket_name=,
                                                result_faiss_index_name=,
                                                store_registry=)

In [134]:
embeddings = new_df.text_piece_embedding.values

In [135]:
embeddings = np.array([np.array(embedding).astype('float32')
                       for embedding in embeddings]).astype("float32")

In [160]:
# Step 2: Instantiate the index
index = faiss.IndexFlatL2(embeddings.shape[1])

# Step 3: Pass the index to IndexIDMap
index = faiss.IndexIDMap(index)

# Step 4: Add vectors and their IDs
index.add_with_ids(embeddings, np.array(range(0, embeddings.shape[0])))

print(f"Number of vectors in the Faiss index: {index.ntotal}")

Number of vectors in the Faiss index: 1402


In [168]:
# Retrieve the 10 nearest neighbours
D, I = index.search(np.array([embeddings[0]]), k=50)
print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')

L2 distance: [0.0, 0.22245340049266815, 0.5410061478614807, 0.5424696207046509, 0.5697960257530212, 0.620814323425293, 0.6340034604072571, 0.6435337662696838, 0.6524192690849304, 0.6616794466972351, 0.7028501033782959, 0.7037103176116943, 0.7069353461265564, 0.7169807553291321, 0.7531577944755554, 0.8260558843612671, 0.8626027703285217, 0.86916583776474, 0.8854991793632507, 0.9011704325675964, 0.9037261605262756, 0.9388514161109924, 0.9419631958007812, 0.9499005675315857, 0.9517160058021545, 0.9522029161453247, 0.9544087648391724, 0.9546146392822266, 0.9648861289024353, 0.9708071947097778, 0.9726360440254211, 0.974346935749054, 0.978111982345581, 0.9861728549003601, 0.9889810681343079, 0.9891105890274048, 0.9891105890274048, 0.9996849298477173, 1.0014151334762573, 1.0050978660583496, 1.0118540525436401, 1.021659255027771, 1.0343246459960938, 1.035066843032837, 1.0373544692993164, 1.0446761846542358, 1.0454301834106445, 1.0566376447677612, 1.0566376447677612, 1.0648467540740967]

MAG pa

In [172]:
new_df.iloc[I.flatten().tolist()].document_source.values

array(['483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125bd574565bef076acd782f2297f5b0a51f513a30e04a8f5a',
       '483380ec660eea125

In [174]:
D.flatten().tolist()

[0.0,
 0.22245340049266815,
 0.5410061478614807,
 0.5424696207046509,
 0.5697960257530212,
 0.620814323425293,
 0.6340034604072571,
 0.6435337662696838,
 0.6524192690849304,
 0.6616794466972351,
 0.7028501033782959,
 0.7037103176116943,
 0.7069353461265564,
 0.7169807553291321,
 0.7531577944755554,
 0.8260558843612671,
 0.8626027703285217,
 0.86916583776474,
 0.8854991793632507,
 0.9011704325675964,
 0.9037261605262756,
 0.9388514161109924,
 0.9419631958007812,
 0.9499005675315857,
 0.9517160058021545,
 0.9522029161453247,
 0.9544087648391724,
 0.9546146392822266,
 0.9648861289024353,
 0.9708071947097778,
 0.9726360440254211,
 0.974346935749054,
 0.978111982345581,
 0.9861728549003601,
 0.9889810681343079,
 0.9891105890274048,
 0.9891105890274048,
 0.9996849298477173,
 1.0014151334762573,
 1.0050978660583496,
 1.0118540525436401,
 1.021659255027771,
 1.0343246459960938,
 1.035066843032837,
 1.0373544692993164,
 1.0446761846542358,
 1.0454301834106445,
 1.0566376447677612,
 1.0566376447