In [1]:
import concurrent.futures
import glob
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
import multiprocessing
from tqdm.contrib.concurrent import process_map
import pandas
import haystack


In [2]:
files = glob.glob("../data/df_processed/*.parquet")
files

['../data/df_processed/part-0004.snappy.parquet',
 '../data/df_processed/part-0011.snappy.parquet',
 '../data/df_processed/part-0015.snappy.parquet',
 '../data/df_processed/part-0021.snappy.parquet',
 '../data/df_processed/part-0024.snappy.parquet',
 '../data/df_processed/part-0001.snappy.parquet',
 '../data/df_processed/part-0012.snappy.parquet',
 '../data/df_processed/part-0010.snappy.parquet',
 '../data/df_processed/part-0018.snappy.parquet',
 '../data/df_processed/part-0008.snappy.parquet',
 '../data/df_processed/part-0028.snappy.parquet',
 '../data/df_processed/part-0009.snappy.parquet',
 '../data/df_processed/part-0006.snappy.parquet',
 '../data/df_processed/part-0022.snappy.parquet',
 '../data/df_processed/part-0016.snappy.parquet',
 '../data/df_processed/part-0003.snappy.parquet',
 '../data/df_processed/part-0013.snappy.parquet',
 '../data/df_processed/part-0027.snappy.parquet',
 '../data/df_processed/part-0014.snappy.parquet',
 '../data/df_processed/part-0025.snappy.parquet',


In [3]:
def convert_to_document_dict(row):
    d = row.to_dict()
    #d['content']=d['Title'] + "\n" + d['Body']
    
    d['content']=d['Title']
    d['id']= d['Id']
    d['Tags'] = d['Tags']
    del d['Id']
    del d['Body']
    del d['AnswerBody']
    d_doc = haystack.schema.Document.from_dict(d)
    
    return d_doc


def get_document_store():
    
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat",return_embedding=True)
    return document_store


def transform_file_batch(path: str) -> int:
    
    df = pandas.read_parquet(path)
    
    docs = df.apply(convert_to_document_dict, axis=1)
    
        
    return docs

In [4]:
document_store_faiss = get_document_store()

In [5]:
#FAISS Retriever initialization
retriever_faiss = EmbeddingRetriever(document_store_faiss
    ,embedding_model='distilroberta-base-msmarco-v2'
    , model_format='sentence_transformers')



In [6]:
# Delete existing documents in documents store
document_store_faiss.delete_documents()

In [7]:
cores = multiprocessing.cpu_count()-2

In [None]:
# Write documents to document store

# max_workers=10
with concurrent.futures.ProcessPoolExecutor(max_workers = cores) as executor:
    for file_path, docs in zip(files, executor.map(transform_file_batch, files)):
        
        document_store_faiss.write_documents(docs)
        
        print(f"path:{file_path}; records:{len(docs)}")



Writing Documents:   0%|          | 0/362133 [00:00<?, ?it/s]

ERROR:haystack.document_stores.sql:Document 36308472 - Discarded metadata 'CreationDate', since it has invalid type: Timestamp.
SQLDocumentStore can accept and cast to string only the following types: str, int, float, bool, bytes, bytearray, NoneType
ERROR:haystack.document_stores.sql:Document 33723298 - Discarded metadata 'CreationDate', since it has invalid type: Timestamp.
SQLDocumentStore can accept and cast to string only the following types: str, int, float, bool, bytes, bytearray, NoneType
ERROR:haystack.document_stores.sql:Document 35772072 - Discarded metadata 'CreationDate', since it has invalid type: Timestamp.
SQLDocumentStore can accept and cast to string only the following types: str, int, float, bool, bytes, bytearray, NoneType
ERROR:haystack.document_stores.sql:Document 30680938 - Discarded metadata 'CreationDate', since it has invalid type: Timestamp.
SQLDocumentStore can accept and cast to string only the following types: str, int, float, bool, bytes, bytearray, NoneT

In [None]:
!ls

In [None]:
# Add documents embeddings to index
document_store_faiss.update_embeddings(retriever=retriever)