# Build FAISS indexes using Haystack
Ref: https://github.com/deepset-ai/haystack-tutorials/blob/main/tutorials/06_Better_Retrieval_via_Embedding_Retrieval.ipynb

Ref: https://docs.haystack.deepset.ai/docs/retriever#documentstore-compatibility

In [17]:
import os

In [18]:
# Directory of per-notebook JSON files read by the text-preprocessing cell.
NOTEBOOK_DIR = '../data/evaluation/notebooks/notebooks_contents'
# Combined JSON file written by the text-preprocessing cell and read back by the JSON converter.
DOCS_FILE = '../preprocessed_data/docs.json'
# Root directory for saved FAISS indexes; each embedding model gets its own sub-directory.
FAISS_INDEX_DIR = './faiss_indexes_512'
# Root directory for the SQLite databases backing the FAISS document stores (one sub-directory per model).
# NOTE(review): the `_512` suffix presumably mirrors the PreProcessor split_length of 512 - confirm and keep in sync.
FAISS_DB_DIR = './faiss_db_512'

In [19]:
# (short name, model identifier) pairs. The short name is used as the sub-directory name for the
# per-model database and index; the model identifier is what gets passed to the retriever.
EMBEDDING_MODELS = [("model1", "sentence-transformers/multi-qa-mpnet-base-dot-v1"),
                    ("model2", "sentence-transformers/all-mpnet-base-v2")]


In [20]:
# Pick the embedding model this run builds an index for (change the position to switch models).
embedding_model = EMBEDDING_MODELS[1]
# Display the selected (short name, model identifier) pair.
embedding_model

('model2', 'sentence-transformers/all-mpnet-base-v2')

## Text preprocessing (First run only)
- Transform the JSON files into the input format
- Convert the JSON to `document` objects
- Split the documents into passages
- Index the passages into `document_store`

In [5]:
import json

# Directory holding the input JSON files (one per computational notebook).
input_dir = NOTEBOOK_DIR

# Single JSON file that receives the extracted records.
output_file = DOCS_FILE

# os.listdir() returns entries in arbitrary order. Sorting keeps the record order in the
# output file (and the passage numbering derived from it later) reproducible across runs.
json_filenames = sorted(name for name in os.listdir(input_dir) if name.endswith(".json"))

# Keep only the fields needed downstream: the notebook id and its cleaned markdown text.
data = []
for filename in json_filenames:
    file_path = os.path.join(input_dir, filename)
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    data.append({
        "docid": json_data["docid"],
        "content": json_data["md_text_clean"],
    })

# Report the number of records actually loaded instead of listing the directory a second time.
file_count = len(data)
print(f"[{file_count}] computational notebooks")

# Write the extracted data to the output JSON file
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f)


[3779] computational notebooks


## Create document store

In [6]:
from haystack.nodes import JsonConverter

# Convert the combined JSON file into documents for the preprocessing step.
converter = JsonConverter()
docs = converter.convert(DOCS_FILE)
# Sanity check: the count is expected to equal the number of notebooks exported to DOCS_FILE.
len(docs)



3779

In [7]:
from haystack.nodes import PreProcessor

# Configure passage splitting: word-based, split_length of 512, no overlap between passages,
# sentence boundaries respected; empty lines, whitespace and headers/footers are cleaned.
# NOTE(review): a recorded run logged warnings that several documents were still far longer than
# 10000 characters after preprocessing - inspect those documents before relying on the split.
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=512,
    split_respect_sentence_boundary=True,
    split_overlap=0
)

# Split all converted documents into passages and show how many passages were produced.
passages = processor.process(docs)
len(passages)

Preprocessing:   0%|          | 0/3779 [00:00<?, ?docs/s]

Document f955c4408695b9f350043e4a7fca3a83 is 24064 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 4404a3e98dfe4df3cbee942240d9a06e is 57280 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 6ea3852f87501e1f0e1fa201dffe34df is 26578 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 14383041fec20b28f2ca5ff56c1d524e is 234679 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 6a30a08dccc73af3339ab0606a76db4 is 146705 char

4970

In [21]:
%%capture
# Write documents
from haystack.document_stores import FAISSDocumentStore

# The SQLite database backing the store lives in a per-model directory named after the
# model's short name.
os.makedirs(f"{FAISS_DB_DIR}/{embedding_model[0]}", exist_ok=True)
document_store = FAISSDocumentStore(sql_url=f"sqlite:///{FAISS_DB_DIR}/{embedding_model[0]}/faiss_base.db", faiss_index_factory_str="Flat")

# Collect every passage record first and write them in one call: a separate
# write_documents() call per passage costs one database round trip per passage.
index_documents = []
for i, passage in enumerate(passages):
    docid = passage.meta['docid']
    # `i` is the position in the full passage list (across all notebooks), which keeps the
    # id unique even when one notebook yields several passages.
    passage_docid = f"{docid}_passage{i}"
    index_document = {
        "id": passage_docid,
        "content": passage.content,
        "meta": {
            "name": docid,
            "passage_number": i,
        },
    }
    index_documents.append(index_document)

# Skip the call entirely when there is nothing to write, as the per-passage loop did.
if index_documents:
    document_store.write_documents(index_documents)

In [22]:
# Debug helper (disabled): uncomment to list every attribute of the document store.
# for i in dir(document_store): 
#     print(i)

# Show the embedding dimension the store is configured with.
document_store.embedding_dim

768

In [23]:
# (documents, embeddings): at this point the documents are written but no embeddings are expected yet.
document_store.get_document_count(), document_store.get_embedding_count()

(4970, 0)

## Write embeddings

In [24]:
# Update embeddings
from haystack.nodes import EmbeddingRetriever

def update_index(document_store, embedding_model):
    """Embed all stored documents with the given model and save the FAISS index to disk.

    Args:
        document_store: Store whose documents get embedded; it must provide
            ``update_embeddings`` and ``save``.
        embedding_model: ``(short_name, model_identifier)`` pair. The identifier is handed
            to ``EmbeddingRetriever``; the short name selects the output sub-directory.

    Side effects:
        Creates ``{FAISS_INDEX_DIR}/{short_name}`` if it is missing, writes ``index.faiss``
        and ``config.json`` into it, and prints the index path.
    """
    model_identifier = embedding_model[1]
    encoder = EmbeddingRetriever(
        embedding_model=model_identifier,
        document_store=document_store,
    )

    # One-off cost: update_embeddings() goes over every previously indexed document and
    # refreshes its embedding, which can be slow for a large corpus. Query time is cheap
    # afterwards because only the query itself has to be embedded.
    document_store.update_embeddings(encoder)

    # Persist the index and its configuration next to each other in the per-model directory.
    target_dir = f"{FAISS_INDEX_DIR}/{embedding_model[0]}"
    faiss_file = f"{target_dir}/index.faiss"
    settings_file = f"{target_dir}/config.json"
    os.makedirs(target_dir, exist_ok=True)

    document_store.save(config_path=settings_file, index_path=faiss_file)
    print(f"Save index to {faiss_file}")


In [25]:
# Compute embeddings for all stored passages with the selected model and save the index to disk.
update_index(document_store, embedding_model)

Updating Embedding:   0%|          | 0/4970 [00:00<?, ? docs/s]

Batches:   0%|          | 0/156 [00:00<?, ?it/s]

Save index to ./faiss_indexes_512/model2/index.faiss


In [26]:
# (documents, embeddings): after the update both counts are expected to be equal.
document_store.get_document_count(), document_store.get_embedding_count()

(4970, 4970)

## Load index

In [27]:
# Re-select the model so the loading cells can be run without the index-building cells.
embedding_model = EMBEDDING_MODELS[1]


In [28]:
# Load the saved index and its configuration for the selected model from FAISS_INDEX_DIR.
from haystack.document_stores import FAISSDocumentStore
index_path=f"{FAISS_INDEX_DIR}/{embedding_model[0]}/index.faiss"
config_path=f"{FAISS_INDEX_DIR}/{embedding_model[0]}/config.json"
document_store = FAISSDocumentStore.load(index_path=index_path, config_path=config_path)

# Minimal sanity check: the restored store reports the same index factory string ("Flat")
# that was used when the store was created. This does not validate the index contents.
assert document_store.faiss_index_factory_str == "Flat"

In [29]:
# Number of embeddings in the loaded store; expected to equal the passage count from indexing.
# (Disabled alternative: the number of stored documents.)
# document_store.get_document_count()
document_store.get_embedding_count()

4970

: 