# Build FAISS indexes using Haystack
Ref: https://github.com/deepset-ai/haystack-tutorials/blob/main/tutorials/06_Better_Retrieval_via_Embedding_Retrieval.ipynb

Ref: https://docs.haystack.deepset.ai/docs/retriever#documentstore-compatibility

In [1]:
import os

In [2]:
# Directory of per-notebook JSON files that are merged into DOCS_FILE.
NOTEBOOK_DIR = '../data/evaluation/notebooks/notebooks_contents'
# Merged JSON file (one {"docid", "content"} record per input file) read by the converter.
DOCS_FILE = '../preprocessed_data/docs.json'
# Root directory for saved FAISS index/config files, one sub-directory per model key.
# NOTE(review): the "_100" suffix presumably mirrors split_length=100 used below — confirm.
FAISS_INDEX_DIR = './faiss_indexes_100'
# Root directory for the SQLite databases backing each FAISS document store.
FAISS_DB_DIR = './faiss_db_100'

In [3]:
# (key, model name) pairs. The key names the per-model sub-directories under
# FAISS_DB_DIR / FAISS_INDEX_DIR; the model name is passed to EmbeddingRetriever.
EMBEDDING_MODELS = [("model1", "sentence-transformers/multi-qa-mpnet-base-dot-v1"),
                    ("model2", "sentence-transformers/all-mpnet-base-v2")]


In [4]:
# Model used by the build cells below; change the index to build for another model.
embedding_model = EMBEDDING_MODELS[0]
embedding_model

('model1', 'sentence-transformers/multi-qa-mpnet-base-dot-v1')

## Text preprocessing (First run only)
- Extract the relevant fields from the notebook JSON files into a single input JSON file
- Convert the JSON file to Haystack `Document` objects
- Split the documents into passages
- Index the passages into the `document_store`

In [6]:
import json 

# Merge the per-notebook JSON files into the single JSON file read by the converter.
input_dir = NOTEBOOK_DIR    # directory containing the input JSON files
output_file = DOCS_FILE     # path of the merged output JSON file

# Collect the fields needed for indexing from every input file.
# The listing is sorted because os.listdir() returns entries in arbitrary order;
# a stable order keeps the merged file (and, presumably, the passage numbering
# derived from it later) reproducible across runs and machines.
data = []
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(input_dir, filename)
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    data.append({
        "docid": json_data["docid"],
        "content": json_data["md_text_clean"],
    })

# Make sure the output directory exists, then write the extracted records.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f)


## Create document store

In [5]:
from haystack.nodes import JsonConverter

# Convert the records in DOCS_FILE into documents for preprocessing.
converter = JsonConverter()
docs = converter.convert(DOCS_FILE)
# Displayed value: number of documents produced by the converter.
len(docs)

  from .autonotebook import tqdm as notebook_tqdm


3828

In [6]:
from haystack.nodes import PreProcessor

# Clean the converted documents and split them into passages before indexing.
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,  # target passage size, counted in the unit given by split_by
    # With sentence boundaries respected, a passage can exceed split_length when a
    # single sentence is longer than that (see the warning this cell emits).
    split_respect_sentence_boundary=True,
    split_overlap=0
)

passages = processor.process(docs)
# Displayed value: total number of passages produced from all documents.
len(passages)

Preprocessing:   0%|          | 0/3828 [00:00<?, ?docs/s]We found one or more sentences whose word count is higher than the split length.
Preprocessing:   4%|▎         | 134/3828 [00:00<00:02, 1339.30docs/s]Document 58836705291eec0981723eb66ba70310 is 23479 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 98ce7ef4594f631e5cf6ba3194bbaac5 is 57219 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Preprocessing:  12%|█▏        | 443/3828 [00:00<00:02, 1483.10docs/s]Document 6ea3852f87501e1f0e1fa201dffe34df is 26578 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 2e6c46b

20843

In [7]:
%%capture
# Write documents
from haystack.document_stores import FAISSDocumentStore

# One SQLite database per embedding model, stored under FAISS_DB_DIR/<model key>/.
db_dir = f"{FAISS_DB_DIR}/{embedding_model[0]}"
os.makedirs(db_dir, exist_ok=True)
document_store = FAISSDocumentStore(sql_url=f"sqlite:///{db_dir}/faiss_base.db", faiss_index_factory_str = "Flat")

# Build all records first and write them in a single call: write_documents()
# accepts a list, so one batched write avoids a separate call per passage.
index_documents = []
for i, passage in enumerate(passages):
    docid = passage.meta['docid']
    index_documents.append({
        # `i` counts passages across ALL documents, which keeps the id unique.
        "id": f"{docid}_passage{i}",
        "content": passage.content,
        "meta": {
            "name": docid,
            "passage_number": i,
        },
    })

# Skip the write entirely when there is nothing to index.
if index_documents:
    document_store.write_documents(index_documents)

In [8]:
# Embedding dimension configured on the document store.
# NOTE(review): presumably this has to match the output size of the embedding
# model used when the embeddings are written — confirm for each model.
document_store.embedding_dim

768

In [9]:
# (document count, embedding count); no embeddings are expected before update_index() runs.
document_store.get_document_count(), document_store.get_embedding_count()

(20843, 0)

## Write embeddings

In [10]:
# Update embeddings
from haystack.nodes import EmbeddingRetriever

def update_index(document_store, embedding_model):
    """Embed the stored documents with the given model and save the FAISS index.

    Args:
        document_store: Store holding the previously written documents; its
            `update_embeddings()` and `save()` methods are called here.
        embedding_model: `(key, model name)` pair. The model name is handed to
            `EmbeddingRetriever`; the key names the sub-directory of
            `FAISS_INDEX_DIR` that receives `index.faiss` and `config.json`.
    """
    model_name = embedding_model[1]
    retriever = EmbeddingRetriever(document_store=document_store, embedding_model=model_name)

    # Embedding every indexed document can take a while on a large corpus, but it
    # is a one-off cost: at query time only the query itself has to be embedded
    # and compared against the stored document embeddings.
    document_store.update_embeddings(retriever)

    # Persist the index and its config under FAISS_INDEX_DIR/<model key>/.
    output_dir = f"{FAISS_INDEX_DIR}/{embedding_model[0]}"
    os.makedirs(output_dir, exist_ok=True)
    index_path = f"{output_dir}/index.faiss"
    config_path = f"{output_dir}/config.json"

    document_store.save(index_path=index_path, config_path=config_path)
    print(f"Save index to {index_path}")


In [11]:
# Compute embeddings for the selected model and save the resulting index to disk.
update_index(document_store, embedding_model)

Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 78.8kB/s]
Downloading (…)16ebc/.gitattributes: 100%|██████████| 737/737 [00:00<00:00, 1.06MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 319kB/s]
Downloading (…)b6b5d16ebc/README.md: 100%|██████████| 8.65k/8.65k [00:00<00:00, 13.8MB/s]
Downloading (…)b5d16ebc/config.json: 100%|██████████| 571/571 [00:00<00:00, 934kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 184kB/s]
Downloading (…)ebc/data_config.json: 100%|██████████| 25.5k/25.5k [00:00<00:00, 32.4MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:00<00:00, 773MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 79.8kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 404kB/s]
Downloading (…)16ebc/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.94MB/s]
Downloading (…)okenizer_config.json: 100%|████████

KeyboardInterrupt: 

In [None]:
# (document count, embedding count); the two are expected to match once update_index() has completed.
document_store.get_document_count(), document_store.get_embedding_count()

## Load index

In [None]:
# Select the model whose saved index is loaded below.
# NOTE(review): the build cells above use EMBEDDING_MODELS[0]; an index for this
# model must already exist under FAISS_INDEX_DIR for the load to work.
embedding_model = EMBEDDING_MODELS[1]


In [None]:
# Load the saved index and config for the selected model
from haystack.document_stores import FAISSDocumentStore

# Both files live in the same per-model directory under FAISS_INDEX_DIR.
model_index_dir = f"{FAISS_INDEX_DIR}/{embedding_model[0]}"
index_path = f"{model_index_dir}/index.faiss"
config_path = f"{model_index_dir}/config.json"
document_store = FAISSDocumentStore.load(index_path=index_path, config_path=config_path)

# Sanity check: the restored store must report the "Flat" index factory string
assert document_store.faiss_index_factory_str == "Flat"

In [None]:
# Number of embeddings held by the loaded document store.
document_store.get_embedding_count()