# Build FAISS indexes for text and code
Ref: https://github.com/deepset-ai/haystack-tutorials/blob/main/tutorials/06_Better_Retrieval_via_Embedding_Retrieval.ipynb

Ref: https://docs.haystack.deepset.ai/docs/retriever#documentstore-compatibility

In [1]:
import os

In [8]:
# Inputs: per-notebook code JSON files and the notebook-content JSON files with matching names.
CODE_DIR = '../data/code/ast_analysis'
NOTEBOOK_DIR = '../data/evaluation/notebooks/notebooks_contents'
# Merged text+code records: written by the preprocessing cell, read back by the converter.
TEXT_CODE_FILE = '../preprocessed_data/texts_codes.json'
# Outputs: saved FAISS index/config files and the SQLite databases of the document stores.
FAISS_INDEX_DIR = './text_code_faiss_indexes_512'
FAISS_DB_DIR = './text_code_faiss_db_512'

# Both output directories must exist before anything is written into them.
for output_dir in (FAISS_DB_DIR, FAISS_INDEX_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [9]:
# (alias, model identifier) pairs: the identifier is handed to the retriever,
# the alias names the per-model sub-directories for the database and the index.
EMBEDDING_MODELS = [
    ("model1", "sentence-transformers/multi-qa-mpnet-base-dot-v1"),
    ("model2", "sentence-transformers/all-mpnet-base-v2"),
]

## Text+Code preprocessing
- Merge each notebook's markdown text and code into a single input JSON file
- Convert the JSON file to Haystack `Document` objects
- Split the documents into passages
- Index the passages in the `document_store`

In [13]:
import json

# Directories holding the per-notebook code JSON files and the notebook-content JSON files
input_dir = CODE_DIR
notebook_dir = NOTEBOOK_DIR

# Path of the merged output JSON file
output_file = TEXT_CODE_FILE

# Pair every code JSON file with the notebook JSON file of the same name and merge the
# cleaned markdown text and the cleaned code into one record per notebook.
data = []
skipped = []
# Sort the listing: os.listdir() returns entries in arbitrary order, and the record order
# determines the order of everything derived from this file later on.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".json"):
        continue
    code_path = os.path.join(input_dir, filename)
    notebook_path = os.path.join(notebook_dir, filename)
    # A broken code file is still a hard error, exactly as before.
    with open(code_path, "r", encoding="utf-8") as f:
        code_data = json.load(f)
    try:
        with open(notebook_path, "r", encoding="utf-8") as f:
            text_data = json.load(f)
        data.append({
            "docid": code_data["docid"],
            "content": text_data["md_text_clean"] + code_data["code_clean"],
        })
    except (OSError, ValueError, KeyError, TypeError) as e:
        # Best effort: skip notebooks whose text file is missing, unreadable, malformed or
        # lacks an expected field, but say why so the cause can be told apart afterwards.
        skipped.append(filename)
        print(f"{filename}: {type(e).__name__}: {e}")

# Create the output directory if needed, then write the merged records.
output_parent = os.path.dirname(output_file)
if output_parent:
    os.makedirs(output_parent, exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f)
print(f"Merged {len(data)} notebooks, skipped {len(skipped)}")


NB_4330c7302d41a5a6b86058d8869cc003c99b5bfec99e110131c28b3584dd9613.json
NB_4031df746a545c5fffb3ca3357d5993ab8e6a0f45e08470776eaf1e36bccf6b0.json
NB_111b867c0a5019557693d6f4413766e3c00be7bf251ed261a8f64093efac01fa.json
NB_0e690547ce73697428dec182adfc0c09e1842d672119f2944bed38da29006c06.json
NB_c155d931d271eda500beafb47306753e4dacd11730c0ce7cb5bb64fee2d9e59f.json
NB_20ca8b97922013a0bb8aa3c9ecd4e3a7c9854e53e3b63f9ca0046982c1037c7a.json
NB_b36aabc7ff26adb52b75f00b8da5321ccca19cf1a72cdcefa91bd4acfb1fcc50.json
NB_998c37e6a49791984a5164e5af1e3d1d9b1885f982afec23e81041b949bfe96f.json
NB_cc07eb76d3b77b1d6888de52e33a1a560939956d6db4134fbd436a761cbe2e45.json
NB_bf8716686697b792430345446e40203f6df3d6b5dc77d1981b2fad59f152be49.json
NB_b880c17ddb7db5f7d5c9e684cce55801b2cea7b3792ed80b8141ba121d2ac6b6.json
NB_0e2c4638e7b1517b0926767556555cde873dd7db2a80ee7ccd504f3de4a76485.json
NB_0cc983f8d529d28af703aff32317b2f7d3f8d67428be55825cdc6fddc389ae20.json
NB_b12b88380b8e52d93a00ff0fd74278bfbc5ab71703fc50ed

## Create document store

In [4]:
from haystack.nodes import JsonConverter

# Convert the merged JSON file into Haystack documents.
# NOTE(review): the indexing cell later reads passage.meta['docid'], so this presumably relies on
# the converter carrying the extra "docid" field over into each document's meta - confirm.
converter = JsonConverter()
docs = converter.convert(TEXT_CODE_FILE)
# Number of documents produced by the conversion.
len(docs)



3661

In [5]:
from haystack.nodes import PreProcessor

# Cleaning and splitting options, gathered in one place so they are easy to review.
# NOTE(review): the run log warns about sentences longer than the split length and about
# passages above the 10000-character check; presumably code-heavy content has few sentence
# boundaries, so split_respect_sentence_boundary=True leaves such chunks unsplit - confirm
# that this is acceptable for retrieval.
preprocessor_options = dict(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=512,
    split_respect_sentence_boundary=True,
    split_overlap=0,
)
processor = PreProcessor(**preprocessor_options)

# Split every document into passages and show how many were produced.
passages = processor.process(docs)
len(passages)

Preprocessing:   0%|          | 0/3661 [00:00<?, ?docs/s]

We found one or more sentences whose word count is higher than the split length.
Document 93fff83d3e36661892473d7eec47fb23 is 10382 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 1336ab1e1938214aa2dd50232dae172d is 17190 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document fa0c780fa6065cd7a7129ef2238097d is 11487 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 430c7c1aff3e5f93151ca72ff4a07e58 is 44093 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to preven

9766

## Write documents

In [13]:
%%capture

embedding_model = EMBEDDING_MODELS[1]

from haystack.document_stores import FAISSDocumentStore
os.makedirs(f"{FAISS_DB_DIR}/{embedding_model[0]}", exist_ok=True)
document_store = FAISSDocumentStore(sql_url=f"sqlite:///{FAISS_DB_DIR}/{embedding_model[0]}/faiss_base.db", faiss_index_factory_str = "Flat")

for i, passage in enumerate(passages): 
    docid = passage.meta['docid']
    passage_docid = f"{docid}_passage{i}"
    index_document = {
        "id": passage_docid,
        "content": passage.content,
        "meta": {
            "name": docid,
            "passage_number": i,
        },
    }
    document_store.write_documents([index_document])

In [14]:
# Number of documents in the store; should equal the number of passages produced by the split.
document_store.get_document_count()

9766

In [15]:
# Display the (alias, model identifier) pair selected for this run.
embedding_model

('model2', 'sentence-transformers/all-mpnet-base-v2')

In [16]:
# Number of documents that already have an embedding. Embeddings are only computed later by
# update_embeddings(), so this is expected to be 0 at this point.
document_store.get_embedding_count()


0

## Write embeddings

In [17]:
# Update embeddings
from haystack.nodes import EmbeddingRetriever

def update_index(document_store, embedding_model):
    """Embed every document in the store and save the resulting index to disk.

    Args:
        document_store: Store whose documents are embedded; it must provide
            ``update_embeddings`` and ``save``.
        embedding_model: Sequence whose item 1 is the model identifier given to the
            retriever and whose item 0 is the alias naming the sub-directory of
            ``FAISS_INDEX_DIR`` that receives ``index.faiss`` and ``config.json``.
    """
    retriever = EmbeddingRetriever(
        embedding_model=embedding_model[1],
        document_store=document_store,
    )

    # Re-embeds all previously written documents. This can take a while on a large corpus,
    # but it is a one-off cost: at query time only the query has to be embedded and compared
    # with the stored document embeddings.
    document_store.update_embeddings(retriever)

    # Persist the index and its config in a per-model directory.
    model_dir = f"{FAISS_INDEX_DIR}/{embedding_model[0]}"
    os.makedirs(model_dir, exist_ok=True)
    index_path = f"{model_dir}/index.faiss"
    config_path = f"{model_dir}/config.json"

    document_store.save(index_path=index_path, config_path=config_path)
    print(f"Save index to {index_path}")


In [18]:
# Compute embeddings for all stored passages and save the index for the selected model.
update_index(document_store, embedding_model)

  return self.fget.__get__(instance, owner)()


Updating Embedding:   0%|          | 0/9766 [00:00<?, ? docs/s]

Batches:   0%|          | 0/306 [00:00<?, ?it/s]

Save index to ./text_code_faiss_indexes_512/model2/index.faiss


In [19]:
# After the update both counts should agree, i.e. every stored document has an embedding.
document_store.get_document_count(), document_store.get_embedding_count()

(9766, 9766)

## Load index

In [23]:
# Switch to the first model before loading its saved index.
# NOTE(review): the cells above only build and save the index for EMBEDDING_MODELS[1]; the
# index files for this model must already exist on disk from an earlier run - confirm.
embedding_model = EMBEDDING_MODELS[0]

In [24]:
# Restore the saved document store of the selected embedding model
from haystack.document_stores import FAISSDocumentStore

model_index_dir = f"{FAISS_INDEX_DIR}/{embedding_model[0]}"
index_path = f"{model_index_dir}/index.faiss"
config_path = f"{model_index_dir}/config.json"
document_store = FAISSDocumentStore.load(index_path=index_path, config_path=config_path)

# Sanity check: the restored store reports the index type used when it was created
assert document_store.faiss_index_factory_str == "Flat"

In [29]:
# document_store.get_document_count()
# Number of embeddings held by the loaded index.
# NOTE(review): compare this with the passage count of the current corpus; a different number
# would suggest the loaded index was built from another corpus or split setting - confirm.
document_store.get_embedding_count()

3711