# Build FAISS indexes for code
Ref: https://github.com/deepset-ai/haystack-tutorials/blob/main/tutorials/06_Better_Retrieval_via_Embedding_Retrieval.ipynb

Ref: https://docs.haystack.deepset.ai/docs/retriever#documentstore-compatibility

In [1]:
import os

In [2]:
# Inputs: the directory of source JSON files and the merged JSON file built from them.
CODE_DIR = '../data/code/ast_analysis'
CODE_FILE = '../preprocessed_data/codes.json'

# Outputs: saved FAISS indexes and the SQLite databases backing the document stores.
FAISS_INDEX_DIR = './code_faiss_indexes_512'
FAISS_DB_DIR = './code_faiss_db_512'

# Make sure both output directories exist; already-present ones are left alone.
for output_dir in (FAISS_DB_DIR, FAISS_INDEX_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Each entry is (key, model name): the key names the per-model output
# subdirectory, and the model name is what gets passed to the retriever.
EMBEDDING_MODELS = [
    ("model1", "microsoft/codebert-base"),
    ("model2", "flax-sentence-embeddings/st-codesearch-distilroberta-base"),
    ("model3", "sentence-transformers/multi-qa-mpnet-base-dot-v1"),
    ("model4", "sentence-transformers/all-mpnet-base-v2"),
]

## Code preprocessing
- Merge the per-file JSON inputs into a single JSON file in the expected input format
- Convert the JSON to `document` objects
- Split the documents into passages
- Index the passages in the `document_store`

In [4]:
import json

# Directory containing the input JSON files (each file yields at most one record).
input_dir = CODE_DIR

# Path of the merged JSON file written at the end of this cell.
output_file = CODE_FILE

# Collect a {"docid", "content"} record from every *.json file in input_dir.
# The listing is sorted so that the record order -- and the passage numbering
# derived from it further down -- is the same on every run.
data = []
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".json"):
        continue
    code_path = os.path.join(input_dir, filename)
    with open(code_path, "r", encoding="utf-8") as f:
        code_data = json.load(f)
    try:
        data.append({
            "docid": code_data["docid"],
            "content": code_data["code_clean"],
        })
    except (KeyError, TypeError) as e:
        # Best effort: a file without the expected fields is skipped rather than
        # aborting the run, but report why so the gap is visible.
        print(f"Skipping {filename}: {type(e).__name__}: {e}")

# Make sure the destination directory exists before writing the merged file.
output_parent = os.path.dirname(output_file)
if output_parent:
    os.makedirs(output_parent, exist_ok=True)

# Write the extracted data to the output JSON file
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f)

len(data)

3787

## Create document store

In [5]:
from haystack.nodes import JsonConverter

# Convert the merged JSON file into document objects for the following steps.
converter = JsonConverter()
docs = converter.convert(CODE_FILE)
# Expected to equal len(data) from the extraction cell -- presumably one
# document per record; verify if the two counts ever differ.
len(docs)



3787

In [6]:
from haystack.nodes import PreProcessor

# Split every document into word-based passages (split_length=512, no overlap).
# NOTE(review): this run logs "sentences whose word count is higher than the
# split length" and several passages longer than 10000 characters. Source code
# has few sentence boundaries, so split_respect_sentence_boundary=True may be
# what lets passages grow past split_length -- confirm, and consider False.
# NOTE(review): clean_whitespace=True may strip leading indentation, which is
# significant in Python code -- confirm this is acceptable for the embeddings.
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=512,
    split_respect_sentence_boundary=True,
    split_overlap=0
)

# `passages` is consumed by the "Write documents" cell below.
passages = processor.process(docs)
len(passages)

Preprocessing:   0%|          | 0/3787 [00:00<?, ?docs/s]

We found one or more sentences whose word count is higher than the split length.
Document 1336ab1e1938214aa2dd50232dae172d is 17190 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document fa0c780fa6065cd7a7129ef2238097d is 11487 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document b11ba5ce51b4cc26c01e7219e69d6a29 is 43727 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document f5f5d1c8ea591ce103fc346bc2e04080 is 35611 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to preven

7129

In [7]:
# Spot-check a single passage to see what the split text looks like
# (index 487 appears to be an arbitrary sample).
print(passages[487].content)

import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

rcParams['figure.figsize'] = 22, 10

RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
df = pd.read_csv('spx.csv', parse_dates=['date'], index_col='date')
df.head()
plt.plot(df, label='close price')
plt.legend();
train_size = int(len(df) * 0.95)
test_size = len(df) - train_size
train, test = df.iloc[0:train_size], df.iloc[train_size:len(df)]
print(train.shape, test.shape)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler = scaler.fit(train[['close']])

train['close'] = scaler.transform(train[['close']])
test['close'] = scaler.transform(test[['close']])
def create_dataset(X, y, ti

## Write documents

In [15]:
%%capture
embedding_model = EMBEDDING_MODELS[3]

# Write documents
from haystack.document_stores import FAISSDocumentStore
os.makedirs(f"{FAISS_DB_DIR}/{embedding_model[0]}", exist_ok=True)
document_store = FAISSDocumentStore(sql_url=f"sqlite:///{FAISS_DB_DIR}/{embedding_model[0]}/faiss_base.db", faiss_index_factory_str = "Flat")

for i, passage in enumerate(passages): 
    docid = passage.meta['docid']
    passage_docid = f"{docid}_passage{i}"
    index_document = {
        "id": passage_docid,
        "content": passage.content,
        "meta": {
            "name": docid,
            "passage_number": i,
        },
    }
    document_store.write_documents([index_document])

In [16]:
# Show which (key, model name) pair is currently selected.
embedding_model

('model4', 'sentence-transformers/all-mpnet-base-v2')

In [17]:
# Number of documents written to the store; should equal len(passages).
document_store.get_document_count()

7129

In [18]:
# Still 0 at this point: embeddings are only computed by update_embeddings()
# in the "Write embeddings" section below.
document_store.get_embedding_count()


0

## Write embeddings

In [19]:
# Update embeddings
from haystack.nodes import EmbeddingRetriever

def update_index(document_store, embedding_model):
    """Embed every stored document with the given model and save the FAISS index.

    Args:
        document_store: Store whose documents get embedded; it must provide
            ``update_embeddings`` and ``save``.
        embedding_model: Indexable pair. Item 0 names the output subdirectory
            under ``FAISS_INDEX_DIR``; item 1 is handed to
            ``EmbeddingRetriever`` as its ``embedding_model``.

    Side effects:
        Creates the output subdirectory if needed, writes ``index.faiss`` and
        ``config.json`` into it, and prints the index path.
    """
    embedder = EmbeddingRetriever(
        embedding_model=embedding_model[1],
        document_store=document_store,
    )

    # Creating the retriever does not embed anything by itself:
    # update_embeddings() walks over all previously written documents and
    # computes their vectors. This can be slow on a large corpus but is a
    # one-off cost; at query time only the query has to be embedded.
    document_store.update_embeddings(embedder)

    # Persist the index and its config under FAISS_INDEX_DIR/<key>/.
    index_path=f"{FAISS_INDEX_DIR}/{embedding_model[0]}/index.faiss"
    config_path=f"{FAISS_INDEX_DIR}/{embedding_model[0]}/config.json"
    os.makedirs(f"{FAISS_INDEX_DIR}/{embedding_model[0]}", exist_ok=True)
    document_store.save(index_path=index_path, config_path=config_path)

    print(f"Save index to {index_path}")


In [20]:
# Embed all stored passages with the selected model and save the index to disk.
update_index(document_store, embedding_model)

Updating Embedding:   0%|          | 0/7129 [00:00<?, ? docs/s]

Batches:   0%|          | 0/223 [00:00<?, ?it/s]

Save index to ./code_faiss_indexes_512/model4/index.faiss


In [21]:
# After the update the two counts should match: one embedding per document.
document_store.get_document_count(), document_store.get_embedding_count()

(7129, 7129)

## Load index

In [4]:
# Choose which saved index to load.
# NOTE(review): the build cells above were only run for EMBEDDING_MODELS[3]
# ("model4"), and the execution counts restart here, so this section appears
# to come from a separate session. The "model2" index must already exist on
# disk from an earlier run -- confirm before relying on Run All.
embedding_model = EMBEDDING_MODELS[1]

In [5]:
# Load index
from haystack.document_stores import FAISSDocumentStore
index_path=f"{FAISS_INDEX_DIR}/{embedding_model[0]}/index.faiss"
config_path=f"{FAISS_INDEX_DIR}/{embedding_model[0]}/config.json"
document_store = FAISSDocumentStore.load(index_path=index_path, config_path=config_path)

# Check if the DocumentStore is loaded correctly
assert document_store.faiss_index_factory_str == "Flat"



In [6]:
# Sanity check on the loaded store: the embedding count should equal the
# number of passages that were indexed.
# document_store.get_document_count()
document_store.get_embedding_count()

7129