## Code retrieval using Haystack

In [3]:
import pandas as pd

In [4]:
# Root directory of the pre-built FAISS indexes; each embedding model has its
# own sub-directory (named after the model's short name) underneath it.
FAISS_INDEX_DIR = './code_faiss_indexes'

# Available embedding models as (short name, model identifier) pairs.
# The short name selects the index sub-directory; the identifier is what the
# retriever is configured with.
EMBEDDING_MODELS = [
    ("model1", "microsoft/codebert-base"),
]

## Load pre-built index

In [5]:
# Select the (short name, model identifier) pair to work with; element 0 is
# used below to locate the index files and element 1 to configure the retriever.
embedding_model = EMBEDDING_MODELS[0]

In [6]:
# Restore the pre-built FAISS document store for the selected embedding model.
from haystack.document_stores import FAISSDocumentStore

# Both the index file and its config live in the model's own sub-directory.
index_dir = f"{FAISS_INDEX_DIR}/{embedding_model[0]}"
index_path = f"{index_dir}/index.faiss"
config_path = f"{index_dir}/config.json"

document_store = FAISSDocumentStore.load(
    index_path=index_path,
    config_path=config_path,
)

# Sanity check: the restored store must report the "Flat" index factory string.
assert document_store.faiss_index_factory_str == "Flat"



In [7]:
# Show how many documents and how many embeddings the loaded store reports.
document_store.get_document_count(), document_store.get_embedding_count()

(3711, 3711)

## Load retriever

In [8]:
from haystack.nodes import EmbeddingRetriever

# embedding_model is a (short name, model identifier) pair; the retriever is
# configured with the identifier and searches the loaded document store.
retriever_model_id = embedding_model[1]
retriever = EmbeddingRetriever(document_store=document_store, embedding_model=retriever_model_id)

  return self.fget.__get__(instance, owner)()


## Retrieve top k notebooks

In [9]:
# Number of distinct docids to report after pooling.
k = 3
# Retrieve 2*k candidates for the query (6 when k = 3), not a fixed 10.
# The hits are later grouped by doc.meta['name'] with max pooling, so the
# over-fetch presumably compensates for several hits sharing one name — TODO confirm.
query = "congestion control"
retrieved_docs = retriever.retrieve(query=query, top_k=k*2)

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

In [13]:
# Collect the docid (doc.meta['name']) and score of every retrieved hit.
docids = [doc.meta['name'] for doc in retrieved_docs]
scores = [doc.score for doc in retrieved_docs]

# One row per retrieved hit; the same docid may appear on several rows.
df = pd.DataFrame({'docid': docids, 'score': scores})

# Max pooling: keep only the best score observed for each docid.
max_pooled_scores = df.groupby('docid')['score'].max()

# Rank docids by pooled score and keep the best k.
# `k` is intentionally not re-assigned here: it must stay the same value that
# sized the retrieval (top_k=k*2), otherwise the two cells silently diverge
# when k is changed in only one place.
top_k_scores = max_pooled_scores.sort_values(ascending=False).head(k)

# Present the result as a list of {'docid': ..., 'score': ...} dictionaries,
# ordered from highest to lowest pooled score.
output_list = [
    {'docid': docid, 'score': score}
    for docid, score in top_k_scores.items()
]

# Output the top k scores with their corresponding document IDs
print(output_list)


[{'docid': 'NB_3a14cad6ff19be08786f76a00ad5ba569d36449caf98b6babfe6d6e894720b34', 'score': 0.9766805911160776}, {'docid': 'NB_78dbea99683329571780ec1a9c6707fa189c2a5743af22c9dd72c8282921c34b', 'score': 0.9765680264825364}, {'docid': 'NB_7560bc963f57d6df336e1e9df37505293c8d027bc85914aee8b1d9c9e2a0c8e0', 'score': 0.9764630409584535}]


In [15]:
# Display the pooled top-k scores as a Series indexed by docid.
top_k_scores

docid
NB_3a14cad6ff19be08786f76a00ad5ba569d36449caf98b6babfe6d6e894720b34    0.976681
NB_78dbea99683329571780ec1a9c6707fa189c2a5743af22c9dd72c8282921c34b    0.976568
NB_7560bc963f57d6df336e1e9df37505293c8d027bc85914aee8b1d9c9e2a0c8e0    0.976463
Name: score, dtype: float64