In [None]:
import faiss

In [None]:
# Load the three JSON corpora (docs, discussions, source code) and flatten
# them into two parallel lists: documents[i] is the text to embed and
# doc_links[i] is the URL / file path that text came from.
import json
import os
doc_links = []
doc_dir = "../data/final_data"
# NOTE(review): only the documentation JSON is appended to doc_jsons below;
# confirm whether the other two corpora were meant to be collected as well.
doc_jsons = []
# handle documentation json
# encoding is pinned to UTF-8 so decoding does not depend on the OS locale.
with open(os.path.join(doc_dir, "lightning_docs_cleaned.json"), 'r', encoding="utf-8") as f:
    data = json.load(f)
    doc_jsons.append(data)
    doc_links += [d["url_html"] for d in data]
    doc_docs = [f"{d['title']}\n{d['text']}" for d in data]

# handle discussions json
with open(os.path.join(doc_dir, "discussions.json"), 'r', encoding="utf-8") as f:
    data = json.load(f)
    doc_links += [d["url"] for d in data]
    # Inner keys use single quotes: reusing the outer double quote inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    # NOTE(review): assumes every discussion has a non-null "answer" object.
    disc_docs = [f"{d['title']}\n {d['bodyText']} \n Answer: {d['answer']['bodyText']}" for d in data]

# handle src code json
with open(os.path.join(doc_dir, "src_filtered_data.json"), 'r', encoding="utf-8") as f:
    data = json.load(f)
    doc_links += [d["file"] for d in data]
    src_docs = [f"{d['text']}" for d in data]


# Concatenation order must match the order in which doc_links was extended,
# otherwise index i would point at the wrong link.
documents = doc_docs + disc_docs + src_docs
assert len(doc_links) == len(documents), "doc_links and documents are out of sync"
print(f"Total documents: {len(documents)}")

Total documents: 2240


In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence encoder; the same `model` object is reused later to embed queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the whole corpus in one call and keep the result as a NumPy array.
embeddings = model.encode(
    documents,
    show_progress_bar=True,
    convert_to_numpy=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 70/70 [00:12<00:00,  5.60it/s]


In [12]:
# Build a FAISS L2-distance index over the document embeddings.
# assumes `embeddings` is 2-D with one row per document — TODO confirm
d = embeddings.shape[1]  # embedding dimension (size of axis 1)
index = faiss.IndexFlatL2(d)  # simple L2 distance index
# Row i of `embeddings` is added as vector i.
index.add(embeddings)
# index.ntotal is reported as the number of indexed documents.
print(f"Indexed {index.ntotal} documents.")

Indexed 2240 documents.


In [15]:
# Load the evaluation queries and embed them with the same `model` that
# produced the document embeddings.
# encoding is pinned to UTF-8 so non-ASCII query text is decoded the same
# way on every platform instead of using the locale's default codec.
with open("../requests/richa_requests.json", encoding="utf-8") as f:
    queries = json.load(f)

# Each entry is read through its "query" key; other keys are ignored here.
query_texts = [q["query"] for q in queries]

query_embeddings = model.encode(query_texts, convert_to_numpy=True)



In [16]:
k = 5  # how many neighbours to fetch for each query

# D holds the distances and I the document indices, one row per query.
D, I = index.search(query_embeddings, k)

# Walk the queries and their result rows together, showing a short preview
# (first 150 characters) of each retrieved document.
for query, neighbour_ids in zip(query_texts, I):
    print(f"\nQuery: {query}")
    for rank, doc_idx in enumerate(neighbour_ids):
        print(f"  Rank {rank+1}: {documents[doc_idx][:150]}...")



Query: Training is taking a long time. How do I speed up the training for multiple datasets?
  Rank 1: Multiple Sequential trainings slows down speed
 Hi.
I have a task where I need to run a training script multiple time with a for-loop like this:
for d...
  Rank 2: Find bottlenecks in your code (basic)
## Find bottlenecks in your code (basic)
**Audience**: Users who want to learn the basics of removing bottleneck...
  Rank 3: --- Meta Data ---
Repo: pytorch-lightning
Path: src\lightning\pytorch\trainer\trainer.py
Function Name: estimated_stepping_batches
Language: python
Pa...
  Rank 4: Training seems to pause every N steps
 I am doing feature extraction using an efficientnet_b0 model. The training process works fine but it seems to p...
  Rank 5: Debug your model (intermediate)
## Debug your model (intermediate)
**Audience**: Users who want to debug their ML code

----

### Why should I debug M...

Query: What do I need to pass into LightningCLI() to get it working? Also, please giv

In [None]:
def recall_at_k(I, ground_truth, k):
    """Return the fraction of queries with at least one relevant hit in the top k.

    Note that this is a per-query hit rate: a query counts as 1 as soon as any
    of its first ``k`` retrieved ids is in its ground-truth set, regardless of
    how many relevant documents exist.

    Args:
        I: Iterable of per-query retrieved document ids, best match first
            (e.g. the index array returned by the search call).
        ground_truth: Sequence where ``ground_truth[q]`` is an iterable of the
            relevant document ids for query ``q``.
        k: Number of top results to consider for each query.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``I`` contains no queries.

    Raises:
        IndexError: If ``ground_truth`` has fewer entries than ``I``.
    """
    hits = 0
    total = 0
    for q_idx, retrieved in enumerate(I):
        relevant = set(ground_truth[q_idx])
        hits += any(doc in relevant for doc in retrieved[:k])
        total += 1
    # With no queries there is nothing to average; avoid ZeroDivisionError.
    return hits / total if total else 0.0


In [None]:
# Collect, for every query, the retrieved documents with their ids, source
# links and distances, then dump everything to a JSON file.
results = []
for q_idx, query in enumerate(query_texts):
    # Convert the NumPy row once; plain ints are needed for JSON and indexing.
    doc_ids = I[q_idx].tolist()
    result = {
        "query": query,
        "top_docs": [documents[i] for i in doc_ids],
        "doc_ids": doc_ids,
        # One link per retrieved document, in the same order as doc_ids.
        "doc_links": [doc_links[i] for i in doc_ids],
        "distances": D[q_idx].tolist(),
    }
    results.append(result)

with open("retrieval_results.json", "w") as f:
    json.dump(results, f, indent=2)
