In [12]:
# 03-embeddings-index.ipynb
# Purpose: compute embeddings for chunks.jsonl and build a FAISS index.
# Outputs: embeddings/embeddings.npy, embeddings/faiss_index.bin, embeddings/metadata.jsonl


# ----------------------------
# Cell 0 - Header
# ----------------------------
# Notebook: 03 - Embeddings & FAISS Index
# Uses sentence-transformers to compute text embeddings and FAISS to build the retrieval index.

In [4]:
# ----------------------------
# Cell 2 - Setup paths & load chunks
# ----------------------------
from pathlib import Path
import json
# Resolve project locations relative to the parent of the current working
# directory, and make sure the output folder for embeddings exists.
ROOT = Path.cwd().parent
CHUNKS_FILE = ROOT / "data" / "processed" / "chunks.jsonl"
EMB_DIR = ROOT / "embeddings"
EMB_DIR.mkdir(parents=True, exist_ok=True)


# One JSON object per non-blank line of chunks.jsonl.
chunks = []
with open(CHUNKS_FILE, encoding="utf8") as chunk_fh:
    chunks.extend(json.loads(raw) for raw in chunk_fh if raw.strip())


print("Loaded", len(chunks), "chunks")
if chunks:
    first_chunk = chunks[0]
    print("Example id:", first_chunk['id'], "words:", first_chunk['word_count'])

Loaded 69 chunks
Example id: Qutb Minar__0 words: 213


In [5]:
# ----------------------------
# Cell 3 - Choose embedding model
# ----------------------------
# Name of the sentence-transformers model; it is passed to SentenceTransformer(...)
# wherever this notebook builds an encoder.
# For prototyping: all-MiniLM-L6-v2 is small & fast. For higher quality, use all-mpnet-base-v2.
# NOTE: the saved embeddings and FAISS index are tied to this model, so queries
# must be encoded with the same model that produced the index.
EMB_MODEL = "all-MiniLM-L6-v2"
print("Embedding model:", EMB_MODEL)

Embedding model: all-MiniLM-L6-v2


In [8]:
# ----------------------------
# Cell 4 - Compute embeddings
# ----------------------------
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# Collect the raw text of every chunk, preserving the order of `chunks` so that
# row i of the embedding matrix corresponds to chunks[i].
texts = [chunk['text'] for chunk in chunks]
model = SentenceTransformer(EMB_MODEL)
print("Computing embeddings for", len(texts), "texts...")
embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
print("Embeddings shape:", embeddings.shape)

# Persist the matrix so later cells can reload it from disk
emb_path = EMB_DIR / "embeddings.npy"
np.save(str(emb_path), embeddings)
print("Saved embeddings to", emb_path)

Computing embeddings for 69 texts...


Batches: 100%|██████████| 3/3 [00:00<00:00, 31.95it/s]

Embeddings shape: (69, 384)
Saved embeddings to /Users/jaydobariya/Desktop/RAG Project/embeddings/embeddings.npy





In [9]:
# ----------------------------
# Cell 5 - Build FAISS index
# ----------------------------
import faiss

# Reload the embedding matrix from disk; its second axis is the vector dimension.
emb = np.load(str(emb_path))
d = emb.shape[1]
print("Embedding dimension:", d)

# Flat L2 index: simple, exact (brute-force) search. Vectors are cast to
# float32 before being added.
index = faiss.IndexFlatL2(d)
index.add(emb.astype('float32'))
print("Index size:", index.ntotal)

# Write the index next to the embeddings
index_path = EMB_DIR / "faiss_index.bin"
faiss.write_index(index, str(index_path))
print("Saved FAISS index to", index_path)

Embedding dimension: 384
Index size: 69
Saved FAISS index to /Users/jaydobariya/Desktop/RAG Project/embeddings/faiss_index.bin


In [10]:
# ----------------------------
# Cell 6 - Save metadata aligned with vectors
# ----------------------------
meta_file = EMB_DIR / "metadata.jsonl"

# Keys that every chunk must carry (a missing one raises KeyError) versus keys
# that are written as null when absent. Order here is the order in the output.
required_keys = ("id", "title", "word_count")
optional_keys = ("source_file", "chunk_index")

# One JSON line per chunk, written in the same order as `chunks` so that line i
# describes vector i.
with open(meta_file, "w", encoding="utf8") as meta_out:
    for chunk in chunks:
        record = {key: chunk[key] for key in required_keys}
        record.update((key, chunk.get(key)) for key in optional_keys)
        meta_out.write(json.dumps(record, ensure_ascii=False) + "\n")
print("Saved metadata to", meta_file)

Saved metadata to /Users/jaydobariya/Desktop/RAG Project/embeddings/metadata.jsonl


In [11]:
# ----------------------------
# Cell 7 - Retrieval helper (define a function)
# ----------------------------
from sentence_transformers import SentenceTransformer

# Module-level state used by retrieve(): the query encoder, the FAISS index
# reloaded from disk, the per-vector metadata, and a lookup from chunk id to the
# full chunk (for its text).
encoder = SentenceTransformer(EMB_MODEL)
index = faiss.read_index(str(EMB_DIR / "faiss_index.bin"))

# Read metadata line by line inside a context manager so the file handle is
# closed. Iterating the file splits on newlines only; str.splitlines() would
# also split on characters such as U+2028, which json.dumps(ensure_ascii=False)
# writes unescaped and which would therefore cut a record in half.
# Blank lines are skipped, matching how chunks.jsonl is loaded.
with open(EMB_DIR / "metadata.jsonl", encoding="utf8") as meta_fh:
    meta = [json.loads(line) for line in meta_fh if line.strip()]

# retrieve() uses the position returned by the index to look up `meta`, so the
# two must have the same length; fail here rather than return wrong hits later.
assert len(meta) == index.ntotal, (
    f"metadata rows ({len(meta)}) do not match index vectors ({index.ntotal})"
)

chunks_by_id = {c['id']: c for c in chunks}

def retrieve(query, k=4):
    """Return up to ``k`` indexed chunks closest to ``query``.

    The query string is embedded with the module-level ``encoder`` and searched
    against the module-level ``index``. Results keep the order returned by the
    index.

    Args:
        query: Text to search for.
        k: Number of neighbours requested from the index.

    Returns:
        A list of dicts with keys ``id``, ``title``, ``distance`` (the value
        reported by the index, as a Python float) and ``text`` (empty string
        when the chunk id is not present in ``chunks_by_id``).
    """
    query_vec = encoder.encode([query], convert_to_numpy=True).astype('float32')
    distances, positions = index.search(query_vec, k)

    hits = []
    for distance, position in zip(distances[0], positions[0]):
        # Negative positions mark slots with no result; only keep real hits.
        if position >= 0:
            record = meta[position]
            source_chunk = chunks_by_id.get(record['id'], {})
            hits.append({
                "id": record['id'],
                "title": record['title'],
                "distance": float(distance),
                "text": source_chunk.get('text', ''),
            })
    return hits

# Quick test: run one query and show the top hits. The value printed after
# "score:" is the distance stored by retrieve() under the "distance" key.
print("Test retrieve for 'white marble mausoleum in Agra':")
res = retrieve("white marble mausoleum in Agra", k=3)
for hit in res:
    # First 200 characters of the chunk, flattened onto one line
    snippet = hit['text'][:200].replace('\n',' ')
    print("-", hit['id'], "score:", hit['distance'], "title:", hit['title'])
    print(" snippet:", snippet, "...\n")

# ----------------------------
# Cell 8 - Next steps note
# ----------------------------
print("03-embeddings-index notebook complete. Files saved to embeddings/.")
print("Next: integrate retrieve() in rag_engine and test generation.")


Test retrieve for 'white marble mausoleum in Agra':
- Taj Mahal__0 score: 0.7491079568862915 title: Taj Mahal
 snippet: The Taj Mahal ( TAHJ mə-HAHL, TAHZH -⁠; Hindustani: [taːdʒ ˈmɛɦ(ɛ)l]; lit. 'Crown of the Palace') is an ivory-white marble mausoleum on the right bank of the river Yamuna in Agra, Uttar Pradesh, India ...

- Taj Mahal__4 score: 0.7501313090324402 title: Taj Mahal
 snippet: The Taj Mahal incorporates and expands on design traditions of Indo-Islamic and Mughal architecture. Inspirations for the building came from Timurid and Mughal buildings including the Gur-e Amir in Sa ...

- Taj Mahal__20 score: 0.7874093055725098 title: Taj Mahal
 snippet: The tomb complex was built mainly using brick and lime mortar. The external surface of the main tomb building and the interior of the main cenotaph chamber is veneered with white marble. The other int ...

03-embeddings-index notebook complete. Files saved to embeddings/.
Next: integrate retrieve() in rag_engine and test generati