In [1]:
# 04-clip-index.ipynb
# Cell 1 - install (if needed)
# import importlib, sys, subprocess
# pkgs = ["sentence-transformers", "faiss-cpu", "tqdm", "numpy"]
# missing = [p for p in pkgs if importlib.util.find_spec(p) is None]
# if missing:
#     subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing)

# Cell 2 - paths & load chunks
from pathlib import Path
import json

# ROOT is the parent of the current working directory, so every path below
# depends on where the kernel was started.
# NOTE(review): presumably the notebook lives one level below the project
# root (e.g. in a notebooks/ folder) — confirm before running elsewhere.
ROOT = Path.cwd().parent
CHUNKS_FILE = ROOT / "data" / "processed" / "chunks.jsonl"
OUT_DIR = ROOT / "embeddings"   # clip_text_embeddings.npy and clip_faiss_index.bin are stored here
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Read one JSON object per non-blank line. A malformed line is re-raised as
# the same exception type, but with the file path and line number attached
# so the offending record can be found without bisecting the file.
chunks = []
with open(CHUNKS_FILE, encoding="utf8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue
        try:
            chunks.append(json.loads(line))
        except json.JSONDecodeError as exc:
            raise json.JSONDecodeError(
                f"{exc.msg} (in {CHUNKS_FILE}, line {line_no})", exc.doc, exc.pos
            ) from exc
print("Loaded", len(chunks), "chunks")

Loaded 69 chunks


In [2]:
# Cell 3 - load the CLIP model through sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# Per the original author's note, the Hugging Face id
# 'openai/clip-vit-base-patch32' is not usable directly with
# sentence-transformers, so the hub checkpoint "clip-ViT-B-32" is used.
# NOTE(review): CLIP text encoders presumably truncate input at a short
# token limit, so long chunks may be embedded from their opening words
# only — confirm against the model's max sequence length.
CLIP_TEXT_MODEL = "clip-ViT-B-32"   # small, fast; swap for better model if needed
clip_model = SentenceTransformer(CLIP_TEXT_MODEL)

# One string per chunk, in the same order as `chunks`, so row i of the
# embedding matrix computed later corresponds to chunks[i].
texts = list(map(lambda chunk: chunk["text"], chunks))

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
# Cell 4 - compute text embeddings (CLIP text embeddings)
batch_size = 32

# Encode every chunk text in batches of `batch_size`, showing a progress bar
# and asking for a NumPy array as the result.
embs = clip_model.encode(
    texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print("Embeddings shape:", embs.shape)

# Unit-length vectors make the inner product equal to cosine similarity,
# which is why the embeddings are L2-normalized before indexing.
def normalize_rows(x):
    """L2-normalize vectors along the last axis.

    Args:
        x: Array of vectors. A 2-D ``(n, d)`` matrix is normalized row by
            row; a single 1-D vector or a higher-rank batch is also accepted
            because the norm is taken over the last axis.

    Returns:
        An array of the same shape as ``x`` in which each vector has been
        divided by its L2 norm plus 1e-12. The small constant keeps the
        division defined for all-zero vectors, which come back as zeros
        instead of NaN.
    """
    # axis=-1 equals axis=1 for 2-D input and also covers 1-D vectors.
    norms = np.linalg.norm(x, axis=-1, keepdims=True) + 1e-12
    return x / norms

# Cast to float32 and L2-normalize each embedding row.
embs_norm = normalize_rows(embs.astype("float32"))

# Persist the normalized matrix, plus a JSON sidecar holding the id, title
# and word count of every chunk in the same order as the embedding rows.
np.save(OUT_DIR / "clip_text_embeddings.npy", embs_norm)

meta_keys = ("id", "title", "word_count")
chunk_meta = [{key: c[key] for key in meta_keys} for c in chunks]
with open(OUT_DIR / "clip_text_chunks_meta.json", "w", encoding="utf8") as f:
    json.dump(chunk_meta, f, ensure_ascii=False, indent=2)

Batches: 100%|██████████| 3/3 [00:01<00:00,  2.19it/s]

Embeddings shape: (69, 512)





In [4]:
# Cell 5 - build FAISS index (IndexFlatIP for cosine with normalized vectors)
import faiss

# Destination of the serialized index; named once and reused below.
index_path = OUT_DIR / "clip_faiss_index.bin"

d = embs_norm.shape[1]         # embedding dimensionality
index = faiss.IndexFlatIP(d)   # inner product on normalized vectors = cosine similarity
index.add(embs_norm.astype("float32"))

# write_index is given the path as a plain string.
faiss.write_index(index, str(index_path))
print("Saved CLIP FAISS index with", index.ntotal, "vectors at", index_path)

Saved CLIP FAISS index with 69 vectors at /Users/jaydobariya/Desktop/RAG Project/embeddings/clip_faiss_index.bin


In [5]:
# Cell 6 - sanity test: nearest neighbors for a caption sample
sample_caption = "white marble mausoleum in Agra"
TOP_K = 5   # number of neighbours to request from the index

# Embed the query and normalize it with the same helper used for the indexed
# vectors, so both sides of the inner product are treated identically.
q = clip_model.encode([sample_caption], convert_to_numpy=True)
q = normalize_rows(q)

D, I = index.search(q.astype("float32"), TOP_K)
print("Top indices:", I[0])
for idx in I[0]:
    # FAISS pads the result with label -1 when the index holds fewer than
    # TOP_K vectors; chunks[-1] would silently show the last chunk instead.
    if idx < 0:
        continue
    print(idx, chunks[idx]["id"], chunks[idx]["title"], chunks[idx]["text"][:200].replace("\n"," "))

Top indices: [62 35 33 53 48]
62 Taj Mahal__27 Taj Mahal According to Ebba Koch, art historian and international expert in the understanding and interpretation of Mughal architecture and the Taj Mahal, the planning of the entire compound symbolises earthly 
35 Taj Mahal__0 Taj Mahal The Taj Mahal ( TAHJ mə-HAHL, TAHZH -⁠; Hindustani: [taːdʒ ˈmɛɦ(ɛ)l]; lit. 'Crown of the Palace') is an ivory-white marble mausoleum on the right bank of the river Yamuna in Agra, Uttar Pradesh, India
33 Red Fort__19 Red Fort West of the hammam is the Moti Masjid, the Pearl Mosque. A later addition to the Red Fort, the mosque was built in 1659 as a private place of worship for Emperor Aurangzeb. This small, three-domed str
53 Taj Mahal__18 Taj Mahal The Taj Mahal complex is enclosed by crenellated red sandstone walls on three sides, with the side facing the Yamuna river left open. Outside the complex walls, there are other mausoleums dedicated to
48 Taj Mahal__13 Taj Mahal Situated within the screen in the 