In [1]:
import json

# Location of the chunk JSON file on the mounted Google Drive.
file_path = "/content/drive/MyDrive/rag/flipkart_phones_chunks (1).json"

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so non-ASCII text in the chunks is read the same way everywhere.
with open(file_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

print(f"Successfully loaded {len(data)} items from {file_path}")

Successfully loaded 3096 items from /content/drive/MyDrive/rag/flipkart_phones_chunks (1).json


In [8]:
# pipeline/embed_index.py
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm

MODEL_NAME = "all-MiniLM-L6-v2"   # small & fast
EMBED_DIM = 384  # for all-MiniLM-L6-v2

# Define paths for Colab environment
DATA_DIR = "/content/data"
# exist_ok=True keeps this cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(DATA_DIR, exist_ok=True)

# Output artifacts written by build_index().
VECTORS_FILE = os.path.join(DATA_DIR, "vectors.npy")
METAS_FILE = os.path.join(DATA_DIR, "metas.json")
FAISS_INDEX_FILE = os.path.join(DATA_DIR, "faiss.index")

def load_chunks(items=None):
    """Wrap raw chunk records in the ``{"chunk": ..., "meta": {}}`` layout.

    Args:
        items: Optional iterable of chunk records. When omitted, the
            notebook-level ``data`` variable is used, which is what this
            function always did before the parameter existed.

    Returns:
        list[dict]: One dict per input item, in input order, holding the item
        unchanged under ``"chunk"`` and a fresh empty dict under ``"meta"``.

    Raises:
        NameError: If ``items`` is omitted and ``data`` has not been defined
            in the running kernel.
    """
    if items is None:
        # Backward-compatible fallback to the notebook global.
        # Presumably a list of strings - TODO confirm against the JSON file.
        items = data
    # A new {} per record so metas are never shared between chunks.
    return [{"chunk": item, "meta": {}} for item in items]

def build_index(hnsw_m=32, ef_construction=200):
    """Embed all chunks, build a FAISS HNSW index, and persist the artifacts.

    Writes three files: the FAISS index to ``FAISS_INDEX_FILE``, the float32
    embedding matrix to ``VECTORS_FILE``, and the chunk records (as returned by
    ``load_chunks()``) to ``METAS_FILE`` as UTF-8 JSON.

    Args:
        hnsw_m: Second argument passed to ``faiss.IndexHNSWFlat``.
            Defaults to the previously hard-coded 32.
        ef_construction: Value assigned to ``index.hnsw.efConstruction``.
            Defaults to the previously hard-coded 200.

    Raises:
        ValueError: If ``load_chunks()`` returns no chunks.
    """
    # Load the chunks first so a missing or empty dataset fails before the
    # embedding model is loaded.
    chunks = load_chunks()  # expects list of {"chunk": "...", "meta": {...}}
    texts = [c["chunk"] for c in chunks]
    if not texts:
        raise ValueError("No chunks to embed; load the chunk data before building the index.")

    model = SentenceTransformer(MODEL_NAME)

    print("Embedding", len(texts), "chunks...")
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    embeddings = np.array(embeddings, dtype="float32")

    # create FAISS index
    # Size the index from the embeddings themselves so it stays correct if
    # MODEL_NAME is changed without also updating EMBED_DIM.
    dim = embeddings.shape[1]
    index = faiss.IndexHNSWFlat(dim, hnsw_m)  # suitable for accuracy/speed
    index.hnsw.efConstruction = ef_construction
    index.add(embeddings)

    # save index & meta & vectors
    faiss.write_index(index, FAISS_INDEX_FILE)
    np.save(VECTORS_FILE, embeddings)
    with open(METAS_FILE, "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)

    print("Saved FAISS index, vectors, metas.")

# Run the embedding/indexing pipeline only when this code is executed as the
# main module (not when it is imported from elsewhere).
if __name__ == "__main__":
    build_index()

Embedding 3096 chunks...


Batches:   0%|          | 0/97 [00:00<?, ?it/s]

Saved FAISS index, vectors, metas.


In [7]:
# Install dependencies into the running kernel's environment (%pip targets the
# kernel, whereas !pip may hit a different interpreter). Run this cell before
# the cell that imports faiss / sentence_transformers.
%pip install -q faiss-cpu sentence-transformers tqdm

