# Embeddings Generation

This notebook:
- Loads chunked lecture JSON files
- Converts text chunks into dense vector embeddings
- Uses a state-of-the-art embedding model (BAAI/bge-m3)
- Stores embeddings with metadata in a reusable CSV file

NOTE:
- Run this notebook ONLY when chunks change
- Do NOT recompute embeddings on every query


### Imports & Paths

In [None]:
import os
import json
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer


In [None]:
# Project root; every other path in this notebook is derived from it.
# NOTE(review): looks like a Colab-mounted Google Drive location — confirm
# before running elsewhere.
BASE_DIR = "/content/drive/MyDrive/RAG_BAS_PROJECT"

# JSON_DIR holds the chunk JSON files that are read; EMBEDDING_CSV is the
# CSV file that the embeddings are written to.
JSON_DIR, EMBEDDING_CSV = (
    os.path.join(BASE_DIR, leaf) for leaf in ("jsons", "embeddings.csv")
)

print("Paths configured")


#### Load Embedding Model

In [None]:
# Select the compute device here so this cell works on a fresh kernel
# (Restart & Run All): use the GPU when PyTorch can see one, else the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BAAI/bge-m3 embedding model onto the selected device.
model = SentenceTransformer(
    "BAAI/bge-m3",
    device=device
)

print("Embedding model loaded")


## Embedding Function

- Encodes texts in batches for efficiency
- L2-normalizes the embeddings, so cosine similarity reduces to a dot product


In [None]:
def create_embeddings(texts, batch_size=8):
    """Encode texts into normalized embeddings using the notebook-level model.

    Args:
        texts: The texts to embed; forwarded unchanged to ``model.encode``.
        batch_size: Batch size handed to ``model.encode`` (default 8).

    Returns:
        Whatever ``model.encode`` returns for ``texts`` when called with
        ``normalize_embeddings=True`` and the progress bar disabled.
    """
    encode_options = {
        "batch_size": batch_size,
        "normalize_embeddings": True,
        "show_progress_bar": False,
    }
    return model.encode(texts, **encode_options)


#### Load JSON Files


In [None]:
# Gather the names of all ".json" entries in JSON_DIR, sorted so that the
# processing order is the same on every run.
json_files = sorted(
    entry for entry in os.listdir(JSON_DIR) if entry.endswith(".json")
)

print(f"Found {len(json_files)} JSON files")


#### Generate Embeddings

In [None]:
# One dict per chunk, accumulated across every JSON file.
records = []
# Running identifier; increments once per chunk so it is unique across files.
chunk_id = 0

for json_file in json_files:
    json_path = os.path.join(JSON_DIR, json_file)

    # Decode as UTF-8 explicitly: without `encoding`, open() falls back to the
    # platform locale, which can mis-decode non-ASCII text on some systems.
    with open(json_path, "r", encoding="utf-8") as f:
        content = json.load(f)

    print(f"Creating embeddings for: {json_file}")

    chunks = content["chunks"]
    if not chunks:
        # Nothing to embed in this file, so skip the model call entirely.
        continue

    texts = [chunk["Text"] for chunk in chunks]
    embeddings = create_embeddings(texts)

    # Pair each chunk with its vector. This relies on create_embeddings
    # returning one embedding per input text, in input order.
    for chunk, embedding in zip(chunks, embeddings):
        records.append({
            "chunk_id": chunk_id,
            "Number": chunk["Number"],
            "Title": chunk["Title"],
            "Start": chunk["Start"],
            "End": chunk["End"],
            "Text": chunk["Text"],
            # Stored as a plain Python list rather than the encoder's own type.
            "embedding": embedding.tolist()
        })
        chunk_id += 1

    # Ask PyTorch to release its cached GPU memory before the next file.
    torch.cuda.empty_cache()


#### Create DataFrame

In [None]:
# Build one table from the per-chunk records: one row per embedded chunk.
df = pd.DataFrame.from_records(records)

# Show the first rows as a quick sanity check.
preview = df.head()
print(preview)

print(f"Total chunks embedded: {len(df)}")


#### Save Embeddings to CSV

In [None]:
# Convert embedding list → JSON string for CSV storage.
# The conversion goes into a separate export frame instead of overwriting
# df["embedding"]: an in-place overwrite makes this cell non-idempotent,
# because re-running it would JSON-encode the already-encoded strings.
export_df = df.assign(embedding=df["embedding"].apply(json.dumps))

export_df.to_csv(EMBEDDING_CSV, index=False)

print(f"Embeddings saved to: {EMBEDDING_CSV}")


In [None]:
# Final cell: prints a completion message once execution reaches this point.
print("04_embeddings.ipynb completed successfully.")
