```
This notebook performs semantic retrieval over lecture transcripts.
Given a user query, it finds the most relevant transcript chunks
using vector similarity search. The retrieved chunks are later
used as context for a RAG-based AI teaching assistant.
```

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
from sentence_transformers import SentenceTransformer


#### Load Embeddings CSV

In [None]:
# Path to the embeddings CSV (a /content/drive path — presumably requires
# Google Drive to be mounted first; TODO confirm where the mount happens).
CSV_PATH = "/content/drive/MyDrive/RAG_BAS_PROJECT/embeddings.csv"

# Each row of the frame is counted as one chunk in the report below.
df = pd.read_csv(CSV_PATH)

print("CSV loaded successfully")
print("Total chunks:", df.shape[0])


#### Convert Embedding Strings → NumPy Matrix

In [None]:
# Decode each embedding from its JSON string form into a Python list.
# The isinstance guard keeps this cell idempotent: the column is overwritten
# in place, so on a second run the values are already lists and calling
# json.loads on them would raise TypeError.
df["embedding"] = df["embedding"].apply(
    lambda raw: json.loads(raw) if isinstance(raw, str) else raw
)

# Stack the per-row vectors into one 2-D array, one row per DataFrame row,
# so row i of the matrix lines up with df.iloc[i].
embedding_matrix = np.vstack(df["embedding"].values)

print("Embedding matrix shape:", embedding_matrix.shape)


#### Load Same Embedding Model

In [None]:
# Choose the compute device here so this cell works on a fresh kernel:
# use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Per the section heading, this should be the same model that produced the
# stored embeddings; query and chunk vectors are only comparable if it is.
model = SentenceTransformer(
    "BAAI/bge-m3",
    device=device
)

### Encode User Query

In [None]:
def encode_query(text, batch_size=8):
    """Embed a single query string with the notebook-level ``model``.

    Args:
        text: The query string to embed.
        batch_size: Batch size forwarded to ``model.encode``. Only one text
            is encoded per call, so this is kept for interface compatibility.

    Returns:
        Whatever ``model.encode`` returns for the one-element list ``[text]``
        with ``normalize_embeddings=True`` (presumably an array of shape
        ``(1, dim)`` — TODO confirm against the model in use).
    """
    # Wrap the text in a list so the result keeps a leading batch axis,
    # which is the 2-D form retrieve_top_k passes to cosine_similarity.
    embedding = model.encode(
        [text],
        batch_size=batch_size,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

    return embedding


#### Cosine Similarity + Top-K Retrieval

In [None]:
def retrieve_top_k(query_embedding, embedding_matrix, df, k=5):
    """Return the ``k`` rows of ``df`` most similar to a query embedding.

    Args:
        query_embedding: One query vector, either 1-D ``(dim,)`` or 2-D
            ``(1, dim)``.
        embedding_matrix: 2-D array with one embedding per row; row ``i``
            must correspond to ``df.iloc[i]``.
        df: DataFrame holding the chunk metadata, positionally aligned with
            ``embedding_matrix``.
        k: Maximum number of rows to return. If ``k`` exceeds the number of
            rows, all rows are returned.

    Returns:
        A copy of the selected rows, ordered from most to least similar,
        with an added ``similarity_score`` column.

    Raises:
        ValueError: If ``k`` is negative or ``query_embedding`` holds more
            than one query vector.
    """
    # A negative k would otherwise be treated as a slice bound and silently
    # return "all but the last |k|" rows.
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # cosine_similarity needs 2-D input; promote a bare 1-D vector.
    query_2d = np.atleast_2d(query_embedding)
    if query_2d.shape[0] != 1:
        # With several queries the flattened scores would no longer map
        # one-to-one onto the rows of df.
        raise ValueError(
            "query_embedding must contain exactly one vector, "
            f"got shape {query_2d.shape}"
        )

    similarities = cosine_similarity(
        embedding_matrix,
        query_2d
    ).flatten()

    # argsort is ascending, so reverse it before taking the best k.
    top_indices = similarities.argsort()[::-1][:k]

    results = df.iloc[top_indices].copy()
    results["similarity_score"] = similarities[top_indices]

    return results


### Run Retrieval

In [None]:
# Prompt for the question, embed it, then rank the stored chunks against it.
user_question = input("Ask a question: ")
query_embedding = encode_query(user_question)
top_k = retrieve_top_k(query_embedding, embedding_matrix, df, k=3)

# Display only the chunk-identifying columns together with the score.
top_k[["Number", "Title", "Start", "End", "Text", "similarity_score"]]


#### Save top-k chunks from 05_retrieval.ipynb for 06_rag_gemini

In [None]:
# Write the retrieved rows out as a JSON array with one object per row.
TOP_K_PATH = "/content/drive/MyDrive/RAG_BAS_PROJECT/top_k.json"
top_k.to_json(
    TOP_K_PATH,
    orient="records",
    indent=2,
)
print(f"Top-K retrieval results saved to: {TOP_K_PATH}")

# Store the question next to the results so 06_rag_gemini can read it.
QUESTION_PATH = "/content/drive/MyDrive/RAG_BAS_PROJECT/user_question.json"
with open(QUESTION_PATH, "w") as question_file:
    serialized_question = json.dumps({"question": user_question})
    question_file.write(serialized_question)
print(f"User question saved to: {QUESTION_PATH}")