# Environment Setup

This notebook prepares the environment for the RAG-based
Signals & Systems AI Teaching Assistant project.

It performs:
- Google Drive mounting
- System dependency installation
- Python library installation
- Project directory creation
- GPU availability check

NOTE:
Run this notebook ONCE per session.


### Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the project paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')


#### Base Project Paths

In [None]:
import os

# Root folder of the project on the mounted Drive
BASE_DIR = "/content/drive/MyDrive/RAG_BAS_PROJECT"

# One sub-folder per artifact type produced by the pipeline
VIDEO_DIR, AUDIO_DIR, JSON_DIR = (
    os.path.join(BASE_DIR, sub_folder)
    for sub_folder in ("Videos", "Audios", "jsons")
)

# exist_ok=True keeps this cell safe to re-run
for _project_dir in (VIDEO_DIR, AUDIO_DIR, JSON_DIR):
    os.makedirs(_project_dir, exist_ok=True)

print("Project folders created successfully")


#### Install System Dependencies

#### Install FFmpeg for audio/video processing

In [None]:

# Refresh the apt package index (-qq: quiet), then install FFmpeg without prompting (-y).
!apt-get update -qq
!apt-get install -y ffmpeg


#####  Verify FFmpeg installation

In [None]:

# Print the FFmpeg version banner to confirm the binary can be invoked.
!ffmpeg -version


### Install Python Dependencies

In [None]:
# Core utilities: yt-dlp (playlist download) and tqdm (progress bars)
!pip install -U yt-dlp tqdm

# Whisper for transcription
!pip install -U openai-whisper

# Embeddings & ML: sentence-transformers (embedding model), scikit-learn (cosine similarity)
!pip install -U sentence-transformers scikit-learn

# Gemini API client
!pip install -U google-generativeai


#### Verify yt-dlp Installation

In [None]:
# Print the installed yt-dlp version to confirm the command-line tool is available.
!yt-dlp --version


In [None]:
# Completion marker for the environment-setup section.
print("Environment setup complete. You may proceed to the next notebook.")


# 02a_auto_video_download.ipynb


# Automatic Video Download (YouTube Playlist)

This notebook:
- Downloads lecture videos from a YouTube playlist
- Automatically skips already-downloaded videos
- Supports partial or full playlist downloads
- Stores videos in a structured format for further processing

NOTE:
- Safe to re-run
- Downloads only new videos
- Does NOT perform audio conversion


#### Imports & Paths

In [8]:
import os
import yt_dlp


In [None]:
# Project root on the mounted Drive and the folder that receives the downloaded videos
BASE_DIR = "/content/drive/MyDrive/RAG_BAS_PROJECT"
VIDEO_DIR = os.path.join(BASE_DIR, "Videos")

# exist_ok=True keeps this cell safe to re-run
os.makedirs(VIDEO_DIR, exist_ok=True)

print("Video directory ready")


In [10]:
# Playlist URL
PLAYLIST_URL = "https://www.youtube.com/playlist?list=PLBlnK6fEyqRhG6s3jYIU48CqsT5cyiDTO"

# Playlist range (handed to yt-dlp as its `playlist_items` option)
# Examples:
# None       → full playlist
# "1-10"     → first 10 videos
# "16-25"    → specific range
PLAYLIST_ITEMS = "1-25"


In [11]:
def download_playlist_videos():
    """Download the configured playlist range into VIDEO_DIR using yt-dlp.

    Reads the module-level PLAYLIST_URL, PLAYLIST_ITEMS and VIDEO_DIR.
    Output files are named "<playlist_index> - <title>.<ext>" and merged
    into MP4. Existing files are not overwritten and partial downloads
    are continued.
    """
    # Output filename: the playlist index prefix keeps playlist order
    naming_opts = {
        "outtmpl": f"{VIDEO_DIR}/%(playlist_index)s - %(title)s.%(ext)s",
    }
    # Best quality video + audio, merged into a single MP4
    media_opts = {
        "format": "bv*+ba/b",
        "merge_output_format": "mp4",
    }
    # Which playlist entries to fetch
    selection_opts = {"playlist_items": PLAYLIST_ITEMS}
    # Do not overwrite existing files; resume partial downloads
    resume_opts = {
        "overwrites": False,
        "continuedl": True,
    }
    # Retries and pauses between downloads
    stability_opts = {
        "retries": 10,
        "fragment_retries": 10,
        "sleep_interval": 1,
        "max_sleep_interval": 3,
    }
    # Log verbosity: progress and warnings stay visible
    log_opts = {
        "quiet": False,
        "no_warnings": False,
    }

    ydl_opts = {
        **naming_opts,
        **media_opts,
        **selection_opts,
        **resume_opts,
        **stability_opts,
        **log_opts,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as downloader:
        downloader.download([PLAYLIST_URL])

    print("Video download completed successfully")


In [None]:
# Start the playlist download using the configuration defined above.
download_playlist_videos()


In [None]:
# Completion marker for the video-download section.
print("02a_auto_video_download.ipynb completed.")


# 02_video_to_audio.ipynb

# Video to Audio (Data Acquisition)

This notebook:
- Uses the lecture videos already downloaded from the YouTube playlist (see 02a)
- Stores videos in MP4 format
- Converts MP4 videos to MP3 audio using FFmpeg

NOTE:
- This notebook should be run ONLY ONCE.
- Re-run ONLY if the playlist changes.


#### Imports & Paths

In [14]:
import os
import subprocess
from yt_dlp import YoutubeDL


# Base project paths

In [None]:

# Project root plus the input (videos) and output (audio) folders of this section
BASE_DIR = "/content/drive/MyDrive/RAG_BAS_PROJECT"
VIDEO_DIR = os.path.join(BASE_DIR, "Videos")
AUDIO_DIR = os.path.join(BASE_DIR, "Audios")

# exist_ok=True keeps this cell safe to re-run
for _media_dir in (VIDEO_DIR, AUDIO_DIR):
    os.makedirs(_media_dir, exist_ok=True)

print("Directories ready")


#### Convert MP4 → MP3 (Clean & Safe)

In [None]:
# Every .mp4 in VIDEO_DIR, in sorted filename order
video_files = sorted(
    name for name in os.listdir(VIDEO_DIR) if name.endswith(".mp4")
)

print(f"Found {len(video_files)} video files for conversion")


In [None]:
# Convert every MP4 in `video_files` to "<number>_<title>.mp3" inside AUDIO_DIR.
# Videos whose MP3 already exists are skipped, so the cell is safe to re-run.
failed_conversions = []

for video in video_files:
    input_path = os.path.join(VIDEO_DIR, video)

    # Expected name: "<playlist index> - <title>.mp4". splitext removes only the
    # final extension; partition avoids an IndexError on unexpected names.
    stem = os.path.splitext(video)[0]
    lecture_number, separator, lecture_title = stem.partition(" - ")
    if not separator:
        print(f"Skipping (unexpected filename): {video}")
        continue

    output_audio = f"{lecture_number}_{lecture_title}.mp3"
    output_path = os.path.join(AUDIO_DIR, output_audio)

    # Skip if MP3 already exists (IMPORTANT)
    if os.path.exists(output_path):
        print(f"Skipping (already exists): {output_audio}")
        continue

    print(f"Converting: {video}")

    # Write to a temporary name first: a failed or interrupted run must not
    # leave a truncated file at `output_path`, because the existence check
    # above would then skip it forever.
    partial_path = output_path + ".part"

    completed = subprocess.run(
        [
            "ffmpeg",
            "-nostdin",
            "-y",
            "-i", input_path,
            "-vn",          # audio only: explicitly drop the video stream
            "-f", "mp3",    # the ".part" suffix hides the format, so state it
            partial_path
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT
    )

    if completed.returncode != 0:
        # Remove whatever ffmpeg managed to write and remember the failure
        if os.path.exists(partial_path):
            os.remove(partial_path)
        failed_conversions.append(video)
        print(f"FAILED (ffmpeg exit code {completed.returncode}): {video}")
        continue

    # Publish the finished file under its final name
    os.replace(partial_path, output_path)

if failed_conversions:
    print(f"Conversion finished with {len(failed_conversions)} failure(s): {failed_conversions}")
else:
    print("All videos converted to MP3 successfully!")


In [None]:
# Completion marker for the video-to-audio section.
print("02_video_to_audio.ipynb completed successfully.")


# 03_audio_to_json.ipynb

# Audio to JSON (Transcription & Chunking)

This notebook converts lecture audio files into structured JSON documents.

Pipeline:
1. Load MP3 lecture audio files
2. Transcribe audio using OpenAI Whisper (large-v2)
3. Merge Whisper segments into RAG-optimized chunks
4. Store timestamps, lecture metadata, and text
5. Save output as reusable JSON files

Chunking Strategy:
- Time-based chunking (15–25 seconds)
- Small overlap to preserve context
- Designed for embedding and retrieval performance


### Imports & Paths

In [19]:
import os
import json
import whisper
from tqdm import tqdm


In [None]:
# Project root, the MP3 input folder and the JSON output folder of this section
BASE_DIR = "/content/drive/MyDrive/RAG_BAS_PROJECT"
AUDIO_DIR = os.path.join(BASE_DIR, "Audios")
JSON_DIR = os.path.join(BASE_DIR, "jsons")

# exist_ok=True keeps this cell safe to re-run
os.makedirs(JSON_DIR, exist_ok=True)

print("Directories ready")


### Load Whisper Model

In [None]:
# Load the Whisper "large-v2" model once; `model` is reused for every audio file below.
model = whisper.load_model("large-v2")
print("Whisper large-v2 model loaded")


## Chunking Parameters

- MAX_DURATION: Target chunk length (seconds)
- OVERLAP: Context overlap between chunks


In [22]:
# A chunk is closed once it spans at least this long (used by create_chunks)
MAX_DURATION = 20   # seconds
# Context overlap between consecutive chunks (used by create_chunks)
OVERLAP = 3         # seconds


### Chunk Creation Function

In [23]:
def create_chunks(segments, lecture_number, lecture_title,
                  max_duration=None, overlap=None):
    """Merge transcription segments into overlapping, time-bounded chunks.

    Segments are accumulated in order; a chunk is emitted as soon as it
    spans at least `max_duration` seconds. The segments that reach into
    the last `overlap` seconds of an emitted chunk are repeated at the
    start of the next chunk, so each chunk's "Text" always matches its
    "Start"/"End" range.

    Args:
        segments: Sequence of dicts with "start", "end" (seconds) and
            "text" keys, in chronological order.
        lecture_number: Stored under "Number" in every chunk.
        lecture_title: Stored under "Title" in every chunk.
        max_duration: Minimum span in seconds that closes a chunk.
            Defaults to the module-level MAX_DURATION.
        overlap: Seconds of trailing context repeated in the next chunk.
            Defaults to the module-level OVERLAP.

    Returns:
        A list of dicts with keys "Number", "Title", "Start", "End"
        (both rounded to 2 decimals) and "Text". Empty input gives [].
    """
    if max_duration is None:
        max_duration = MAX_DURATION
    if overlap is None:
        overlap = OVERLAP

    def _as_chunk(parts):
        # Timestamps are derived from the same segments as the text
        return {
            "Number": lecture_number,
            "Title": lecture_title,
            "Start": round(parts[0]["start"], 2),
            "End": round(parts[-1]["end"], 2),
            "Text": " ".join(part["text"].strip() for part in parts)
        }

    chunks = []
    buffered = []      # segments of the chunk being built (carried-over + new)
    new_segments = 0   # buffered segments not yet emitted in any chunk

    for seg in segments:
        buffered.append(seg)
        new_segments += 1

        if seg["end"] - buffered[0]["start"] >= max_duration:
            chunks.append(_as_chunk(buffered))

            # Overlap handling: repeat the segments that reach into the last
            # `overlap` seconds of the chunk that was just emitted.
            cutoff = seg["end"] - overlap
            carried = [part for part in buffered if part["end"] > cutoff]
            # Never repeat an entire chunk (e.g. one very long segment)
            if len(carried) == len(buffered):
                carried = []

            buffered = carried
            new_segments = 0

    # Handle leftover text; carried-over context alone is not a new chunk
    if new_segments:
        chunks.append(_as_chunk(buffered))

    return chunks


#### List Audio Files

In [None]:
# Every .mp3 in AUDIO_DIR, in sorted filename order
audio_files = sorted(
    name for name in os.listdir(AUDIO_DIR) if name.endswith(".mp3")
)

print(f"Found {len(audio_files)} audio files")


#### Transcribe & Save JSON

In [None]:
# Transcribe every MP3 with Whisper, chunk the segments and save one JSON per lecture.
# Lectures whose JSON already exists are skipped, so the cell is safe to re-run.
for audio in tqdm(audio_files):
    # Expected name: "<lecture number>_<title>.mp3". splitext removes only the
    # final extension; partition avoids an IndexError on unexpected names.
    stem = os.path.splitext(audio)[0]
    lecture_number, separator, lecture_title = stem.partition("_")
    if not separator:
        print(f"Skipping (unexpected filename): {audio}")
        continue

    audio_path = os.path.join(AUDIO_DIR, audio)

    output_json_path = os.path.join(
        JSON_DIR,
        f"{lecture_number}_{lecture_title}.json"
    )

    # Skip if already processed
    if os.path.exists(output_json_path):
        print(f"Skipping (already exists): {lecture_number}_{lecture_title}")
        continue

    print(f"\nTranscribing: {audio}")

    result = model.transcribe(
        audio=audio_path,
        language="en",
        task="transcribe",
        word_timestamps=False
    )

    chunks = create_chunks(
        segments=result["segments"],
        lecture_number=lecture_number,
        lecture_title=lecture_title
    )

    output = {
        "lecture_number": lecture_number,
        "lecture_title": lecture_title,
        "full_text": result["text"],
        "chunks": chunks
    }

    # Write to a temporary name and rename afterwards: an interrupted session
    # must not leave a truncated JSON that the existence check would skip.
    partial_json_path = output_json_path + ".part"
    with open(partial_json_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    os.replace(partial_json_path, output_json_path)

    print(f"Saved JSON: {output_json_path}")


In [None]:
# Completion marker for the audio-to-JSON section.
print("03_audio_to_json.ipynb completed successfully.")


# 04_embeddings.ipynb

# Embeddings Generation

This notebook:
- Loads chunked lecture JSON files
- Converts text chunks into dense vector embeddings
- Uses a state-of-the-art embedding model (BAAI/bge-m3)
- Stores embeddings with metadata in a reusable CSV file

NOTE:
- Run this notebook ONLY when chunks change
- Do NOT recompute embeddings on every query


### Imports & Paths

In [27]:
import os
import json
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer


In [None]:
# Input folder (chunked lecture JSON files) and output file (embeddings CSV) of this section
BASE_DIR = "/content/drive/MyDrive/RAG_BAS_PROJECT"
JSON_DIR = os.path.join(BASE_DIR, "jsons")
EMBEDDING_CSV = os.path.join(BASE_DIR, "embeddings.csv")

print("Paths configured")


#### Load Embedding Model

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the BAAI/bge-m3 model on the selected device; note this rebinds the name `model`
model = SentenceTransformer(
    "BAAI/bge-m3",
    device=device
)

print("Embedding model loaded")

## Embedding Function

- Batch encoding for efficiency
- Normalized embeddings for cosine similarity


In [32]:
def create_embeddings(texts, batch_size=8):
    """Encode a list of texts with the module-level embedding `model`.

    Args:
        texts: The strings to embed.
        batch_size: Number of texts encoded per batch.

    Returns:
        The result of `model.encode`, requested with normalized embeddings
        and without a progress bar.
    """
    encode_options = {
        "batch_size": batch_size,
        "normalize_embeddings": True,
        "show_progress_bar": False,
    }
    return model.encode(texts, **encode_options)


#### Load JSON Files


In [None]:
# Every .json in JSON_DIR, in sorted filename order
json_files = sorted(
    name for name in os.listdir(JSON_DIR) if name.endswith(".json")
)

print(f"Found {len(json_files)} JSON files")


#### Generate Embeddings

In [None]:
# Flat list of per-chunk rows; chunk_id numbers the chunks across all lectures
records = []
chunk_id = 0

# Metadata copied unchanged from each chunk into its row
_CHUNK_FIELDS = ("Number", "Title", "Start", "End", "Text")

for json_file in json_files:
    json_path = os.path.join(JSON_DIR, json_file)

    with open(json_path, "r") as f:
        content = json.load(f)

    print(f"Creating embeddings for: {json_file}")

    # Embed all chunk texts of one lecture in a single call
    texts = [chunk["Text"] for chunk in content["chunks"]]
    embeddings = create_embeddings(texts)

    # Pair every chunk with its embedding row (same order as `texts`)
    for chunk, vector in zip(content["chunks"], embeddings):
        row = {"chunk_id": chunk_id}
        for field in _CHUNK_FIELDS:
            row[field] = chunk[field]
        row["embedding"] = vector.tolist()
        records.append(row)
        chunk_id += 1

    # Release cached GPU memory between lectures
    torch.cuda.empty_cache()


#### Create DataFrame

In [None]:
# One row per chunk: chunk_id, lecture metadata, text and the embedding list
df = pd.DataFrame.from_records(records)
print(df.head())  # preview of the first rows
print(f"Total chunks embedded: {len(df)}")


#### Save Embeddings to CSV

In [None]:
# Convert embedding list → JSON string for CSV storage
df["embedding"] = df["embedding"].apply(json.dumps)

df.to_csv(EMBEDDING_CSV, index=False)

print(f"Embeddings saved to: {EMBEDDING_CSV}")


In [None]:
# Completion marker for the embeddings section.
print("04_embeddings.ipynb completed successfully.")


# 05_retrieval.ipynb

```
This notebook performs semantic retrieval over lecture transcripts.
Given a user query, it finds the most relevant transcript chunks
using vector similarity search. The retrieved chunks are later
used as context for a RAG-based AI teaching assistant
```

In [38]:
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
from sentence_transformers import SentenceTransformer


#### Load Embeddings CSV

In [None]:
# Location of the embeddings CSV read by this section
CSV_PATH = "/content/drive/MyDrive/RAG_BAS_PROJECT/embeddings.csv"

# Note: this rebinds `df` to the frame loaded from disk
df = pd.read_csv(CSV_PATH)

print("CSV loaded successfully")
print("Total chunks:", len(df))


#### Convert Embedding Strings → NumPy Matrix

In [None]:
# Parse each JSON-encoded embedding back into a list of numbers.
# Values that are already parsed are left untouched, so re-running this cell
# does not fail with a TypeError from json.loads.
df["embedding"] = df["embedding"].apply(
    lambda value: json.loads(value) if isinstance(value, str) else value
)

# Stack the per-chunk lists into one 2-D matrix (one row per chunk)
embedding_matrix = np.vstack(df["embedding"].values)

print("Embedding matrix shape:", embedding_matrix.shape)


#### Load Same Embedding Model

In [41]:
# model = SentenceTransformer(
#     "BAAI/bge-m3",
#     device=device
# )

# Not reloaded here: the embedding model was already loaded in 04_embeddings above and is reused.

### Encode User Query

In [45]:
def encode_query(text, batch_size=8):
    """Embed a single query string with the module-level embedding `model`.

    The text is wrapped in a one-element list before encoding, and the
    embedding is requested normalized and without a progress bar.

    Args:
        text: The user query.
        batch_size: Batch size forwarded to `model.encode`.

    Returns:
        The result of `model.encode` for the one-element list `[text]`.
    """
    return model.encode(
        [text],
        batch_size=batch_size,
        normalize_embeddings=True,
        show_progress_bar=False,
    )



#### Cosine Similarity + Top-K Retrieval

In [46]:
def retrieve_top_k(query_embedding, embedding_matrix, df, k=5):
    """Return the k rows of `df` most similar to the query embedding.

    Args:
        query_embedding: Query embedding passed to `cosine_similarity`
            as its second argument.
        embedding_matrix: Matrix with one embedding per row of `df`, in
            the same row order.
        df: DataFrame holding the chunk metadata.
        k: Number of rows to return.

    Returns:
        A new DataFrame with the selected rows, best match first, plus a
        "similarity_score" column.
    """
    scores = cosine_similarity(embedding_matrix, query_embedding).flatten()

    # argsort is ascending; reverse it so the best match comes first
    best_positions = scores.argsort()[::-1][:k]

    # assign() returns a new frame, so the caller's `df` is left untouched
    return df.iloc[best_positions].assign(
        similarity_score=scores[best_positions]
    )


### Run Retrieval

In [None]:
# Read the question interactively
user_question = input("Ask a question: ")

# Embed the question (encode_query uses the module-level `model`)
query_embedding = encode_query(user_question)

# Keep the 3 most similar chunks
top_k = retrieve_top_k(
    query_embedding,
    embedding_matrix,
    df,
    k=3
)

# Last expression of the cell: displays only the listed columns
top_k[[
    "Number",
    "Title",
    "Start",
    "End",
    "Text",
    "similarity_score"
]]


#### Save Top-K Chunks and User Question (for 06_rag_gemini)

In [None]:
# Persist the retrieved chunks as a JSON array of records
TOP_K_PATH = "/content/drive/MyDrive/RAG_BAS_PROJECT/top_k.json"
top_k.to_json(TOP_K_PATH, orient="records", indent=2)
print(f"Top-K retrieval results saved to: {TOP_K_PATH}")

# Save the user question for 06_rag_gemini
QUESTION_PATH = "/content/drive/MyDrive/RAG_BAS_PROJECT/user_question.json"
question_payload = {"question": user_question}
with open(QUESTION_PATH, "w") as f:
    json.dump(question_payload, f)
print(f"User question saved to: {QUESTION_PATH}")

# 06_rag_gemini.ipynb

# RAG-based AI Teaching Assistant (Gemini)

#### Install & Import Libraries

In [49]:
# Install/upgrade the Gemini client library quietly (-q).
!pip install -q -U google-generativeai


In [51]:
import google.generativeai as genai
import pandas as pd


#### Configure Gemini API Key

In [53]:
# The key is read through google.colab.userdata so it is not hard-coded in the notebook
from google.colab import userdata

# Fetch the value stored under GOOGLE_API_KEY and hand it to the Gemini client
api_key = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

print("Gemini API configured successfully")


Gemini API configured successfully


In [54]:
def build_context(top_k_df):
    """Render the retrieved chunks as one text block for the prompt.

    Args:
        top_k_df: DataFrame with "Title", "Number", "Start", "End" and
            "Text" columns, one row per retrieved chunk.

    Returns:
        A string with one labelled section per row, each closed by a
        dashed separator line; an empty string when there are no rows.
    """
    sections = []

    for _, row in top_k_df.iterrows():
        sections.append(
            "\n"
            f"Lecture Title: {row['Title']}\n"
            f"Lecture Number: {row['Number']}\n"
            f"Start Time: {row['Start']} seconds\n"
            f"End Time: {row['End']} seconds\n"
            "Lecture Content:\n"
            f"{row['Text']}\n"
            "----------------------------------\n"
        )

    return "".join(sections)


In [55]:
def build_prompt(context: str, user_question: str) -> str:
    """Fill the teaching-assistant prompt template.

    The template instructs the model to answer only from the supplied
    lecture content, to cite lecture title, number and start/end time,
    and to follow a fixed answer format.

    Args:
        context: Lecture content inserted under "Lecture Content:".
        user_question: Question inserted under "User Question:".

    Returns:
        The complete prompt string.
    """
    prompt = f"""
You are an AI teaching assistant for a Signals and Systems course.

IMPORTANT RULES:
- Use ONLY the lecture content below.
- Do NOT use outside knowledge.
- You may combine multiple lecture chunks.
- If the topic is NOT covered, say:
  "This question is not covered in the provided lectures."

You MUST:
- Explain in simple student-friendly language
- Clearly mention:
  - Lecture Title
  - Lecture Number
  - Start Time
  - End Time

Lecture Content:
{context}

User Question:
{user_question}

Answer Format (STRICT):

Answer:
<simple explanation>

Where it is taught:
- Lecture Title:
- Lecture Number:
- Start Time:
- End Time:
"""
    return prompt


In [56]:
def gemini_rag_answer(prompt):
    """Send `prompt` to the "gemini-2.5-flash" model and return the reply text.

    Args:
        prompt: The full prompt string to send.

    Returns:
        The `.text` attribute of the generated response.
    """
    gemini = genai.GenerativeModel("gemini-2.5-flash")
    return gemini.generate_content(prompt).text


In [None]:
# `json` is not imported by this section's import cell; import it here so the
# cell does not depend on an import made in an earlier section.
import json

# Load the saved user question from 05_file
QUESTION_PATH = "/content/drive/MyDrive/RAG_BAS_PROJECT/user_question.json"
with open(QUESTION_PATH, "r", encoding="utf-8") as f:
    user_question = json.load(f)["question"]

print(f"Loaded user question: {user_question}")

In [None]:
import pandas as pd

# File holding the retrieved chunks saved by the retrieval step
TOP_K_PATH = "/content/drive/MyDrive/RAG_BAS_PROJECT/top_k.json"

# Load the top-k chunks retrieved in 05_retrieval.ipynb
top_k = pd.read_json(TOP_K_PATH)

print("Top-K retrieval data loaded successfully")
print(top_k.head())  # preview of the loaded rows


In [None]:

# Build the context from the retrieved chunks, then the full prompt
context_text = build_context(top_k)
prompt = build_prompt(context_text, user_question)

# Ask Gemini for the answer
answer = gemini_rag_answer(prompt)

print("\nAI Teaching Assistant Answer:\n")
print(answer)


In [None]:
# Where the generated answer is stored
SAVE_PATH = "/content/drive/MyDrive/RAG_BAS_PROJECT/response.txt"

# Explicit UTF-8: the answer is model-generated text and may contain non-ASCII
# characters, so do not rely on the platform default encoding.
with open(SAVE_PATH, "w", encoding="utf-8") as f:
    f.write(answer)

print("Response saved successfully.")


Response saved successfully
