In [None]:
#Run this to install scikit-learn and LLM in Colab.
!pip install scikit-learn
!pip install -q transformers accelerate bitsandbytes torch

In [None]:
import re

import joblib
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM

# Read the precomputed table of embedded segments from disk.
# NOTE(review): joblib.load unpickles the file, so only open files you trust.
df = joblib.load("embeddings.joblib")
# Make sure every stored embedding is a NumPy array, whatever sequence type
# it was saved as.
df["embedding"] = df["embedding"].apply(lambda stored_vector: np.array(stored_vector))

# Sentence-embedding model used by create_embedding() to vectorise text.
embedding_model = SentenceTransformer("BAAI/bge-m3")

def create_embedding(text_list):
    """Embed a batch of texts with the module-level ``embedding_model``.

    Args:
        text_list: The texts to encode; callers in this notebook pass a list
            of strings.

    Returns:
        Whatever ``embedding_model.encode`` produces for ``text_list`` when
        called with ``normalize_embeddings=True`` (one entry per input text;
        presumably a 2-D NumPy array -- confirm against the library docs).
    """
    normalized_vectors = embedding_model.encode(
        text_list,
        normalize_embeddings=True,
    )
    return normalized_vectors

# The chat LLM is instantiated a single time at module level so that every
# inference() call reuses the same tokenizer and weights.
LLM_CHECKPOINT = "Qwen/Qwen2-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(LLM_CHECKPOINT)
# device_map="auto" delegates weight placement (GPU/CPU) to the library.
model = AutoModelForCausalLM.from_pretrained(LLM_CHECKPOINT, device_map="auto")

def inference(prompt, max_new_tokens=256, temperature=0.7, do_sample=True):
    """Generate a reply to ``prompt`` with the module-level ``model``.

    The prompt is wrapped as a single user turn, rendered through the
    tokenizer's chat template and passed to ``model.generate``. Only the
    tokens generated *after* the prompt are decoded, so the returned text is
    the reply itself and is never truncated when the reply happens to contain
    the word "assistant".

    Args:
        prompt: Text of the user message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; only forwarded when ``do_sample``
            is true.
        do_sample: Whether to sample (True) or decode greedily (False).

    Returns:
        str: The decoded reply with special tokens removed and surrounding
        whitespace stripped.
    """
    messages = [{"role": "user", "content": prompt}]
    # return_dict=True yields input_ids together with the attention mask, so
    # generate() does not have to guess which positions are padding.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        # temperature is only meaningful when sampling.
        generation_kwargs["temperature"] = temperature

    output_ids = model.generate(**encoded, **generation_kwargs)

    # The returned sequence starts with the prompt tokens; drop them so that
    # only the newly generated reply is decoded.
    prompt_length = encoded["input_ids"].shape[-1]
    reply_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()


In [None]:
# Retrieval settings.
TOP_K = 5                  # number of most-similar segments handed to the LLM
RELEVANCE_THRESHOLD = 0.3  # minimum best cosine similarity to attempt an answer
NOT_FOUND_MESSAGE = (
    "I couldn't find any relevant information about this topic in the course "
    "materials. This question appears to be outside the scope of this course."
)


def _print_answer(question, answer):
    """Print the question and its answer between '=' banner lines."""
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Question: {question}")
    print(f"{banner}\n")
    print(f"Answer:\n{answer}")
    print(f"\n{banner}\n")


def _is_answerable(verdict):
    """Interpret the LLM's YES/NO relevance verdict.

    The first standalone YES / NO / NOT word decides. Matching on whole words
    keeps replies such as "YES, I KNOW ..." or "NOTE: ..." from being read as
    a rejection merely because they contain the letters "NO". A reply with no
    verdict word at all is treated as answerable.
    """
    first_verdict = re.search(r"\b(YES|NO|NOT)\b", verdict.upper())
    return first_verdict is None or first_verdict.group(1) == "YES"


incoming_query = input("Ask a Question: ")

# Rank every stored segment by cosine similarity to the question.
question_embedding = create_embedding([incoming_query])[0]
similarities = cosine_similarity(
    np.vstack(df["embedding"].to_numpy()),
    [question_embedding]
).flatten()
max_indx = similarities.argsort()[::-1][:TOP_K]
max_similarity = similarities[max_indx[0]]

if max_similarity < RELEVANCE_THRESHOLD:
    # Nothing in the table is close enough to the question; skip the LLM.
    response = NOT_FOUND_MESSAGE
else:
    # argsort yields row *positions*, so select with iloc; loc would treat
    # them as index labels and pick the wrong rows (or raise KeyError) when
    # df does not have a default 0..n-1 index.
    new_df = df.iloc[max_indx]

    context_text = "".join(
        f"\n[Video #{row['number']}: {row['title']}]\n"
        f"Timestamp: {row['start']}s - {row['end']}s\n"
        f"Content: {row['text']}\n"
        for _, row in new_df.iterrows()
    )

    # First ask the LLM whether the retrieved segments can answer the question.
    relevance_check_prompt = f"""Question: {incoming_query}

Course content:
{context_text}

Is this question answerable using the course content above? Answer only "YES" or "NO".

Answer:"""

    relevance_response = inference(relevance_check_prompt).strip().upper()

    if not _is_answerable(relevance_response):
        response = NOT_FOUND_MESSAGE
    else:
        prompt = f"""Question: {incoming_query}

Here are relevant video segments from the course:
{context_text}

Answer the question by listing the videos where this topic is covered. Use this exact format for each video:

- Video #[number] ([title]) at [start]s-[end]s: [brief description]

Answer:"""

        response = inference(prompt)

_print_answer(incoming_query, response)
