In [1]:
# ============================================
# Part A. RAG without API Key (Required)
# ============================================
!pip -q install faiss-cpu chromadb sentence-transformers rank_bm25 pypdf

from sentence_transformers import SentenceTransformer
from chromadb import Client
from rank_bm25 import BM25Okapi
import numpy as np, re

# 1. Example text (you can replace with your course notes or a PDF)
# The line breaks below are not significant: chunk_text collapses every run of
# whitespace to a single space before splitting the text into chunks.
text = """
The University of Exeter offers MSc programs in AI and Data Science.
Students learn about reinforcement learning, trustworthy AI, and RAG.
Assignments involve building models and evaluating robustness.
"""

# 2. Simple chunking function
def chunk_text(t, size=200, overlap=50):
    """Split text into overlapping, fixed-size character chunks.

    The text is stripped and every run of whitespace (including newlines) is
    collapsed to a single space. A window of ``size`` characters is then slid
    across the result in steps of ``size - overlap``.

    Args:
        t: Text to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < size``.

    Returns:
        List of chunk strings in document order. Empty when ``t`` contains no
        non-whitespace characters.

    Raises:
        ValueError: If ``size`` is not positive, or ``overlap`` is negative or
            not smaller than ``size``.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    if not 0 <= overlap < size:
        # overlap >= size gives a non-positive step (the window never
        # advances); a negative overlap would silently skip text.
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    t = re.sub(r'\s+', ' ', t.strip())
    step = size - overlap
    chunks = []
    for start in range(0, len(t), step):
        chunks.append(t[start:start + size])
        if start + size >= len(t):
            # This chunk already reaches the end of the text; any further
            # window would only repeat a suffix of it.
            break
    return chunks

docs = chunk_text(text, size=200, overlap=50)

# 3. Embedding & vector index
emb = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
X = emb.encode(docs, normalize_embeddings=True)

client = Client()
# get_or_create_collection keeps this cell re-runnable: create_collection can
# fail when a collection named "notes" already exists in the session.
collection = client.get_or_create_collection("notes")
# Drop whatever an earlier run stored so the ids always match the current docs.
stale_ids = collection.get()["ids"]
if stale_ids:
    collection.delete(ids=stale_ids)
# Ids are the chunk positions in `docs` (as strings); retrieve() converts them
# back to ints to look the chunk text up again.
collection.add(
    ids=[str(i) for i in range(len(docs))],
    documents=docs,
    embeddings=X.tolist(),
)

# 4. BM25 index
# Whitespace tokenization; retrieve() tokenizes queries the same way.
bm25 = BM25Okapi([d.split() for d in docs])

# 5. Retrieval function
def retrieve(query, top_k=3, use_bm25=False):
    """Return the chunks of `docs` that best match `query`.

    Args:
        query: Free-text question.
        top_k: Maximum number of chunks to return. Values larger than the
            number of indexed chunks are clamped; values <= 0 yield no hits.
        use_bm25: If True, rank with the BM25 index; otherwise embed the
            query and search the Chroma collection.

    Returns:
        List of ``(chunk_index, chunk_text)`` tuples, best match first.
    """
    # Never request more results than there are chunks: the demo corpus has
    # fewer chunks than the default top_k.
    k = min(top_k, len(docs))
    if k <= 0:
        return []

    if use_bm25:
        # Same whitespace tokenization as was used to build the BM25 index.
        scores = bm25.get_scores(query.split())
        ranked = np.argsort(scores)[::-1][:k]
        return [(int(i), docs[int(i)]) for i in ranked]

    q = emb.encode([query], normalize_embeddings=True)[0]
    res = collection.query(query_embeddings=[q.tolist()], n_results=k)
    # Chroma ids are the stringified positions of the chunks in `docs`.
    ids = [int(doc_id) for doc_id in res["ids"][0]]
    return [(i, docs[i]) for i in ids]

# 6. Draft generator (template answer, no LLM needed)
def answer_draft(query):
    """Assemble a template answer for `query` without calling any LLM.

    The top three vector-search hits are listed as evidence, each prefixed
    with its chunk index in square brackets, followed by a fixed closing line.

    Args:
        query: The question to answer.

    Returns:
        A multi-line string containing the question, the evidence chunks and
        the placeholder answer sentence.
    """
    evidence_lines = []
    for chunk_id, chunk in retrieve(query, top_k=3, use_bm25=False):
        evidence_lines.append(f"[{chunk_id}] {chunk}")
    context = "\n\n".join(evidence_lines)
    return f"Q: {query}\nEvidence:\n{context}\n\nAnswer (draft): Based on the evidence above."

# Run the template pipeline once on a sample question and show the result.
demo_draft = answer_draft("What topics are taught in the MSc AI course?")
print(demo_draft)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m61.4/67.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.5/323.5 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m17.1 M

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Q: What topics are taught in the MSc AI course?
Evidence:
[0] The University of Exeter offers MSc programs in AI and Data Science. Students learn about reinforcement learning, trustworthy AI, and RAG. Assignments involve building models and evaluating robustness

[1]  involve building models and evaluating robustness.

Answer (draft): Based on the evidence above.


In [None]:
# ============================================
# Part B. RAG with API Key (Optional Extension)
# Requires Google Gemini API Key
# Get it at: https://aistudio.google.com
# ============================================

!pip -q install google-generativeai

import os, google.generativeai as genai

# Provide your API key through the GOOGLE_API_KEY environment variable.
# A key that is already set is left untouched; otherwise you are prompted for
# it with hidden input, so the key never has to be written into the notebook.
import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Gemini API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

def answer_llm(query):
    """Answer `query` with Gemini, grounded in the retrieved chunks.

    Args:
        query: The question to answer.

    Returns:
        The text of the model's response.
    """
    hits = retrieve(query, top_k=3)
    # Label each chunk with its index, as answer_draft does, so the model has
    # concrete identifiers to cite when the prompt asks for citations.
    context = "\n\n".join(f"[{i}] {c}" for i, c in hits)
    prompt = f"Based on the following context, answer the question.\nContext:\n{context}\n\nQuestion: {query}\nPlease include citations to the retrieved text."
    resp = genai.GenerativeModel("gemini-1.5-flash").generate_content(prompt)
    return resp.text

# Run the LLM-backed pipeline once on the same sample question.
llm_answer = answer_llm("What topics are taught in the MSc AI course?")
print(llm_answer)