In [3]:
# ======================================
# Milestone 1: Build BM25 Retrieval
# ======================================
import json, re, numpy as np
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords

# Download stopwords on first run
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def tokenize(text):
    tokens = re.findall(r"\w+", text.lower())
    return [t for t in tokens if t not in stop_words]

# Load and combine the two JSON corpora
with open('top_ai_questions.json') as f:    ai_data = json.load(f)
with open('top_datascience_questions.json') as f: ds_data = json.load(f)
passages = [entry['answer'] for entry in ai_data + ds_data]

# Build BM25 index
tokenized = [tokenize(p) for p in passages]
bm25 = BM25Okapi(tokenized)

def bm25_query(q, k=10):
    q_tok = tokenize(q)
    scores = bm25.get_scores(q_tok)
    idx = np.argsort(scores)[-k:][::-1]
    return [(passages[i], float(scores[i])) for i in idx]

# Sample: top-10 for “What is BM25?”
for doc, score in bm25_query("What is BM25?", k=10):
    print(f"{score:.3f} — {doc[:100]}...")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


FileNotFoundError: [Errno 2] No such file or directory: 'top_ai_questions.json'

In [6]:
# Mount Google Drive (if you haven’t already)
from google.colab import drive
drive.mount('/content/drive')

# Base path to your “Colab Notebooks” folder
base = '/content/drive/My Drive/Colab Notebooks'

import json, re, numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss
import nltk
from nltk.corpus import stopwords

# Download stopwords on first run
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def tokenize(text):
    tokens = re.findall(r"\w+", text.lower())
    return [t for t in tokens if t not in stop_words]

# 1. Load & merge your JSON corpora
with open(f'{base}/top_ai_questions.json') as f:
    ai_data = json.load(f)
with open(f'{base}/top_datascience_questions.json') as f:
    ds_data = json.load(f)

# 2. Extract “passages” from title + body
passages = []
for entry in ai_data + ds_data:
    title = entry.get('title', '').strip()
    body  = entry.get('body', '').strip()
    # Optionally strip HTML tags from body if needed:
    # import re
    # body = re.sub(r'<[^>]+>', '', body)
    combined = title + "\n\n" + body
    passages.append(combined)

print(f"Loaded {len(passages)} passages.")

# 3. Build BM25 index
tokenized = [tokenize(p) for p in passages]
bm25 = BM25Okapi(tokenized)

def bm25_query(q, k=10):
    q_tok = tokenize(q)
    scores = bm25.get_scores(q_tok)
    idx = np.argsort(scores)[-k:][::-1]
    return [(passages[i], float(scores[i])) for i in idx]

# 4. Build semantic index
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embed_model.encode(passages, convert_to_numpy=True)
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

def semantic_query(q, k=10):
    q_emb = embed_model.encode([q], convert_to_numpy=True)
    D, I = index.search(q_emb, k)
    return [(passages[i], float(D[0][j])) for j, i in enumerate(I[0])]

# Example checks
print("BM25 top-3 for 'What is BM25?':")
for txt, sc in bm25_query("What is BM25?", 3):
    print(f"{sc:.2f} — {txt[:80]}…")

print("\nSemantic top-3 for 'What is BM25?':")
for txt, dist in semantic_query("What is BM25?", 3):
    print(f"{dist:.2f} — {txt[:80]}…")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loaded 1500 passages.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

BM25 top-3 for 'What is BM25?':
8.10 — How do I compute the structural similarity between sentences?

<p>I am working o…
0.00 — 5 years later, are maxout networks dead, and why?

<p><a href="https://arxiv.org…
0.00 — What is the relationship between these two taxonomies for machine learning with …

Semantic top-3 for 'What is BM25?':
1.43 — What is the bleu score of professional human translators?

<p>Machine translatio…
1.54 — What is the definition of &quot;soft label&quot; and &quot;hard label&quot;?

<p…
1.55 — Intuition Behind Restricted Boltzmann Machine (RBM)

<p>I went through Geoff Hin…


In [7]:

# 1) List out a few question titles so you can choose one
for i, p in enumerate(passages[:10]):
    title = p.split("\n\n")[0]
    print(f"{i:2d}: {title}")

# 2) Pick one of the above as your test query. For example index 1 was:
query = passages[1].split("\n\n")[0]
print("\nUsing query:", query)

# 3) Show BM25 & semantic results for that known‐in‐corpus query
print("\nBM25 top-3:")
for txt, score in bm25_query(query, k=3):
    print(f" • {score:.2f}\t{txt.splitlines()[0]}")

print("\nSemantic top-3:")
for txt, dist in semantic_query(query, k=3):
    print(f" • {dist:.2f}\t{txt.splitlines()[0]}")


 0: Could a paradox kill an AI?
 1: What is the difference between artificial intelligence and machine learning?
 2: How could self-driving cars make ethical decisions about who to kill?
 3: How can neural networks deal with varying input sizes?
 4: Do scientists know what is happening inside artificial neural networks?
 5: What is self-supervised learning in machine learning?
 6: What&#39;s the difference between model-free and model-based reinforcement learning?
 7: How is it possible that deep neural networks are so easily fooled?
 8: Why does the transformer do better than RNN and LSTM in long-range context dependencies?
 9: Why do we need explainable AI?

Using query: What is the difference between artificial intelligence and machine learning?

BM25 top-3:
 • 19.33	Difference between machine learning and artificial intelligence
 • 14.45	What are the top artificial intelligence journals?
 • 14.41	What is the difference between artificial intelligence and robots?

Semantic top-3:
 •

In [8]:
import math
import numpy as np

# --- 1) DEFINE YOUR EVAL SET ---
# Pick a handful of queries you know are answered in your corpus
eval_queries = [
    "What is the difference between artificial intelligence and machine learning?",
    "How can neural networks deal with varying input sizes?",
    "What is self-supervised learning in machine learning?"
]

# For each query, list the exact corpus passages (by text) that you consider relevant.
# (You could also use passage indices if you prefer; here we match on the full text.)
ground_truth = {
    eval_queries[0]: {
        passages[i] for i, p in enumerate(passages)
        if "difference between artificial intelligence and machine learning" in p.lower()
    },
    eval_queries[1]: {
        passages[i] for i, p in enumerate(passages)
        if "varying input sizes" in p.lower()
    },
    eval_queries[2]: {
        passages[i] for i, p in enumerate(passages)
        if "self-supervised learning" in p.lower()
    }
}

# --- 2) METRIC FUNCTIONS ---
def binary_relevance(retrieved, relevant_set):
    return [1 if doc in relevant_set else 0 for doc in retrieved]

def average_precision(rel, k=10):
    rel = rel[:k]
    if sum(rel) == 0:
        return 0.0
    return sum((sum(rel[:i+1])/(i+1))*rel[i] for i in range(len(rel))) / sum(rel)

def reciprocal_rank(rel, k=10):
    rel = rel[:k]
    for i, r in enumerate(rel):
        if r:
            return 1.0/(i+1)
    return 0.0

def dcg(rel, k=10):
    return sum((2**rel[i]-1)/math.log2(i+2) for i in range(min(len(rel), k)))

def ndcg(rel, k=10):
    ideal = sorted(rel, reverse=True)
    max_dcg = dcg(ideal, k)
    return dcg(rel, k) / max_dcg if max_dcg > 0 else 0.0

def evaluate(ranked_lists, ground_truth_sets):
    maps, mrrs, ndcgs = [], [], []
    for q, rel_list in zip(eval_queries, ranked_lists):
        relevant = ground_truth_sets[q]
        rel_vec = binary_relevance(rel_list, relevant)
        maps.append(average_precision(rel_vec))
        mrrs.append(reciprocal_rank(rel_vec))
        ndcgs.append(ndcg(rel_vec))
    return np.mean(maps), np.mean(mrrs), np.mean(ndcgs)

# --- 3) RUN EVAL FOR BM25 & SEMANTIC ---
bm25_ranked = []
sem_ranked   = []

for q in eval_queries:
    bm25_ranked.append([doc for doc, _ in bm25_query(q, k=10)])
    sem_ranked.append([doc for doc, _ in semantic_query(q, k=10)])

bm25_map, bm25_mrr, bm25_ndcg = evaluate(bm25_ranked, ground_truth)
sem_map,   sem_mrr,   sem_ndcg   = evaluate(sem_ranked,   ground_truth)

print("BM25   → MAP@10: {:.3f}, MRR@10: {:.3f}, nDCG@10: {:.3f}".format(bm25_map, bm25_mrr, bm25_ndcg))
print("Semantic → MAP@10: {:.3f}, MRR@10: {:.3f}, nDCG@10: {:.3f}".format(sem_map,   sem_mrr,   sem_ndcg))


BM25   → MAP@10: 0.733, MRR@10: 0.733, nDCG@10: 0.796
Semantic → MAP@10: 0.833, MRR@10: 0.833, nDCG@10: 0.877


In [2]:

# Install all required libraries for the A100-optimized RAG pipeline
!pip install transformers accelerate bitsandbytes sentence-transformers faiss-cpu rank_bm25 nltk


Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collec

In [5]:
# 1) (Re)install & import
!pip install transformers accelerate bitsandbytes --quiet

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BitsAndBytesConfig,
    pipeline
)
import torch

# 2) Load FLAN-T5-Base in 8-bit on A100
model_name = "google/flan-t5-base"
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.float16
)

# 3) Build the text2text pipeline (no device= argument!)
qa = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    do_sample=False,
    truncation=True
)

# 4) Strong instruction prompt to force a definition
def generate_definition(query: str) -> str:
    prompt = (
        "Task: In exactly two sentences, explain the following machine learning concept "
        "in your own words. Do NOT repeat the term itself and do NOT ask a follow-up question.\n\n"
        f"Concept: {query}\n\n"
        "Answer:"
    )
    return qa(prompt)[0]["generated_text"].strip()

# 5) Test it
if __name__ == "__main__":
    print(generate_definition("self-supervised learning in machine learning"))
    print("GPU:", torch.cuda.get_device_name(0))


Device set to use cuda:0


Self-supervised learning (SBL) is a type of self-supervised learning in machine learning.
GPU: NVIDIA A100-SXM4-40GB


In [8]:
import os, re, json, numpy as np, torch, nltk, faiss
from google.colab import drive
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

# 1) Mount & load corpora
drive.mount("/content/drive", force_remount=False)
BASE = "/content/drive/My Drive/Colab Notebooks"
with open(f"{BASE}/top_ai_questions.json")    as f: ai = json.load(f)
with open(f"{BASE}/top_datascience_questions.json") as f: ds = json.load(f)

# 2) Build passages
passages = []
for e in ai + ds:
    title = e.get("title","").strip()
    body  = re.sub(r"<[^>]+>","", e.get("body","")).strip()
    passages.append(f"{title}\n\n{body}")

# 3) BM25 index
nltk.download("stopwords")
stop = set(stopwords.words("english"))
def tokenize(txt):
    return [t for t in re.findall(r"\w+", txt.lower()) if t not in stop]
bm25 = BM25Okapi([tokenize(p) for p in passages])
def bm25_query(q,k=5):
    toks,scores = tokenize(q), bm25.get_scores(tokenize(q))
    idx = np.argsort(scores)[-k:][::-1]; return [(passages[i],scores[i]) for i in idx]

# 4) Semantic index
embed = SentenceTransformer("all-MiniLM-L6-v2")
embs  = embed.encode(passages, convert_to_numpy=True)
idx   = faiss.IndexFlatL2(embs.shape[1]); idx.add(embs)
def semantic_query(q,k=5):
    D,I = idx.search(embed.encode([q],convert_to_numpy=True), k)
    return [(passages[i], float(D[0][j])) for j,i in enumerate(I[0])]

# 5) RRF fusion
def rrf(lists, top_k=5):
    sc = {}
    for lst in lists:
        for r,(t,_) in enumerate(lst,1):
            sc[t]=sc.get(t,0)+1/r
    return [t for t,_ in sorted(sc.items(), key=lambda x:-x[1])[:top_k]]

# 6) Load MPT-7B-Instruct in 8-bit on A100
quant = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)
tok   = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct")
mdl   = AutoModelForCausalLM.from_pretrained(
    "mosaicml/mpt-7b-instruct",
    quantization_config=quant,
    device_map="auto",
    torch_dtype=torch.float16
)
rag = pipeline(
    "text-generation", model=mdl, tokenizer=tok,
    max_new_tokens=200, do_sample=False, return_full_text=False
)

# 7) RAG agent
def generate_with_mpt(query, k=5):
    bm = bm25_query(query,k); sm = semantic_query(query,k)
    ctx = "\n\n".join(f"[{i+1}] {t[:200]}..." for i,(t,_) in enumerate(rrf([bm,sm],k)))
    prompt = f"Background:\n{ctx}\n\nQuestion: {query}\nAnswer:"
    return rag(prompt)[0]["generated_text"].strip()

# 8) Test
print("Using GPU:", torch.cuda.get_device_name(0))
print(generate_with_mpt("Explain self-supervised learning in two sentences without repeating the term.", k=5))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.8k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Using GPU: NVIDIA A100-SXM4-40GB


ValueError: too many values to unpack (expected 2)

In [10]:
def generate_with_mpt(query: str, k: int = 5) -> str:
    # 1) Retrieve
    bm = bm25_query(query, k)
    sm = semantic_query(query, k)
    fused_texts = rrf([bm, sm], top_k=k)  # list of strings

    # 2) Build numbered context without backslashes in f-strings
    context_lines = []
    for i, txt in enumerate(fused_texts):
        # replace newlines, truncate, add ellipsis if needed
        flat = txt.replace("\n", " ")
        snippet = flat[:200] + ("..." if len(flat) > 200 else "")
        context_lines.append(f"[{i+1}] {snippet}")
    context = "\n\n".join(context_lines)

    # 3) Prompt
    prompt = (
        f"Background:\n{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )

    # 4) Generate (deterministic by default)
    return rag(prompt)[0]["generated_text"].strip()

# Test it
print("Using GPU:", torch.cuda.get_device_name(0))
print(generate_with_mpt(
    "Explain self-supervised learning in two sentences without repeating the term.",
    k=5
))


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Using GPU: NVIDIA A100-SXM4-40GB
Self-supervised learning is a machine learning technique that uses unlabeled data to learn useful representations of the data.  This is in contrast to supervised learning, which uses labeled data to learn useful representations.  Self-supervised learning is useful because it can learn useful representations from unlabeled data, which is often more plentiful than labeled data.


In [12]:
# 1) Define your evaluation questions
eval_questions = [
    "What is contrastive learning?",
    "How does self-attention work in Transformers?",
    "Explain transfer learning in two sentences.",
    "What are deconvolutional layers?",
    "How do you set class weights in Keras for imbalanced data?"
]

# 2) Collect (query, retrieved, answer) tuples
examples = []
for q in eval_questions:
    # retrieve top‐3 passages via BM25 + semantic + RRF
    bm = bm25_query(q, k=3)
    sm = semantic_query(q, k=3)
    ctx = reciprocal_rank_fusion([bm, sm], top_k=3)  # list of passages
    # generate answer with Flan-T5 RAG function
    answer = generate_answer_with_t5(q, k=3)
    examples.append((q, ctx, answer))

# 3) Render Markdown in Colab
for q, ctx, ans in examples:
    print(f"### Q: {q}\n")
    print("**Retrieved Passages:**")
    for i, p in enumerate(ctx, 1):
        snippet = p.replace("\n", " ")[:150] + ("…" if len(p) > 150 else "")
        print(f"- [{i}] {snippet}")
    print(f"\n**Answer:**\n{ans}\n")
    print("---\n")


NameError: name 'generate_answer_with_t5' is not defined