<a href="https://colab.research.google.com/github/perctapera/MSIT3103-GenAI-assignments/blob/main/assignment2_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Assignment 2 — Step-by-Step: GPT‑2, BERT, and T5 on CNN/DailyMail

**Decoder-only** — GPT‑2 (causal LM)
**Encoder-only** — BERT (extractive summarization via sentence scoring)  
**Encoder-decoder** — T5 (text-to-text summarization)


In [26]:
!pip install -q datasets transformers accelerate evaluate rouge-score sacrebleu nltk sentencepiece matplotlib pandas evaluate
import nltk; nltk.download('punkt')
!pip install nltk
import nltk
nltk.download('punkt_tab')
import transformers
print(transformers.__version__)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


4.56.1


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



## Imports, Reproducibility, and Global Config


In [6]:

import os, math, json, random
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from datasets import load_dataset
import evaluate

from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM, AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling, DataCollatorForSeq2Seq,
    Trainer, TrainingArguments
)

from nltk.tokenize import sent_tokenize

# Reproducibility
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Device:', device)

# Global Hyperparameters
MAX_SOURCE_LEN = 512
MAX_TARGET_LEN = 128
EPOCHS = 3
TRAIN_BS = 4
EVAL_BS = 4

# Quick-run toggles
QUICK_RUN = True         # use smaller subsets
TRAIN_EXAMPLES = 2000    # 2k for dev; increase for real runs
VAL_EXAMPLES   = 400
EVAL_SAMPLES   = 100     # #val examples for computing metrics quickly

# Output dirs
Path('artifacts').mkdir(exist_ok=True)


Device: cuda



## Dataset: CNN/DailyMail (3.0.0)

We use the **CNN/DailyMail** summarization dataset. Each record has:
- `article` — input document
- `highlights` — reference summary

We create **small subsets** for dev runs; played along with size limits.


In [7]:

dataset = load_dataset("cnn_dailymail", "3.0.0")

if QUICK_RUN:
    small_train = dataset["train"].shuffle(seed=SEED).select(range(min(TRAIN_EXAMPLES, len(dataset['train']))))
    small_val   = dataset["validation"].shuffle(seed=SEED).select(range(min(VAL_EXAMPLES, len(dataset['validation']))))
else:
    small_train = dataset["train"]
    small_val   = dataset["validation"]

print(dataset)
print('Train subset:', len(small_train), ' | Val subset:', len(small_val))
print('Columns:', small_train.column_names)

i = 0
print("\n--- Peek sample ---")
print("Article (truncated):\n", small_train[i]["article"][:800], "...")
print("\nReference summary:\n", small_train[i]["highlights"])


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})
Train subset: 2000  | Val subset: 400
Columns: ['article', 'highlights', 'id']

--- Peek sample ---
Article (truncated):
 By . Anthony Bond . PUBLISHED: . 07:03 EST, 2 March 2013 . | . UPDATED: . 08:07 EST, 2 March 2013 . Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died


## Metrics & Utility Helpers
We’ll compute:
- **ROUGE** (1/2/L)
- **BLEU** (via `sacrebleu`)
- **Perplexity (PPL)** for generative models (from eval loss)

- Safe generation helpers
- Log history saving & loss plotting


In [8]:

rouge_metric = evaluate.load("rouge")
bleu_metric  = evaluate.load("sacrebleu")

def compute_rouge_bleu(preds, refs):
    """Compute ROUGE and BLEU. `refs` must be list[str] for ROUGE and list[list[str]] for BLEU inside this function."""
    rouge = rouge_metric.compute(predictions=preds, references=refs)
    bleu  = bleu_metric.compute(predictions=preds, references=[[r] for r in refs])
    return rouge, bleu

def exp_perplexity(eval_loss: float):
    try:
        return math.exp(eval_loss)
    except Exception:
        return None

def save_metrics(name, metrics_dict):
    path = Path("artifacts") / f"{name}_metrics.json"
    with open(path, "w", encoding="utf-8") as f:
        json.dump(metrics_dict, f, indent=2)
    print(f"Saved metrics → {path.resolve()}")

def save_log_history(name, trainer):
    hist = trainer.state.log_history if hasattr(trainer, "state") else []
    path = Path("artifacts") / f"{name}_log_history.json"
    with open(path, "w", encoding="utf-8") as f:
        json.dump(hist, f, indent=2)
    print(f"Saved log history → {path.resolve()}")

def plot_loss_from_history(log_history, title):
    df = pd.DataFrame(log_history)
    df = df[df.get("loss").notna() | df.get("eval_loss").notna()]
    plt.figure(figsize=(7,4))
    if "loss" in df:
        plt.plot(df["step"], df["loss"], label="train_loss")
    if "eval_loss" in df:
        plt.plot(df["step"], df["eval_loss"], label="eval_loss")
    plt.legend(); plt.title(title); plt.xlabel("step"); plt.ylabel("loss"); plt.show()

def plot_loss_from_file(name, title=None):
    path = Path("artifacts") / f"{name}_log_history.json"
    if not path.exists():
        print("No log history found:", path)
        return
    with open(path, "r", encoding="utf-8") as f:
        hist = json.load(f)
    plot_loss_from_history(hist, title or name)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

## Decoder-only — GPT‑2 (Abstractive Summarization)
**Framing:** causal LM over:  
`"Summarize the following article:\n{article}\n\nTL;DR:\n{summary}"`

> For stricter training, we can mask the prompt (set labels to `-100` for prompt tokens) so loss focuses on the summary.


In [12]:

from transformers import DataCollatorForLanguageModeling

gpt_name = "gpt2"
gpt_tok = AutoTokenizer.from_pretrained(gpt_name)
gpt_tok.pad_token = gpt_tok.eos_token  # add pad token

SEP = "\n\nTL;DR:\n"

def gpt_map(batch):
    text = [f"Summarize the following article:\n{a}{SEP}{s}" for a, s in zip(batch["article"], batch["highlights"])]
    enc = gpt_tok(text, truncation=True, padding="max_length",
                  max_length=MAX_SOURCE_LEN + MAX_TARGET_LEN)
    enc["labels"] = enc["input_ids"].copy()
    return enc

gpt_train = small_train.map(gpt_map, batched=True, remove_columns=small_train.column_names)
gpt_val   = small_val.map(gpt_map,   batched=True, remove_columns=small_val.column_names)

gpt_model = AutoModelForCausalLM.from_pretrained(gpt_name).to(device)

gpt_args = TrainingArguments(
    output_dir="out_gpt2",
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    learning_rate=1e-4,
    num_train_epochs=EPOCHS,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    report_to=["none"],
    fp16=torch.cuda.is_available(),
)

gpt_collator = DataCollatorForLanguageModeling(gpt_tok, mlm=False)

trainer_gpt = Trainer(
    model=gpt_model, args=gpt_args,
    train_dataset=gpt_train, eval_dataset=gpt_val,
    data_collator=gpt_collator, tokenizer=gpt_tok
)

print("GPT‑2 is configured. Uncomment trainer_gpt.train() to fine-tune.")
trainer_gpt.train()
save_log_history("gpt2", trainer_gpt)
gpt2_last_eval = [x for x in trainer_gpt.state.log_history if "eval_loss" in x][-1]
print("GPT‑2 Perplexity:", exp_perplexity(gpt2_last_eval["eval_loss"]))


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

  trainer_gpt = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


GPT‑2 is configured. Uncomment trainer_gpt.train() to fine-tune.


Epoch,Training Loss,Validation Loss
1,3.0244,2.84562
2,2.7718,2.838131
3,2.6476,2.847147


Saved log history → /content/artifacts/gpt2_log_history.json
GPT‑2 Perplexity: 17.23852166170957



### GPT‑2 Generation & Metrics
After training, we run the following to generate summaries, compute ROUGE/BLEU/Perplexity, and saving metrics.


In [14]:

def gpt_generate(article, max_new_tokens=128):
    prompt = f"Summarize the following article:\n{article}{SEP}"
    inp = gpt_tok(prompt, return_tensors="pt", truncation=True, max_length=MAX_SOURCE_LEN).to(device)
    out = gpt_model.generate(**inp, max_new_tokens=max_new_tokens, do_sample=False)
    text = gpt_tok.decode(out[0], skip_special_tokens=True)
    return text.split(SEP)[-1].strip()

def evaluate_gpt2(n=EVAL_SAMPLES):
    sample = small_val.select(range(min(n, len(small_val))))
    preds = [gpt_generate(a) for a in sample["article"]]
    refs  = sample["highlights"]
    rouge, bleu = compute_rouge_bleu(preds, refs)
    # PPL from last eval loss if available
    ppl = None
    if hasattr(trainer_gpt, "state"):
        ev = [x for x in trainer_gpt.state.log_history if "eval_loss" in x]
        if ev:
            ppl = exp_perplexity(ev[-1]["eval_loss"])
    metrics = {"rouge": rouge, "bleu": bleu, "perplexity": ppl}
    save_metrics("gpt2", metrics)
    # return first 3 pairs for quick inspection
    examples = [{"pred": p, "ref": r} for p, r in list(zip(preds, refs))[:3]]
    print("Sample outputs saved below (first 3).")
    for j, ex in enumerate(examples):
        print(f"\n--- Example {j} ---\nPRED:\n{ex['pred']}\n\nREF:\n{ex['ref']}")
    return metrics

# evaluate_gpt2(n=50)



## Encoder-only — BERT (Extractive Summarization)

**Approach:** Split the article into sentences, score each with a **BERT classifier** trained with **weak labels**:
- For each sentence, compute ROUGE‑1 F1 vs. the gold summary.
- Label = 1 if ≥ τ (e.g., 0.5), else 0.
- At inference, select top‑k sentences (e.g., k=3).

This reflects BERT’s strength in **understanding/extraction** rather than free-form generation.


In [18]:

bert_name = "bert-base-uncased"
bert_tok = AutoTokenizer.from_pretrained(bert_name)

def weak_labels(article, summary, tau=0.5, max_sents=30):
    sents = sent_tokenize(article)[:max_sents]
    labels = []
    for s in sents:
        r = rouge_metric.compute(predictions=[s], references=[summary])
        f1 = r["rouge1"]
        labels.append(1 if f1 >= tau else 0)
    return sents, labels

def bert_map(batch, max_sents=30, tau=0.5):
    input_ids, attention_mask, labels = [], [], []
    for a, s in zip(batch["article"], batch["highlights"]):
        sents, labs = weak_labels(a, s, tau=tau, max_sents=max_sents)
        enc = bert_tok(sents, truncation=True, padding="max_length", max_length=128)
        input_ids.extend(enc["input_ids"]); attention_mask.extend(enc["attention_mask"]); labels.extend(labs)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

bert_train = small_train.select(range(min(1500, len(small_train)))).map(
    lambda b: bert_map(b, max_sents=30, tau=0.5),
    batched=True, remove_columns=small_train.column_names
)
bert_val = small_val.select(range(min(300, len(small_val)))).map(
    lambda b: bert_map(b, max_sents=30, tau=0.5),
    batched=True, remove_columns=small_val.column_names
)

bert_model = AutoModelForSequenceClassification.from_pretrained(bert_name, num_labels=2).to(device)

def cls_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = float((preds == labels).mean())
    return {"accuracy": acc}

bert_args = TrainingArguments(
    output_dir="out_bert_extractive",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    report_to=["none"],
    fp16=torch.cuda.is_available()
)

trainer_bert = Trainer(
    model=bert_model, args=bert_args,
    train_dataset=bert_train, eval_dataset=bert_val,
    tokenizer=bert_tok, compute_metrics=cls_metrics
)

print("BERT extractive is configured. Uncomment trainer_bert.train() to fine-tune.")
# trainer_bert.train()
# save_log_history("bert_extractive", trainer_bert)




Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

KeyboardInterrupt: 


### Build Extractive Summaries & Score
After training, select top‑k sentences by predicted probability and compute ROUGE/BLEU.


In [27]:

def summarize_extractive(article, k=3, max_sents=30):
    sents = sent_tokenize(article)[:max_sents]
    enc = bert_tok(sents, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        logits = bert_model(**enc).logits
        probs = torch.softmax(logits, dim=1)[:,1].cpu().numpy()
    idx = np.argsort(probs)[-k:]; idx.sort()
    return " ".join([sents[i] for i in idx])

def evaluate_bert_extractive(n=EVAL_SAMPLES, k=3):
    sample = small_val.select(range(min(n, len(small_val))))
    preds = [summarize_extractive(a, k=k) for a in sample["article"]]
    refs  = sample["highlights"]
    rouge, bleu = compute_rouge_bleu(preds, refs)
    metrics = {"rouge": rouge, "bleu": bleu, "perplexity": "N/A"}
    save_metrics("bert_extractive", metrics)
    examples = [{"pred": p, "ref": r} for p, r in list(zip(preds, refs))[:3]]
    print("Sample outputs saved below (first 3).")
    for j, ex in enumerate(examples):
        print(f"\n--- Example {j} ---\nPRED:\n{ex['pred']}\n\nREF:\n{ex['ref']}")
    return metrics

evaluate_bert_extractive(n=50, k=3)


NameError: name 'bert_model' is not defined


## Encoder-decoder — T5‑small (Abstractive Summarization)

**Framing:** text‑to‑text with prefix `"summarize: "`.


In [29]:

t5_name = "t5-small"
t5_tok = AutoTokenizer.from_pretrained(t5_name)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_name).to(device)

def t5_map(batch):
    inputs = ["summarize: " + a for a in batch["article"]]
    model_inputs = t5_tok(inputs, max_length=MAX_SOURCE_LEN, truncation=True)
    with t5_tok.as_target_tokenizer():
        labels = t5_tok(batch["highlights"], max_length=MAX_TARGET_LEN, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

t5_train = small_train.map(t5_map, batched=True, remove_columns=small_train.column_names)
t5_val   = small_val.map(t5_map,   batched=True, remove_columns=small_val.column_names)

t5_args = TrainingArguments(
    output_dir="out_t5",
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    learning_rate=5e-5,
    num_train_epochs=EPOCHS,
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=MAX_TARGET_LEN,
    logging_steps=50,
    report_to=["none"],
    fp16=torch.cuda.is_available()
)

t5_collator = DataCollatorForSeq2Seq(t5_tok, model=t5_model)

def t5_metrics(eval_pred):
    preds, labels = eval_pred
    pred_texts = t5_tok.batch_decode(preds, skip_special_tokens=True)
    labels[labels == -100] = t5_tok.pad_token_id
    refs = t5_tok.batch_decode(labels, skip_special_tokens=True)
    r = rouge_metric.compute(predictions=pred_texts, references=refs)
    b = bleu_metric.compute(predictions=pred_texts, references=[[x] for x in refs])
    return {**r, "bleu": b["score"]}

trainer_t5 = Trainer(
    model=t5_model, args=t5_args,
    train_dataset=t5_train, eval_dataset=t5_val,
    data_collator=t5_collator, tokenizer=t5_tok,
    compute_metrics=t5_metrics
)

print("T5 is configured. Uncomment trainer_t5.train() to fine-tune.")
trainer_t5.train()
save_log_history("t5", trainer_t5)
t5_last_eval = [x for x in trainer_t5.state.log_history if "eval_loss" in x][-1]
print("T5 Perplexity:", exp_perplexity(t5_last_eval["eval_loss"]))


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Map:   0%|          | 0/400 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'predict_with_generate'

### T5 Generation & Metrics


In [30]:

def t5_generate(article, max_new_tokens=128):
    inp = t5_tok("summarize: " + article, return_tensors="pt", truncation=True, max_length=MAX_SOURCE_LEN).to(device)
    out = t5_model.generate(**inp, max_new_tokens=max_new_tokens)
    return t5_tok.decode(out[0], skip_special_tokens=True)

def evaluate_t5(n=EVAL_SAMPLES):
    sample = small_val.select(range(min(n, len(small_val))))
    preds = [t5_generate(a) for a in sample["article"]]
    refs  = sample["highlights"]
    rouge, bleu = compute_rouge_bleu(preds, refs)
    ppl = None
    if hasattr(trainer_t5, "state"):
        ev = [x for x in trainer_t5.state.log_history if "eval_loss" in x]
        if ev:
            ppl = exp_perplexity(ev[-1]["eval_loss"])
    metrics = {"rouge": rouge, "bleu": bleu, "perplexity": ppl}
    save_metrics("t5", metrics)
    examples = [{"pred": p, "ref": r} for p, r in list(zip(preds, refs))[:3]]
    print("Sample outputs saved below (first 3).")
    for j, ex in enumerate(examples):
        print(f"\n--- Example {j} ---\nPRED:\n{ex['pred']}\n\nREF:\n{ex['ref']}")
    return metrics

evaluate_t5(n=50)


NameError: name 'trainer_t5' is not defined


## Loss Curves (Visuals)

After you train each model, the previous cells save `*_log_history.json`.  


In [None]:

# usage after training:
plot_loss_from_file("gpt2", "GPT‑2 Loss")
plot_loss_from_file("bert_extractive", "BERT Extractive Loss")
plot_loss_from_file("t5", "T5 Loss")



## Comparative Table & CSV Export

Loads saved metrics JSONs (if present) and builds a single DataFrame for your report.


In [None]:

def load_metrics(name):
    p = Path("artifacts") / f"{name}_metrics.json"
    if not p.exists():
        return None
    with open(p, "r", encoding="utf-8") as f:
        return json.load(f)

def summarize_metrics():
    rows = []
    for tag, label in [("gpt2","GPT‑2 (decoder-only)"),
                       ("bert_extractive","BERT (encoder-only, extractive)"),
                       ("t5","T5-small (encoder-decoder)")]:

        m = load_metrics(tag)
        if not m:
            rows.append({"Model": label, "ROUGE-1": None, "ROUGE-2": None, "ROUGE-L": None,
                         "BLEU": None, "Perplexity": None})
            continue

        rouge = m.get("rouge", {})
        bleu  = m.get("bleu", {}).get("score", None)
        ppl   = m.get("perplexity", None)

        rows.append({
            "Model": label,
            "ROUGE-1": rouge.get("rouge1", None),
            "ROUGE-2": rouge.get("rouge2", None),
            "ROUGE-L": rouge.get("rougeL", None),
            "BLEU": bleu,
            "Perplexity": ppl
        })
    df = pd.DataFrame(rows)
    csv_path = Path("artifacts") / "comparative_metrics.csv"
    df.to_csv(csv_path, index=False)
    print("Saved comparative CSV →", csv_path.resolve())
    return df

summarize_metrics()



## Qualitative Panel (Side-by-side)

Show reference and outputs from each model for a given validation index.


In [None]:

def qualitative_panel(idx=0, max_new_tokens=128, k=3):
    idx = int(idx)
    art = small_val[idx]["article"]
    ref = small_val[idx]["highlights"]

    out = {"Reference": ref}
    try:
        out["GPT‑2"] = gpt_generate(art, max_new_tokens=max_new_tokens)
    except Exception as e:
        out["GPT‑2"] = f"(not generated: {e})"
    try:
        out["BERT (extractive)"] = summarize_extractive(art, k=k)
    except Exception as e:
        out["BERT (extractive)"] = f"(not generated: {e})"
    try:
        out["T5-small"] = t5_generate(art, max_new_tokens=max_new_tokens)
    except Exception as e:
        out["T5-small"] = f"(not generated: {e})"

    for k_, v in out.items():
        print(f"\n=== {k_} ===\n{v[:1200]}")
    return out

qualitative_panel(0)
