## Step 1: Imports and Environment Check

In [1]:
# Device Check
import torch
import transformers
import peft
import datasets
import sklearn
import numpy as np
import evaluate

# Report the version of every library the notebook imports, followed by the
# compute device, so readers can tell which environment produced the outputs.
print("=== Environment Pre-Check ===")
# Each label carries its own trailing padding so the printed lines are
# reproduced exactly as before.
_version_rows = (
    ("Torch version:          ", torch),
    ("Transformers version:   ", transformers),
    ("PEFT version:           ", peft),
    ("Datasets version:       ", datasets),
    ("Scikit-learn version:   ", sklearn),
    ("NumPy version:          ", np),
    ("Evaluate version: ", evaluate),
)
for _label, _module in _version_rows:
    print(f"{_label}{_module.__version__}")

print("\n=== Device Check ===")
_has_cuda = torch.cuda.is_available()
print(f"CUDA available: {_has_cuda}")
print(f"Device in use:  {'cuda' if _has_cuda else 'cpu'}")


=== Environment Pre-Check ===
Torch version:          2.9.1+cpu
Transformers version:   4.57.3
PEFT version:           0.18.0
Datasets version:       4.4.1
Scikit-learn version:   1.7.2
NumPy version:          2.3.5
Evaluate version: 0.4.6

=== Device Check ===
CUDA available: False
Device in use:  cpu


## Step 2: Dataset Preparation

In [9]:
from datasets import load_dataset, DatasetDict

# Fetch IMDb; the "train" and "test" splits are the ones used below.
raw = load_dataset("imdb")

# Carve a validation set out of train: 20%, stratified on the label column,
# with a fixed seed so the split is repeatable.
splits = raw["train"].train_test_split(
    test_size=0.2,
    stratify_by_column="label",
    seed=42,
)
train_ds, val_ds = splits["train"], splits["test"]

# The original IMDb test split stays as the held-out test set.
test_ds = raw["test"]

dataset = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

# CPU-debugging subset: shuffle each split with a fixed seed, then keep the
# first N rows. The shuffle is not stratified, so class balance is only
# approximate (the recorded test subset came out 53 negative / 47 positive).
subset_sizes = {"train": 500, "validation": 200, "test": 100}
dataset_small = DatasetDict({
    split_name: dataset[split_name].shuffle(seed=42).select(range(n_rows))
    for split_name, n_rows in subset_sizes.items()
})
# Everything downstream works on the reduced dataset.
dataset = dataset_small


## Step 3: Tokenization & Formatting

In [3]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and by the model loads in later cells.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    legacy=False,
)

def preprocess_fn(examples):
    """Convert a batch of raw rows into fixed-length seq2seq inputs and targets.

    Args:
        examples: batch mapping with a "text" sequence and a "label" sequence.
            Label 0 is rendered as the word "negative", any other value as
            "positive".

    Returns:
        The tokenizer output for the prompts, with an extra "labels" entry
        holding the token ids of the target word.
    """
    prompts = [f"review: {review}" for review in examples["text"]]
    target_words = [
        "negative" if label == 0 else "positive" for label in examples["label"]
    ]

    # Source side: truncate / pad every prompt to exactly 128 tokens.
    model_inputs = tokenizer(
        prompts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    # Target side: the single sentiment word, padded to 5 tokens.
    target_enc = tokenizer(
        text_target=target_words,
        max_length=5,
        truncation=True,
        padding="max_length",
    )

    # NOTE(review): padded target positions keep the tokenizer's pad id here.
    # Presumably they therefore count toward the training loss; masking them
    # (e.g. with -100) would also require the evaluation cell to map that value
    # back before decoding -- confirm before changing.
    model_inputs["labels"] = target_enc["input_ids"]
    return model_inputs

# Tokenize every split in batches; the raw columns are dropped so only the
# tokenizer outputs remain.
tokenized = dataset.map(
    preprocess_fn,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
# Expose the three model fields as torch tensors.
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## Step 4: Baseline Comparison (No Fine-Tuning)

In [10]:
# baseline
from transformers import pipeline, AutoModelForSeq2SeqLM

# Pretrained checkpoint with no fine-tuning; serves as the comparison baseline.
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# NOTE(review): none of the visible cells call baseline_pipe (predictions go
# through baseline_model.generate directly); it is kept in case other cells
# rely on it. The recorded run reports "Device set to use cpu" for device=-1.
baseline_pipe = pipeline(
    task="text2text-generation",
    model=baseline_model,
    tokenizer=tokenizer,
    device=-1,
)

def baseline_predict(texts, batch_size=16, max_length=256):
    """Classify reviews with the un-fine-tuned baseline model.

    Each review is prefixed with "review: ", tokenized, passed through
    ``baseline_model.generate`` and the decoded text is mapped to a class id.

    Args:
        texts: iterable of review strings (any iterable; it is materialised
            into a list so lazy dataset columns work too).
        batch_size: number of reviews generated per forward pass. Batching
            keeps peak memory bounded instead of pushing every review through
            the model at once.
        max_length: prompts longer than this many tokens are truncated. The
            default matches the previous hard-coded value.

    Returns:
        A list of ints, one per input: 1 if the generated text contains
        "positive" (case-insensitive), otherwise 0. Note that any output that
        does not contain "positive" -- including unrelated text -- is counted
        as class 0.

    Raises:
        ValueError: if ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # Nothing to classify; avoid calling the tokenizer on an empty batch.
        return []

    # Keep the inputs on whatever device the model currently lives on.
    device = next(baseline_model.parameters()).device

    preds = []
    for start in range(0, len(texts), batch_size):
        prompts = [f"review: {t}" for t in texts[start:start + batch_size]]
        # Truncate long reviews to max_length tokens; pad only to the longest
        # prompt within this batch.
        enc = tokenizer(
            prompts,
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
            padding=True,
        ).to(device)
        outs = baseline_model.generate(
            input_ids=enc["input_ids"],
            attention_mask=enc["attention_mask"],
            max_new_tokens=3,
        )
        preds_str = tokenizer.batch_decode(outs, skip_special_tokens=True)
        preds.extend(1 if "positive" in s.lower() else 0 for s in preds_str)
    return preds

# Score the baseline on the first 200 validation reviews, or on the whole
# validation split if it holds fewer than 200 rows.
n_baseline = min(200, len(dataset["validation"]))
sample = dataset["validation"].select(range(n_baseline))
baseline_preds = baseline_predict(sample["text"])
# Materialise the label column as a plain list of ints for the metric calls.
baseline_refs = list(sample["label"])

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# The header reports the number of samples actually scored.
print(f"\nBaseline (no fine-tuning) on {len(sample)} validation samples:")
print("Accuracy:", accuracy.compute(predictions=baseline_preds, references=baseline_refs)["accuracy"])
# zero_division=0: when the baseline never predicts class 1, precision is
# undefined; report 0.0 explicitly instead of emitting a warning.
print("Precision:", precision.compute(predictions=baseline_preds, references=baseline_refs, average="binary", zero_division=0)["precision"])
print("Recall:", recall.compute(predictions=baseline_preds, references=baseline_refs, average="binary")["recall"])
print("F1:", f1.compute(predictions=baseline_preds, references=baseline_refs, average="binary")["f1"])

Device set to use cpu



Baseline (no fine-tuning) on 200 validation samples:
Accuracy: 0.485
Precision: 0.0
Recall: 0.0
F1: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Step 5: PEFT + LoRA Setup

In [5]:
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model

# Fresh copy of the pretrained checkpoint to receive the LoRA adapters.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# LoRA hyper-parameters: rank-8 adapters on the modules named "q" and "v",
# scaling alpha 32, adapter dropout 0.1, no bias terms trained.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters
# remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


## Step 6: Training Configuration

In [12]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training hyper-parameters for the LoRA fine-tune: one epoch, with an
# evaluation pass on the validation split at the end of each epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=1,
    # Use generate() during evaluation/prediction so outputs can be decoded
    # back to text.
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=50
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    # `tokenizer=` is deprecated on Trainer classes in the Transformers
    # version used here; `processing_class=` is its replacement and avoids
    # the warning emitted at this call.
    processing_class=tokenizer
)

# Last expression of the cell: the returned TrainOutput is displayed.
trainer.train()


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,0.0922,0.085823


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: ec609d31-e0f9-4974-a763-babc1c998cad)')' thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 1s [Retry 1/5].


TrainOutput(global_step=125, training_loss=0.09115130043029786, metrics={'train_runtime': 171.6717, 'train_samples_per_second': 2.913, 'train_steps_per_second': 0.728, 'total_flos': 17030971392000.0, 'train_loss': 0.09115130043029786, 'epoch': 1.0})

## Step 7: Evaluation Metrics

In [13]:
from sklearn.metrics import classification_report
import evaluate

# Load the four classification metrics in one pass; the names are bound in the
# same order as the metric ids.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "precision", "recall", "f1")
)

def to_int_labels(strs):
    """Map decoded strings to class ids.

    Args:
        strs: iterable of strings.

    Returns:
        A list with 1 for every string containing "positive"
        (case-insensitive) and 0 for every other string.
    """
    class_ids = []
    for decoded in strs:
        is_positive = "positive" in decoded.lower()
        class_ids.append(1 if is_positive else 0)
    return class_ids

# Test evaluation (Assignment 7 held-out test set)
test_out = trainer.predict(tokenized["test"])

# -100 is the conventional "ignore" fill value and is not a valid token id, so
# map any occurrence back to the pad id before decoding. This is a no-op when
# the arrays contain no -100.
pad_id = tokenizer.pad_token_id
test_pred_ids = np.where(test_out.predictions != -100, test_out.predictions, pad_id)
test_label_ids = np.where(test_out.label_ids != -100, test_out.label_ids, pad_id)

test_preds_str = tokenizer.batch_decode(test_pred_ids, skip_special_tokens=True)
test_labels_str = tokenizer.batch_decode(test_label_ids, skip_special_tokens=True)

# Both predictions and references go through the same text -> class-id mapping.
test_preds = to_int_labels(test_preds_str)
test_refs = to_int_labels(test_labels_str)

print("Test metrics:")
print("Accuracy:", accuracy.compute(predictions=test_preds, references=test_refs)["accuracy"])
print("Precision (Macro):", precision.compute(predictions=test_preds, references=test_refs, average="macro")["precision"])
print("Recall (Macro):", recall.compute(predictions=test_preds, references=test_refs, average="macro")["recall"])
print("F1-Score (Macro):", f1.compute(predictions=test_preds, references=test_refs, average="macro")["f1"])

print("\nClassification report (test):")
# labels=[0, 1] pins the class order to target_names, so the report still
# works when one class is absent from both predictions and references.
print(classification_report(test_refs, test_preds, labels=[0, 1], target_names=["negative", "positive"]))


Test metrics:
Accuracy: 0.8
Precision (Macro): 0.8044646548160397
Recall (Macro): 0.795664391810518
F1-Score (Macro): 0.797077922077922

Classification report (test):
              precision    recall  f1-score   support

    negative       0.78      0.87      0.82        53
    positive       0.83      0.72      0.77        47

    accuracy                           0.80       100
   macro avg       0.80      0.80      0.80       100
weighted avg       0.80      0.80      0.80       100



## Step 8: Inference on Custom Reviews

In [8]:
def classify_review(text: str) -> str:
    """Classify a single review with the fine-tuned model.

    Args:
        text: raw review text; the "review: " prefix used in training is
            added here.

    Returns:
        "positive" if the generated text contains "positive"
        (case-insensitive), otherwise "negative". Any other generated text is
        therefore reported as "negative".
    """
    model = trainer.model
    # Switch off dropout so repeated calls give the same answer.
    model.eval()
    # The trainer may have moved the model off the CPU; the inputs must be on
    # the same device as the weights.
    device = next(model.parameters()).device

    prompt = f"review: {text}"
    # Truncate to 128 tokens, the same limit applied to the training inputs.
    enc = tokenizer(
        prompt,
        max_length=128,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    gen = model.generate(**enc, max_new_tokens=3)
    pred_str = tokenizer.decode(gen[0], skip_special_tokens=True)
    return "positive" if "positive" in pred_str.lower() else "negative"

# Three hand-written probes, printed as "Test <n>: <prediction>".
demo_reviews = (
    # Clear positive case -- expected: positive
    "This movie was amazing!",
    # Clear negative case -- expected: negative
    "Terrible acting and a boring plot.",
    # Ambiguous/mixed sentiment case -- the model's prediction may vary
    "The visuals were stunning, but the story was weak.",
)
for _case_no, _review in enumerate(demo_reviews, start=1):
    print(f"Test {_case_no}:", classify_review(_review))


Test 1: positive
Test 2: negative
Test 3: negative
