# Trump GPT-2 Medium — Fine-Tuning on Google Colab

Fine-tunes `gpt2-medium` on Trump speech data collected for ECE 595 NLP Assignment 01.

**Before running:**
1. Set runtime to **T4 GPU**: `Runtime → Change runtime type → T4 GPU`
2. Upload `trump_train.txt` when prompted in the *Load and Preprocess Dataset* cell (or place it in Drive at `MyDrive/ECE595/data/trump_train.txt` first)
3. Run all cells top-to-bottom

## 1 · Setup — Install Dependencies

In [None]:
# Install all required packages
!pip install -q transformers datasets accelerate soundfile scipy

import os, sys, torch
from datasets import Dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

# Report the PyTorch build and whether a CUDA device is visible to this runtime.
cuda_ok = torch.cuda.is_available()
device_label = torch.cuda.get_device_name(0) if cuda_ok else "CPU only"
print(f"PyTorch  : {torch.__version__}")
print(f"CUDA     : {cuda_ok}  —  {device_label}")

## 2 · Mount Google Drive

Mounting Drive lets checkpoints and the final model survive session disconnects.
After training, the model will be saved to `MyDrive/ECE595/models/trump/`.

In [None]:
from google.colab import drive
# Attach Google Drive under /content/drive for the rest of the session.
drive.mount("/content/drive")

# Root folder of the project on Drive; later cells build their paths from it.
DRIVE_ROOT = "/content/drive/MyDrive/ECE595"

# Create the data and model folders up front (no-op when they already exist).
for folder in (f"{DRIVE_ROOT}/data", f"{DRIVE_ROOT}/models/trump"):
    os.makedirs(folder, exist_ok=True)

print("Drive mounted. Project root:", DRIVE_ROOT)

## 3 · Configure Training Parameters

All hyperparameters are centralised here — edit this cell to tune the run.

In [None]:
# ── Hyperparameters ────────────────────────────────────────────────────────────
BASE_MODEL  = "gpt2-medium"   # 345 M params
EPOCHS      = 8
BATCH_SIZE  = 4               # per-device batch; safe for T4 (16 GB), raise to 8 on A100
GRAD_ACCUM  = 4               # effective batch = BATCH_SIZE × GRAD_ACCUM = 16
LEARN_RATE  = 5e-5
BLOCK_SIZE  = 512             # token length every example is padded/truncated to
RESUME      = False           # set True to continue from last checkpoint
USE_FP16    = True            # only takes effect when CUDA is available

# ── Paths (both live under the Drive project root) ────────────────────────────
TRAIN_DATA = f"{DRIVE_ROOT}/data/trump_train.txt"
OUTPUT_DIR = f"{DRIVE_ROOT}/models/trump"

# ── Device selection ──────────────────────────────────────────────────────────
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Device   : {DEVICE}")
print(f"Data     : {TRAIN_DATA}")
print(f"Output   : {OUTPUT_DIR}")

## 4 · Load and Preprocess Dataset

Upload `trump_train.txt` from your local machine **or** copy it to Drive manually first.  
If the file is already in Drive (at the path configured above) the upload step is skipped automatically.

In [None]:
# ── Upload from local disk if not already on Drive ────────────────────────────
# If the training file is already at TRAIN_DATA nothing is uploaded; otherwise a
# file picker is opened and the first uploaded file is written to TRAIN_DATA.
if os.path.isfile(TRAIN_DATA):
    print(f"Found existing data file: {TRAIN_DATA}")
else:
    from google.colab import files
    print("trump_train.txt not found on Drive — upload it now:")
    uploaded = files.upload()                     # opens file picker
    if not uploaded:
        # A cancelled picker yields no entries; fail with a clear message
        # instead of an IndexError on the empty result.
        raise RuntimeError(
            f"No file was uploaded — upload trump_train.txt or place it at {TRAIN_DATA} and re-run this cell."
        )
    # Only the first uploaded entry is used; its bytes are stored under the
    # configured name regardless of the original filename.
    fname = next(iter(uploaded))
    with open(TRAIN_DATA, "wb") as f:
        f.write(uploaded[fname])
    print(f"Saved to {TRAIN_DATA}")

# ── Tokenise into fixed-length blocks ─────────────────────────────────────────
def build_dataset(tokenizer, file_path: str, block_size: int):
    """Read a text file and tokenise it into fixed-length training examples.

    The file is split on blank lines (two consecutive newlines). Every
    non-empty chunk becomes one example, which is truncated or padded to
    exactly ``block_size`` tokens.

    Args:
        tokenizer: Callable tokenizer invoked with a list of strings and the
            ``truncation``, ``max_length`` and ``padding`` keyword arguments.
            NOTE(review): ``padding="max_length"`` presumably requires a pad
            token to be configured on the tokenizer — confirm at the call site.
        file_path: Path of the UTF-8 text file to load.
        block_size: Token length of every produced example; must be positive.

    Returns:
        A ``Dataset`` in torch format holding the tokenizer's output columns
        plus a ``labels`` column that is a copy of ``input_ids``.

    Raises:
        ValueError: If ``block_size`` is not positive, or if the file contains
            no non-empty example.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    print(f"\nLoading: {file_path}")
    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # One example per blank-line-separated paragraph; whitespace-only chunks are dropped.
    examples = [e.strip() for e in raw_text.split("\n\n") if e.strip()]
    print(f"Raw examples : {len(examples):,}")

    # Fail here with a clear message rather than later, when an empty dataset
    # is split or handed to the trainer.
    if not examples:
        raise ValueError(f"No training examples found in {file_path!r} — the file is empty or whitespace-only.")

    def tokenize(batch):
        """Tokenise one batch of texts to fixed length and attach labels."""
        tokens = tokenizer(
            batch["text"],
            truncation=True,
            max_length=block_size,
            padding="max_length",
        )
        # Causal-LM objective: the targets are the inputs themselves.
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    ds = Dataset.from_dict({"text": examples})
    ds = ds.map(tokenize, batched=True, remove_columns=["text"], desc="Tokenising")
    ds.set_format(type="torch")
    return ds

## 5 · Build Model and Tokeniser

In [None]:
# Make sure the output folder exists before anything is written into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ── Tokeniser ─────────────────────────────────────────────────────────────────
print(f"Loading tokeniser from  {BASE_MODEL}  …")
tokenizer = GPT2TokenizerFast.from_pretrained(BASE_MODEL)

# Pad with the end-of-sequence token, then register the document and speaker markers.
tokenizer.pad_token = tokenizer.eos_token
extra_tokens = ["<|startoftext|>", "<|endoftext|>", "[BIDEN]:", "[TRUMP]:"]
tokenizer.add_special_tokens({"additional_special_tokens": extra_tokens})

# Store the extended tokeniser in the output folder.
tokenizer.save_pretrained(OUTPUT_DIR)

# ── Model ─────────────────────────────────────────────────────────────────────
print(f"Loading model  {BASE_MODEL}  …")
model = GPT2LMHeadModel.from_pretrained(BASE_MODEL)

# Resize the embedding table to the tokeniser's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model.to(DEVICE)

# Parameter count, reported in millions.
param_count = sum(weights.numel() for weights in model.parameters())
total_params = param_count / 1e6
print(f"Model parameters : {total_params:.0f} M")
print(f"Vocab size       : {len(tokenizer)}")

## 6 · Training Loop

Runs via HuggingFace `Trainer`. Estimated time on a **free T4**: ~10–20 min total.

In [None]:
# Tokenise the corpus and hold out 5 % of the examples for evaluation.
dataset  = build_dataset(tokenizer, TRAIN_DATA, BLOCK_SIZE)
split    = dataset.train_test_split(test_size=0.05, seed=42)
train_ds = split["train"]
eval_ds  = split["test"]
print(f"Train : {len(train_ds):,}  |  Eval : {len(eval_ds):,}")

# Auto-compute logging_steps so we always get at least one log per epoch
steps_per_epoch = max(1, len(train_ds) // (BATCH_SIZE * GRAD_ACCUM))
log_every       = max(1, steps_per_epoch // 4)   # ~4 logs per epoch
print(f"Steps/epoch: {steps_per_epoch}  →  logging every {log_every} steps")

training_args = TrainingArguments(
    output_dir                  = OUTPUT_DIR,
    num_train_epochs            = EPOCHS,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size  = BATCH_SIZE,
    gradient_accumulation_steps = GRAD_ACCUM,
    learning_rate               = LEARN_RATE,
    weight_decay                = 0.01,
    warmup_steps                = 16,
    lr_scheduler_type           = "cosine",
    eval_strategy               = "epoch",
    save_strategy               = "epoch",
    # Keep only the newest checkpoints instead of one per epoch; with
    # load_best_model_at_end the best checkpoint is retained as well. Without
    # a limit, EPOCHS full checkpoints accumulate in OUTPUT_DIR on Drive.
    save_total_limit            = 2,
    load_best_model_at_end      = True,
    metric_for_best_model       = "eval_loss",
    greater_is_better           = False,
    fp16                        = USE_FP16 and torch.cuda.is_available(),   # mixed precision only on GPU
    dataloader_pin_memory       = torch.cuda.is_available(),
    logging_steps               = log_every,
    report_to                   = "none",
    seed                        = 42,
)

trainer = Trainer(
    model         = model,
    args          = training_args,
    train_dataset = train_ds,
    eval_dataset  = eval_ds,
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Resume from the most recent checkpoint-* folder inside OUTPUT_DIR.
# OUTPUT_DIR itself is not a checkpoint, so passing it directly would not
# restore the optimizer/scheduler/trainer state.
resume_ckpt = None
if RESUME:
    from transformers.trainer_utils import get_last_checkpoint

    if os.path.isdir(OUTPUT_DIR):
        resume_ckpt = get_last_checkpoint(OUTPUT_DIR)   # None when no checkpoint exists
    if resume_ckpt is None:
        print(f"RESUME is set but no checkpoint was found in {OUTPUT_DIR} — training from scratch.")
    else:
        print(f"Resuming from checkpoint: {resume_ckpt}")

print("\nStarting training …")
trainer.train(resume_from_checkpoint=resume_ckpt)
print("Training complete.")

## 7 · Evaluate Model Performance

Plot the train/eval loss curves logged during training and run a quick perplexity check.

In [None]:
import math
import matplotlib.pyplot as plt

# ── Loss curves ───────────────────────────────────────────────────────────────
# Training-loss records carry a "loss" key, evaluation records an "eval_loss"
# key. Both are plotted against the global step on ONE x-axis so the curves
# line up; a separate epoch axis would scale independently of the step axis.
log_history = trainer.state.log_history

train_logs = [e for e in log_history if "loss" in e]
eval_logs  = [e for e in log_history if "eval_loss" in e]

train_steps  = [e["step"] for e in train_logs]
train_losses = [e["loss"] for e in train_logs]
eval_steps   = [e["step"] for e in eval_logs]
eval_epochs  = [e["epoch"] for e in eval_logs]
eval_losses  = [e["eval_loss"] for e in eval_logs]

fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(train_steps, train_losses, label="Train loss", alpha=0.8)
if eval_losses:
    ax.plot(eval_steps, eval_losses, "o--", color="tomato", label="Eval loss")
    # Tag every evaluation point with the epoch it was recorded at.
    for step, epoch, loss in zip(eval_steps, eval_epochs, eval_losses):
        ax.annotate(
            f"ep {epoch:g}",
            (step, loss),
            textcoords="offset points",
            xytext=(0, 6),
            ha="center",
            fontsize=8,
        )
ax.set_xlabel("Step")
ax.set_ylabel("Loss")
ax.set_title("Trump GPT-2 — Training Loss")
ax.legend(loc="upper right")
plt.tight_layout()
plt.show()

# ── Final perplexity ──────────────────────────────────────────────────────────
# Perplexity is the exponential of the evaluation loss.
final_eval = trainer.evaluate()
perplexity = math.exp(final_eval["eval_loss"])
print(f"\nFinal eval loss : {final_eval['eval_loss']:.4f}")
print(f"Perplexity      : {perplexity:.2f}")

## 8 · Save and Export Model

The `Trainer` already saves checkpoints to Drive after every epoch.  
This cell does a final explicit save, lists the saved files with their sizes so you can confirm everything was written, and generates a short sample as a sanity check.

In [None]:
# ── Final save ────────────────────────────────────────────────────────────────
# Write the model and the tokeniser side by side into OUTPUT_DIR.
print(f"Saving model to {OUTPUT_DIR} …")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# ── Verify saved files ────────────────────────────────────────────────────────
# List what is in OUTPUT_DIR. Sizes are shown for regular files only; a
# directory's own getsize() value says nothing about its contents.
saved_files = os.listdir(OUTPUT_DIR)
print("\nSaved files:")
for name in sorted(saved_files):
    path = os.path.join(OUTPUT_DIR, name)
    if os.path.isdir(path):
        print(f"  {name + '/':<40} (directory)")
    else:
        size = os.path.getsize(path) / 1e6
        print(f"  {name:<40} {size:.1f} MB")

# ── Quick sample generation (sanity check) ────────────────────────────────────
print("\n── Sample generation ──────────────────────────────────────")
# Put the model in inference mode (dropout off) no matter which cells ran before.
model.eval()
prompt = "[TRUMP]: Nobody knows more about this than me —"
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=80,
        do_sample=True,
        temperature=0.85,
        top_p=0.92,
        repetition_penalty=1.3,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
print("\n[DONE] Trump model saved to:", OUTPUT_DIR)

## 9 · Download Model to Local Machine

The model folder contains several files (`model.safetensors`, `config.json`, tokenizer files, etc.).  
This cell zips the entire folder into **one file** and downloads it straight to your browser's download folder.

After downloading, extract it into `models/trump/` inside your local project so `start_debate.py` can find it automatically.

In [None]:
import os
import zipfile
from google.colab import files

# Archive location on the Colab VM's local disk.
ZIP_PATH = "/content/trump_model.zip"

print(f"Zipping {OUTPUT_DIR} → {ZIP_PATH} …")
with zipfile.ZipFile(ZIP_PATH, "w", compression=zipfile.ZIP_DEFLATED) as archive:
    for entry in os.listdir(OUTPUT_DIR):
        entry_path = os.path.join(OUTPUT_DIR, entry)
        # Only top-level regular files are archived; sub-folders are skipped.
        if not os.path.isfile(entry_path):
            continue
        # Store under the bare filename so the archive has no folder prefix.
        archive.write(entry_path, arcname=entry)
        entry_mb = os.path.getsize(entry_path) / 1e6
        print(f"  + {entry:<45} {entry_mb:.1f} MB")

zip_size_mb = os.path.getsize(ZIP_PATH) / 1e6
print(f"\nZip size : {zip_size_mb:.1f} MB")

# Hand the finished archive to the browser.
print("Downloading …")
files.download(ZIP_PATH)
print("\n[DONE]  Extract the zip into  models/trump/  in your local project.")
print("\n[DONE]  Extract the zip into  models/trump/  in your local project.")