# Fine-Tuning LLMs for a Domain-Specific Classification Task

This notebook is the **runnable code deliverable** from the slide deck.

## What we will do

- Load a **biomedical domain dataset** from Hugging Face: `OxAISH-AL-LLM/pubmed_20k_rct`
- Train a strong classical baseline: **TF-IDF + Logistic Regression**
- Fine-tune a Transformer encoder: **PubMedBERT** (biomedical BERT)
- Evaluate both on a held-out **test** split using **Accuracy** and **Macro F1**
- Save outputs to:
  - `artifacts/metrics.json`
  - `artifacts/examples.json`

## Notes

- If you are on **CPU**, the notebook automatically subsamples the dataset so it finishes in reasonable time.


In [None]:
# Colab bootstrap: make sure the working directory is a checkout of the repo.
# Outside Colab this cell only prints the current working directory.
import os
import subprocess
import sys
from pathlib import Path

IN_COLAB = "google.colab" in sys.modules

# Edit these two values before running in Colab.
REPO_URL = "https://github.com/<YOUR_USERNAME>/<YOUR_REPO>.git"
REPO_DIR = "<YOUR_REPO>"

# A visible src/ folder means we are already inside the repo, so nothing to do.
if IN_COLAB and not (Path.cwd() / "src").exists():
    os.chdir("/content")
    if not Path(REPO_DIR).exists():
        # Fail early with a clear message instead of letting git choke on the
        # template placeholders.
        if "<YOUR_" in REPO_URL or "<YOUR_" in REPO_DIR:
            raise RuntimeError(
                "REPO_URL / REPO_DIR still contain placeholders; set them to your repository first."
            )
        # Clone explicitly into REPO_DIR so the chdir below always matches the
        # clone target, and surface git's output in the notebook.
        clone = subprocess.run(
            ["git", "clone", REPO_URL, REPO_DIR],
            capture_output=True,
            text=True,
        )
        print(clone.stdout + clone.stderr, end="")
        if clone.returncode != 0:
            raise RuntimeError(f"git clone failed with exit code {clone.returncode}")
    os.chdir(REPO_DIR)

print("cwd:", Path.cwd())



In [None]:
import json
import logging
import re
import sys
import warnings
from dataclasses import replace
from pathlib import Path

# Repo path setup: when the kernel starts inside notebooks/, the repo root is
# one level up. The root goes first on sys.path so `src` is importable.
ROOT = Path.cwd().resolve()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent
sys.path.insert(0, str(ROOT))

if not (ROOT / "src").exists():
    raise RuntimeError("src/ folder not found. in colab you need to clone the repo first.")

# Colab install: install the repo requirements into the running kernel.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    import subprocess

    req_path = ROOT / "requirements.txt"
    if not req_path.exists():
        raise RuntimeError(f"requirements.txt not found: {req_path}")

    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-U", "pip"])

    # Don't reinstall torch in Colab. Only the torch family that Colab ships
    # preinstalled is skipped; torchvision/torchaudio are version-locked to
    # torch. Matching on the parsed distribution name (not a bare "torch"
    # prefix) keeps unrelated packages such as torchmetrics in the install list.
    preinstalled = {"torch", "torchvision", "torchaudio"}
    filtered = []
    for line in req_path.read_text(encoding="utf-8").splitlines():
        s = line.strip()
        if not s or s.startswith("#"):
            continue
        # Leading run of name characters = distribution name; option lines
        # (e.g. "-r", "--extra-index-url") do not match and are kept.
        name_match = re.match(r"[A-Za-z0-9][A-Za-z0-9._-]*", s)
        if name_match:
            dist_name = re.sub(r"[-_.]+", "-", name_match.group(0)).lower()
            if dist_name in preinstalled:
                continue
        filtered.append(line)

    tmp_req = ROOT / "requirements-colab.txt"
    tmp_req.write_text("\n".join(filtered) + "\n", encoding="utf-8")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-r", str(tmp_req)])

# Hide the optional hub auth warning (token is optional for public
# models/datasets). `message` is a regular expression matched against the start
# of the warning text, so the trailing "*" only makes the final "t" repeatable.
warnings.filterwarnings(
    "ignore",
    message="The secret `HF_TOKEN` does not exist*",
    category=UserWarning,
)
logging.getLogger("huggingface_hub.utils._http").setLevel(logging.ERROR)

# Imported after the optional install step above so freshly installed packages
# are available.
import torch

from src.baseline import train_tfidf_logreg_baseline
from src.config import TrainConfig
from src.data import dataset_preview, load_hf_classification_dataset
from src.finetune import finetune_distilbert_classifier
from src.reporting import write_json

# Outputs are saved here.
ARTIFACTS = ROOT / "artifacts"
ARTIFACTS.mkdir(parents=True, exist_ok=True)

cfg = TrainConfig()

# CPU run: cap every split so the notebook finishes in reasonable time.
if not torch.cuda.is_available():
    cfg = replace(cfg, max_train_samples=2000, max_eval_samples=1000, max_test_samples=1000)

print("CUDA available:", torch.cuda.is_available())
print("Config:", cfg)


In [None]:
# Load the dataset splits; every loader argument comes straight from cfg.
loader_kwargs = {
    "dataset_name": cfg.dataset_name,
    "dataset_config": cfg.dataset_config,
    "text_field": cfg.text_field,
    "label_field": cfg.label_field,
    "max_train_samples": cfg.max_train_samples,
    "max_eval_samples": cfg.max_eval_samples,
    "max_test_samples": cfg.max_test_samples,
    "seed": cfg.seed,
}
ds, text_field, id2label = load_hf_classification_dataset(**loader_kwargs)

print('Resolved text field:', text_field)
print('Labels:', id2label)
split_sizes = {split_name: len(split) for split_name, split in ds.items()}
print('Sizes:', split_sizes)

# Show at most the first 2000 characters of the JSON-serialised preview.
preview = dataset_preview(ds, text_field=text_field, label_field=cfg.label_field, k=2)
preview_json = json.dumps(preview, indent=2)
print(preview_json[:2000])


In [None]:
# Classical baseline (TF-IDF + Logistic Regression) trained on the loaded splits.
baseline_kwargs = {
    "text_field": text_field,
    "label_field": cfg.label_field,
    "max_features": cfg.baseline_max_features,
    "seed": cfg.seed,
}
baseline_result, baseline_metrics = train_tfidf_logreg_baseline(ds, **baseline_kwargs)

print("Baseline (test):", baseline_result)
print("Baseline metrics:", baseline_metrics)


In [None]:
# Fine-tune the transformer classifier.
# Each name below is both a cfg attribute and a keyword argument of the
# fine-tuning helper, so the values are forwarded one-to-one.
finetune_cfg_fields = (
    "label_field",
    "model_name",
    "max_length",
    "output_dir",
    "num_train_epochs",
    "per_device_train_batch_size",
    "per_device_eval_batch_size",
    "learning_rate",
    "weight_decay",
    "warmup_ratio",
    "logging_steps",
    "eval_strategy",
    "save_strategy",
    "seed",
    "gradient_accumulation_steps",
    "early_stopping_patience",
    "early_stopping_threshold",
)
finetune_kwargs = {field_name: getattr(cfg, field_name) for field_name in finetune_cfg_fields}

# text_field and id2label come from the dataset-loading cell, not from cfg.
ft_result, ft_metrics = finetune_distilbert_classifier(
    ds,
    text_field=text_field,
    id2label=id2label,
    **finetune_kwargs,
)

print("Fine-tuned (test):", ft_result)
print("Fine-tuned metrics:", ft_metrics)


In [None]:
# Write dataset info plus baseline and fine-tune metrics into one JSON file.
metrics_path = ARTIFACTS / "metrics.json"

dataset_summary = {
    "name": cfg.dataset_name,
    "config": cfg.dataset_config,
    "text_field": text_field,
    "splits": {split_name: len(split) for split_name, split in ds.items()},
}
# Headline test scores first; the extra metric dicts are merged in after them.
baseline_summary = {
    "test_accuracy": baseline_result.accuracy,
    "test_macro_f1": baseline_result.macro_f1,
    **baseline_metrics,
}
finetune_summary = {
    "model_dir": str(ft_result.model_dir),
    "test_accuracy": ft_result.test_accuracy,
    "test_macro_f1": ft_result.test_macro_f1,
    **ft_metrics,
}
combined_metrics = {
    "dataset": dataset_summary,
    "baseline": baseline_summary,
    "finetune": finetune_summary,
}

write_json(metrics_path, combined_metrics)
print("Wrote:", metrics_path)

# Echo at most the first 1500 characters of the written file.
written_metrics = metrics_path.read_text(encoding="utf-8")
print(written_metrics[:1500])


In [None]:
# Reload the tokenizer and classifier from the directory recorded in ft_result,
# ready for producing a few example predictions.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_dir = Path(ft_result.model_dir)
tok = AutoTokenizer.from_pretrained(model_dir)
# eval() puts the module in evaluation mode and returns the module itself.
mdl = AutoModelForSequenceClassification.from_pretrained(model_dir).eval()

def predict_one(text: str) -> int:
    """Return the predicted class id for a single text.

    Tokenises ``text`` with the notebook-level ``tok`` (truncating to
    ``cfg.max_length``), runs ``mdl`` with gradient tracking disabled, and
    returns the argmax index over the last logits dimension as a plain int.
    """
    encoded = tok(text, return_tensors="pt", truncation=True, max_length=cfg.max_length)
    with torch.no_grad():
        logits = mdl(**encoded).logits
    predicted = logits.argmax(dim=-1)
    return int(predicted.item())

# Collect predictions for the first few test rows and save them as JSON.
NUM_EXAMPLES = 12  # upper bound on how many test rows are written out

test_split = ds["test"]
examples = []
for i in range(min(NUM_EXAMPLES, len(test_split))):
    # Fetch the row once and read both fields from it, rather than indexing
    # the split separately for text and label.
    row = test_split[i]
    text = row[text_field]
    true = int(row[cfg.label_field])
    pred = predict_one(text)
    examples.append(
        {
            "text": text,
            "true_id": true,
            "pred_id": pred,
            # Label names are only filled in when an id2label mapping exists.
            "true_label": id2label.get(true) if id2label else None,
            "pred_label": id2label.get(pred) if id2label else None,
            "correct": bool(true == pred),
        }
    )

examples_path = ARTIFACTS / "examples.json"
write_json(examples_path, {"examples": examples})
print("Wrote:", examples_path)
# Echo at most the first 1500 characters of the written file.
print(examples_path.read_text(encoding="utf-8")[:1500])

# Run the repo's follow-up scripts (checks, curves, error analysis, report) in
# order with the kernel's interpreter. check=True raises on the first script
# that exits non-zero, so later scripts are not run after a failure.
import subprocess

followup_scripts = (
    "scripts/dataset_checks.py",
    "scripts/plot_training_curves.py",
    "scripts/error_analysis.py",
    "scripts/generate_evaluation_report.py",
)
for script_rel in followup_scripts:
    script_cmd = [sys.executable, str(ROOT / script_rel)]
    subprocess.run(script_cmd, check=True)

print("done")
