In [1]:
# Cell 1 â€” Setup + Dataset class
%run ./00_config.ipynb

import os, json, time
import numpy as np
import torch
from torch.utils.data import Dataset

# Mirror of train.py's ToxicDataset
class ToxicDataset(Dataset):
    def __init__(self, df, tokenizer, max_len, is_test=False):
        self.texts = df["comment"].tolist()
        self.is_test = is_test
        self.max_len = max_len
        self.tokenizer = tokenizer
        # pick only the labels present in df, in cfg.labels order
        if not is_test:
            keep = [c for c in cfg.labels if c in df.columns]
            self.labels = df[keep].values.astype("float32")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        enc = self.tokenizer(
            self.texts[i],
            truncation=True,
            padding="max_length",
            max_length=cfg.train.max_len,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        if not self.is_test:
            item["labels"] = torch.from_numpy(self.labels[i])
        return item

print("âœ… ToxicDataset ready.")


Python version: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
CUDA available: False
Running on CPU
../../data/train_data.csv
microsoft/mdeberta-v3-base
âœ… Config loaded and random seed set to: 42
ðŸ“‚ Model directory: ../models/best
ðŸ“‚ Reports directory: ../reports
âœ… Folder setup complete.
âœ… Found: ..\..\data\train_data.csv
âœ… Found: ..\..\data\test_data.csv

All required data files are present and accessible.
âœ… Configuration snapshot saved at:
../reports\config_snapshot.json
âœ… ToxicDataset ready.


In [2]:
# Cell 2 â€” Helpers

import os, json

def _ensure_dirs(report_dir, model_dir):
    os.makedirs(report_dir, exist_ok=True)
    os.makedirs(os.path.join(report_dir, "figs"), exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

def _plot_curves(report_dir, train_losses, val_aucs):
    import matplotlib.pyplot as plt

    # Train loss
    plt.figure()
    plt.plot(range(1, len(train_losses) + 1), train_losses, marker="o")
    plt.xlabel("Epoch"); plt.ylabel("Train Loss"); plt.title("Train Loss"); plt.grid(True, alpha=0.3)
    plt.savefig(os.path.join(report_dir, "figs", "train_loss.png"), bbox_inches="tight")
    plt.close()

    # Val macro AUC
    plt.figure()
    plt.plot(range(1, len(val_aucs) + 1), val_aucs, marker="o")
    plt.xlabel("Epoch"); plt.ylabel("Val Macro AUC"); plt.title("Validation Macro AUC"); plt.grid(True, alpha=0.3)
    plt.savefig(os.path.join(report_dir, "figs", "val_macro_auc.png"), bbox_inches="tight")
    plt.close()

def _append_run_summary(report_dir, row: dict):
    import csv, datetime, hashlib
    path = os.path.join(report_dir, "run_summary.csv")
    row = row.copy()
    row["timestamp"] = datetime.datetime.utcnow().isoformat() + "Z"
    conf_str = json.dumps(row.get("config_snapshot", {}), sort_keys=True)
    row["config_hash"] = hashlib.md5(conf_str.encode()).hexdigest()[:8]
    row.pop("config_snapshot", None)
    header = ["timestamp","run_id","config_hash","model_name","max_len","batch_size","lr",
              "epochs","patience","macro_auc","best_epoch","model_dir"]
    exists = os.path.exists(path)
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=header)
        if not exists:
            w.writeheader()
        w.writerow({k: row.get(k, "") for k in header})

print("âœ… Helpers ready.")


âœ… Helpers ready.


In [3]:
# Cell 3 â€” Wire up loaders, model, loss, optimizer, scheduler, accelerator

import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, get_linear_schedule_with_warmup
from accelerate import Accelerator
from sklearn.model_selection import train_test_split
import pandas as pd
import re

# 0) Ensure we have train_df/val_df; fallback to quick rebuild if this kernel is fresh
try:
    _ = train_df, val_df
    print("Using train/val already in memory from 01_data.ipynb.")
except NameError:
    print("Rebuilding splits quickly (01_data not in memory).")
    # minimal cleaner (same as 01_data)
    _HTML = re.compile(r"<.*?>"); _URL = re.compile(r"http\S+|www\.\S+")
    _NONASCII = re.compile(r"[^\x00-\x7F]+"); _WS = re.compile(r"\s+")
    def clean_text(t: str) -> str:
        if not isinstance(t, str): return ""
        t = t.lower(); t = _URL.sub(" ", t); t = _HTML.sub(" ", t); t = _NONASCII.sub(" ", t); t = _WS.sub(" ", t).strip()
        return t

    raw = pd.read_csv(cfg.paths.raw_train)
    text_col = "comment_text" if "comment_text" in raw.columns else ("comment" if "comment" in raw.columns else None)
    assert text_col is not None, "Expected 'comment_text' or 'comment' in train CSV."
    raw["comment"] = raw[text_col].apply(clean_text)
    label_cols = [c for c in cfg.labels if c in raw.columns]
    strat = raw["toxic"] if "toxic" in raw.columns else None
    train_df, val_df = train_test_split(raw, test_size=0.2, random_state=cfg.train.seed, stratify=strat)
    keep = ["comment"] + label_cols
    train_df = train_df[keep].reset_index(drop=True)
    val_df   = val_df[keep].reset_index(drop=True)

label_cols = [c for c in cfg.labels if c in train_df.columns]
assert len(label_cols) == len(cfg.labels), f"Expected all labels {cfg.labels}, found {label_cols}"

# 1) Tokenizer & collator
tokenizer = AutoTokenizer.from_pretrained(cfg.train.model_name)
collate = DataCollatorWithPadding(tokenizer)

# 2) Datasets & loaders (reuse ToxicDataset from Cell 1)
tset = ToxicDataset(train_df, tokenizer, cfg.train.max_len)
vset = ToxicDataset(val_df, tokenizer, cfg.train.max_len)

tl = DataLoader(tset, batch_size=cvfg.train.batch_size if 'cvfg' in globals() else cfg.train.batch_size,
                shuffle=True, num_workers=2, collate_fn=collate, pin_memory=True)
vl = DataLoader(vset, batch_size=cfg.train.batch_size, shuffle=False, num_workers=2, collate_fn=collate, pin_memory=True)

# 3) Model
model = AutoModelForSequenceClassification.from_pretrained(
    cfg.train.model_name,
    num_labels=len(cfg.labels),
    problem_type="multi_label_classification"
)

# 4) Weighted BCE loss (compute label frequencies from train_df)
freq = (train_df[label_cols].sum(axis=0).values / len(train_df))
weights = 1.0 / (freq + 1e-6); weights = weights / weights.sum() * len(freq)
weights_t = torch.tensor(weights, dtype=torch.float32)

def loss_fn(logits, targets):
    return torch.nn.functional.binary_cross_entropy_with_logits(
        logits, targets, weight=weights_t.to(logits.device)
    )

# 5) Optimizer & scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.train.lr, weight_decay=cfg.train.weight_decay)
total_steps = len(tl) * cfg.train.epochs
scheduler = get_linear_schedule_with_warmup(optimizer, int(cfg.train.warmup_ratio * total_steps), total_steps)

# 6) Accelerator
accelerator = Accelerator(mixed_precision="fp16")
tl, vl, model, optimizer, scheduler = accelerator.prepare(tl, vl, model, optimizer, scheduler)

print(f"âœ… Setup complete | device processes: {accelerator.num_processes}")
print(f"Train batches/epoch: {len(tl)} | Val batches/epoch: {len(vl)}")
print(f"Effective batch size: {cfg.train.batch_size * accelerator.num_processes}")


  from .autonotebook import tqdm as notebook_tqdm


Rebuilding splits quickly (01_data not in memory).


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


âœ… Setup complete | device processes: 1
Train batches/epoch: 7979 | Val batches/epoch: 1995
Effective batch size: 16


In [None]:
# full tune
# Cell 4 â€” Train + Validate + Save best

import os, json, time
import numpy as np
import torch
from sklearn.metrics import roc_auc_score

# Ensure output dirs exist
report_dir = cfg.paths.reports_dir
model_dir  = cfg.paths.model_dir
_ensure_dirs(report_dir, model_dir)

best_auc, best_epoch, patience_ctr = 0.0, -1, 0
train_losses, val_macro_aucs = [], []
run_id = f"run_{int(time.time())}"
MIN_DELTA = 5e-4

for epoch in range(cfg.train.epochs):
    # ---- Train
    model.train()
    ep_loss = 0.0
    for batch in tl:
        outputs = model(**{k: v for k, v in batch.items() if k != "labels"})
        loss = loss_fn(outputs.logits, batch["labels"])
        accelerator.backward(loss)
        accelerator.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step(); scheduler.step(); optimizer.zero_grad()
        ep_loss += loss.item()
    ep_loss /= max(1, len(tl))
    train_losses.append(ep_loss)

    # ---- Validate
    model.eval()
    preds, refs = [], []
    with torch.no_grad():
        for batch in vl:
            outputs = model(**{k: v for k, v in batch.items() if k != "labels"})
            probs = torch.sigmoid(outputs.logits)
            preds.append(accelerator.gather(probs).cpu().numpy())
            refs.append(accelerator.gather(batch["labels"]).cpu().numpy())

    preds = np.concatenate(preds)[:len(vset)]
    refs  = np.concatenate(refs)[:len(vset)]

    # Per-label AUC (skip labels missing in this fold)
    per_label_aucs = []
    for i, lab in enumerate(cfg.labels):
        y_true = refs[:, i]
        if len(np.unique(y_true)) < 2:
            continue
        per_label_aucs.append(roc_auc_score(y_true, preds[:, i]))
    macro_auc = float(np.mean(per_label_aucs)) if per_label_aucs else 0.0
    val_macro_aucs.append(macro_auc)

    if accelerator.is_main_process:
        print(f"Epoch {epoch+1}/{cfg.train.epochs} | train_loss={ep_loss:.4f} | val_macro_auc={macro_auc:.4f}")

        # Save best
        if macro_auc > best_auc + MIN_DELTA:
            best_auc, best_epoch = macro_auc, epoch + 1
            patience_ctr = 0
            unwrapped = accelerator.unwrap_model(model)
            unwrapped.save_pretrained(model_dir, save_function=accelerator.save)
            tokenizer.save_pretrained(model_dir)

            with open(os.path.join(report_dir, "metrics.json"), "w", encoding="utf-8") as f:
                json.dump({
                    "macro_auc": best_auc,
                    "per_label_auc": {str(i): float(a) for i, a in enumerate(per_label_aucs)},
                    "best_epoch": best_epoch
                }, f, indent=2)

        else:
            patience_ctr += 1

        # Update curves each epoch
        _plot_curves(report_dir, train_losses, val_macro_aucs)

        # Early stopping
        if patience_ctr >= cfg.train.patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    accelerator.wait_for_everyone()

# Append run summary (main process only)
if accelerator.is_main_process:
    from dataclasses import asdict
    _append_run_summary(report_dir, {
        "run_id": run_id,
        "model_name": cfg.train.model_name,
        "max_len": cfg.train.max_len,
        "batch_size": cfg.train.batch_size,
        "lr": cfg.train.lr,
        "epochs": cfg.train.epochs,
        "patience": cfg.train.patience,
        "macro_auc": best_auc,
        "best_epoch": best_epoch,
        "model_dir": model_dir,
        "config_snapshot": {
            "paths": asdict(cfg.paths),
            "train": asdict(cfg.train),
            "labels": list(cfg.labels)
        }
    })

print("\nâœ… Training complete.")
print(f"Best macro AUC: {best_auc:.4f} at epoch {best_epoch}")
print(f"Model saved to: {model_dir}")
print(f"Curves saved under: {os.path.join(report_dir, 'figs')}")




In [5]:
# Demo training: small subset + short seq length + 1 epoch + progress bar
import os, json, time, numpy as np, torch, pandas as pd, re
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, get_linear_schedule_with_warmup
from accelerate import Accelerator

# ---------- demo knobs ----------
DEMO_TRAIN_ROWS = 4000
DEMO_VAL_ROWS   = 1000
DEMO_EPOCHS     = 1
DEMO_MAX_LEN    = 128
BATCH_SIZE      = 8
MIN_DELTA       = 5e-4

# ---------- rebuild a minimal split for speed ----------
_HTML = re.compile(r"<.*?>"); _URL = re.compile(r"http\S+|www\.\S+")
_NONASCII = re.compile(r"[^\x00-\x7F]+"); _WS = re.compile(r"\s+")
def clean_text(t: str) -> str:
    if not isinstance(t, str): return ""
    t = t.lower(); t = _URL.sub(" ", t); t = _HTML.sub(" ", t); t = _NONASCII.sub(" ", t); t = _WS.sub(" ", t).strip()
    return t

raw = pd.read_csv(cfg.paths.raw_train)
text_col = "comment_text" if "comment_text" in raw.columns else ("comment" if "comment" in raw.columns else None)
assert text_col is not None, "Need 'comment_text' or 'comment' in train_data.csv"
raw["comment"] = raw[text_col].apply(clean_text)
label_cols = [c for c in cfg.labels if c in raw.columns]
# stratify on toxic if present
strat = raw["toxic"] if "toxic" in raw.columns else None
# take a small random subset for speed
raw_small = (raw.sample(DEMO_TRAIN_ROWS + DEMO_VAL_ROWS, random_state=cfg.train.seed)
                  if len(raw) > DEMO_TRAIN_ROWS + DEMO_VAL_ROWS else raw.copy())
train_df = raw_small.iloc[:DEMO_TRAIN_ROWS][["comment"] + label_cols].reset_index(drop=True)
val_df   = raw_small.iloc[DEMO_TRAIN_ROWS:DEMO_TRAIN_ROWS+DEMO_VAL_ROWS][["comment"] + label_cols].reset_index(drop=True)

# ---------- tokenizer, datasets, loaders ----------
tokenizer = AutoTokenizer.from_pretrained(cfg.train.model_name)
collate = DataCollatorWithPadding(tokenizer)

class ToxicDatasetDemo(torch.utils.data.Dataset):
    def __init__(self, df, tok, max_len):
        self.texts = df["comment"].tolist()
        self.labels = df[label_cols].values.astype("float32")
        self.tok, self.max_len = tok, max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        enc = self.tok(self.texts[i], truncation=True, padding="max_length", max_length=DEMO_MAX_LEN, return_tensors="pt")
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.from_numpy(self.labels[i])
        return item

tset = ToxicDatasetDemo(train_df, tokenizer, DEMO_MAX_LEN)
vset = ToxicDatasetDemo(val_df, tokenizer, DEMO_MAX_LEN)

tl = DataLoader(tset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=collate)
vl = DataLoader(vset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=collate)

# ---------- model, loss, optim, sched ----------
model = AutoModelForSequenceClassification.from_pretrained(
    cfg.train.model_name, num_labels=len(cfg.labels), problem_type="multi_label_classification"
)
freq = (train_df[label_cols].sum(axis=0).values / len(train_df))
weights = 1.0 / (freq + 1e-6); weights = weights / weights.sum() * len(freq)
w_t = torch.tensor(weights, dtype=torch.float32)

def loss_fn(logits, targets):
    return torch.nn.functional.binary_cross_entropy_with_logits(logits, targets, weight=w_t.to(logits.device))

optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.train.lr, weight_decay=cfg.train.weight_decay)
total_steps = len(tl) * DEMO_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, int(cfg.train.warmup_ratio * total_steps), total_steps)

# CPU-friendly accelerator config
from accelerate import Accelerator

# Reuse existing Accelerator if present; otherwise create one that matches device
try:
    accelerator  # already exists
    print("Reusing existing Accelerator with mixed_precision:",
          getattr(getattr(accelerator, "state", None), "mixed_precision", "unknown"))
except NameError:
    accelerator = Accelerator(mixed_precision="fp16" if torch.cuda.is_available() else "no")

# Prepare using the (reused or newly created) accelerator
tl, vl, model, optimizer, scheduler = accelerator.prepare(tl, vl, model, optimizer, scheduler)

# ---------- train one fast epoch with progress bar ----------
report_dir, model_dir = cfg.paths.reports_dir, cfg.paths.model_dir
_ensure_dirs(report_dir, model_dir)
best_auc, best_epoch, patience_ctr = 0.0, -1, 0
train_losses, val_macro_aucs = [], []
run_id = f"demo_{int(time.time())}"

for epoch in range(DEMO_EPOCHS):
    model.train()
    ep_loss = 0.0
    for batch in tqdm(tl, desc=f"Epoch {epoch+1}/{DEMO_EPOCHS}"):
        outputs = model(**{k:v for k,v in batch.items() if k!="labels"})
        loss = loss_fn(outputs.logits, batch["labels"])
        accelerator.backward(loss)
        optimizer.step(); scheduler.step(); optimizer.zero_grad()
        ep_loss += loss.item()
    ep_loss /= max(1, len(tl))
    train_losses.append(ep_loss)

    # validate
    model.eval(); preds, refs = [], []
    with torch.no_grad():
        for batch in vl:
            outputs = model(**{k:v for k,v in batch.items() if k!="labels"})
            probs = torch.sigmoid(outputs.logits)
            preds.append(accelerator.gather(probs).cpu().numpy())
            refs.append(accelerator.gather(batch["labels"]).cpu().numpy())
    preds = np.concatenate(preds)[:len(vset)]
    refs  = np.concatenate(refs)[:len(vset)]

    per_label_aucs = []
    for i, lab in enumerate(cfg.labels):
        y_true = refs[:, i]
        if len(np.unique(y_true)) < 2: continue
        per_label_aucs.append(roc_auc_score(y_true, preds[:, i]))
    macro_auc = float(np.mean(per_label_aucs)) if per_label_aucs else 0.0
    val_macro_aucs.append(macro_auc)

    if accelerator.is_main_process:
        print(f"Epoch {epoch+1}/{DEMO_EPOCHS} | train_loss={ep_loss:.4f} | val_macro_auc={macro_auc:.4f}")

        # save checkpoint (best = only epoch here)
        unwrapped = accelerator.unwrap_model(model)
        unwrapped.save_pretrained(model_dir, save_function=accelerator.save)
        tokenizer.save_pretrained(model_dir)
        with open(os.path.join(report_dir, "metrics.json"), "w", encoding="utf-8") as f:
            json.dump({"macro_auc": macro_auc, "per_label_auc": {str(i): float(a) for i,a in enumerate(per_label_aucs)}, "best_epoch": epoch+1}, f, indent=2)

        _plot_curves(report_dir, train_losses, val_macro_aucs)

# Append a short demo run summary
if accelerator.is_main_process:
    from dataclasses import asdict
    _append_run_summary(report_dir, {
        "run_id": run_id,
        "model_name": cfg.train.model_name,
        "max_len": DEMO_MAX_LEN,
        "batch_size": BATCH_SIZE,
        "lr": cfg.train.lr,
        "epochs": DEMO_EPOCHS,
        "patience": cfg.train.patience,
        "macro_auc": val_macro_aucs[-1] if val_macro_aucs else 0.0,
        "best_epoch": 1,
        "model_dir": model_dir,
        "config_snapshot": {"paths": asdict(cfg.paths), "train": asdict(cfg.train), "labels": list(cfg.labels)}
    })

print("\nâœ… Demo training complete.")
print(f"Model saved to: {model_dir}")
print(f"Curves in: {os.path.join(report_dir, 'figs')}")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reusing existing Accelerator with mixed_precision: fp16


Epoch 1/1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 500/500 [36:18<00:00,  4.36s/it]


Epoch 1/1 | train_loss=0.0870 | val_macro_auc=0.8034

âœ… Demo training complete.
Model saved to: ../models/best
Curves in: ../reports\figs


  row["timestamp"] = datetime.datetime.utcnow().isoformat() + "Z"


In [6]:
# Verify artifacts from demo training
import os, json

print("Model dir:", cfg.paths.model_dir)
print("Reports :", cfg.paths.reports_dir)

print("\nContents of model dir:")
print(os.listdir(cfg.paths.model_dir) if os.path.exists(cfg.paths.model_dir) else "!! not found")

metrics_path = os.path.join(cfg.paths.reports_dir, "metrics.json")
print("\nmetrics.json exists:", os.path.exists(metrics_path))
if os.path.exists(metrics_path):
    with open(metrics_path, "r", encoding="utf-8") as f:
        m = json.load(f)
    print("metrics:", {k: m.get(k) for k in ["macro_auc", "best_epoch"]})


Model dir: ../models/best
Reports : ../reports

Contents of model dir:
['added_tokens.json', 'config.json', 'model.safetensors', 'special_tokens_map.json', 'spm.model', 'tokenizer.json', 'tokenizer_config.json']

metrics.json exists: True
metrics: {'macro_auc': 0.8034086988331047, 'best_epoch': 1}
