In [3]:
# =========================================
# CPC Section Classifier (A–H) — Fine-tune
# =========================================
from pathlib import Path
import numpy as np
from datasets import load_from_disk
from transformers import (RobertaTokenizerFast, RobertaForSequenceClassification,
                          TrainingArguments, Trainer)
from sklearn.metrics import accuracy_score, f1_score

# ---- CONFIG (edit here) ----
# Filesystem locations: encoded dataset, tokenizer, pretrained weights, output.
ENCODED_DS_DIR = "../data/cpc_cls_encoded_vs8000_len128"  # from Step 2
TOKENIZER_DIR = "../artifacts/patroberta-tokenizers/vs8000"
# your MLM checkpoint dir
PRETRAINED_DIR = "../artifacts/patroberta-mlm-128-simple/checkpoint-21104"
OUT_DIR = "../artifacts/patroberta-cls-cpc-AH-128"

# Label space: one class per section letter A..H, ids assigned in
# alphabetical order (A -> 0, ..., H -> 7).
ID2LABEL = dict(enumerate("ABCDEFGH"))
LABEL2ID = {letter: idx for idx, letter in ID2LABEL.items()}
NUM_LABELS = len(ID2LABEL)  # 8

# keep aligned with encoding for now
MAX_LEN = 128

# Small-GPU friendly knobs.
# Each optimizer step sees PER_DEVICE_TRAIN_BS * GRAD_ACCUM_STEPS = 32
# examples per device.
PER_DEVICE_TRAIN_BS = 16
PER_DEVICE_EVAL_BS = 16
GRAD_ACCUM_STEPS = 2
LR = 2e-4
EPOCHS = 3
FP16 = True

# ---- Load data/tokenizer ----
# Dataset previously saved to disk; it is indexed by split name
# ("train", "validation", "test") further down in this script.
ds = load_from_disk(ENCODED_DS_DIR)

tok = RobertaTokenizerFast.from_pretrained(TOKENIZER_DIR)
# Cap the tokenizer's maximum length at the configured MAX_LEN.
tok.model_max_length = MAX_LEN

# ---- Model ----
# Load the checkpoint weights under a sequence-classification architecture,
# attaching the label maps so the saved config carries the letter names.
model = RobertaForSequenceClassification.from_pretrained(
    PRETRAINED_DIR,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
    num_labels=NUM_LABELS,
)

# Gradient checkpointing recomputes activations during the backward pass
# instead of storing them, lowering memory use at the cost of extra compute.
model.gradient_checkpointing_enable()

# ---- Metrics ----
def compute_metrics(p):
    """Score one evaluation pass for the Trainer.

    Args:
        p: object exposing ``predictions`` (per-class scores, reduced with
            argmax over axis 1) and ``label_ids`` (the reference class ids).

    Returns:
        dict with ``accuracy``, ``macro_f1`` and ``weighted_f1``.
    """
    y_true = p.label_ids
    # The highest-scoring class along axis 1 is the predicted label.
    y_pred = np.argmax(p.predictions, axis=1)

    def _f1(averaging):
        return f1_score(y_true, y_pred, average=averaging)

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    scores["macro_f1"] = _f1("macro")
    scores["weighted_f1"] = _f1("weighted")
    return scores

# ---- Training args ----
# Optimizer steps per epoch = examples / (per-device batch * accumulation).
# NOTE(review): this does not divide by the number of devices, so on a
# multi-GPU run it overestimates steps per epoch — confirm single-device use.
steps_per_epoch = int(np.ceil(len(ds["train"]) / (PER_DEVICE_TRAIN_BS * GRAD_ACCUM_STEPS)))
# Evaluate (and checkpoint) about four times per epoch, log about ten times.
# max(1, ...) guards against a zero interval on very small datasets.
EVAL_STEPS     = max(1, steps_per_epoch // 4)
# Kept equal to EVAL_STEPS so every evaluation has a matching checkpoint,
# which load_best_model_at_end needs in order to restore the best one.
SAVE_STEPS     = EVAL_STEPS
LOGGING_STEPS  = max(1, steps_per_epoch // 10)

args = TrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BS,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    logging_steps=LOGGING_STEPS,
    # Reload the checkpoint with the highest validation macro-F1 when done.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    weight_decay=0.01,
    warmup_ratio=0.06,
    fp16=FP16,
    # Follow the FP16 knob instead of hard-coding True: otherwise turning
    # FP16 off would still force half-precision evaluation, which is not
    # available without CUDA and can shift the reported metrics.
    fp16_full_eval=FP16,
    report_to="none",
)

# ---- Trainer ----
# Wire the model, arguments, data splits and metric callback together.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    processing_class=tok,  # fine here for Trainer
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
)

# Fine-tune; periodic validation and checkpointing follow `args`.
trainer.train()

# Score the model held by the trainer after training, first on the
# validation split (the trainer's default eval_dataset) ...
val_metrics = trainer.evaluate()

# ---- Final test ----
# ... then on the held-out test split.
test_metrics = trainer.evaluate(eval_dataset=ds["test"])

print("Validation:", val_metrics)
print("Test:", test_metrics)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../artifacts/patroberta-mlm-128-simple/checkpoint-21104 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,Macro F1,Weighted F1
1520,0.8832,0.680332,0.773719,0.641293,0.765114
3040,0.6021,0.534589,0.82088,0.778307,0.820732
4560,0.5494,0.499703,0.832902,0.798396,0.832091
6080,0.5053,0.47513,0.841039,0.812932,0.841089
7600,0.4433,0.462438,0.845571,0.817842,0.844926
9120,0.4346,0.442999,0.849454,0.822305,0.84941
10640,0.4392,0.442806,0.853246,0.828579,0.853223
12160,0.4198,0.425564,0.856205,0.832723,0.855877
13680,0.367,0.426188,0.860181,0.836322,0.860589
15200,0.3595,0.424109,0.860366,0.837903,0.860518


Validation: {'eval_loss': 0.4241010248661041, 'eval_accuracy': 0.8603661919733678, 'eval_macro_f1': 0.8379025626897066, 'eval_weighted_f1': 0.860518005406029, 'eval_runtime': 6.0715, 'eval_samples_per_second': 1781.107, 'eval_steps_per_second': 111.34, 'epoch': 3.0}
Test: {'eval_loss': 0.419538676738739, 'eval_accuracy': 0.8645274643980025, 'eval_macro_f1': 0.8390349748177244, 'eval_weighted_f1': 0.864790306295978, 'eval_runtime': 9.3516, 'eval_samples_per_second': 1156.381, 'eval_steps_per_second': 72.287, 'epoch': 3.0}
