In [4]:
# ================================================
# Baseline: DistilRoBERTa fine-tune on CPC (A–H)
# ================================================
from pathlib import Path
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (DistilBertTokenizerFast, RobertaTokenizerFast,
                          AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import json

# ---- config ----
# Input/output locations (Kaggle paths).
CSV_DIR = Path("/kaggle/input/cpc-csv")
OUT_METRICS = Path("/kaggle/working/baseline_distilroberta-512.json")

# Checkpoint to fine-tune and the training hyper-parameters.
MODEL_NAME = "distilroberta-base"
MAX_LEN = 512      # tokenizer truncation length
BATCH_SIZE = 64    # per-device batch size for both training and evaluation
EPOCHS = 3
LR = 2e-5

# Class labels are the single letters A..H; a label's id is its position in LABELS.
LABELS = [*"ABCDEFGH"]
label2id = {letter: idx for idx, letter in enumerate(LABELS)}
id2label = dict(enumerate(LABELS))

# ---- load CSVs into HF datasets ----
def load_split(name):
    """Load one split CSV and return it as a Hugging Face ``Dataset``.

    Reads ``CSV_DIR / f"cpc_cls_{name}.csv"``, converts the string ``label``
    column to integer class ids through ``label2id`` and keeps only the
    ``text`` and ``labels`` columns.

    Args:
        name: Split suffix used in the file name (this script passes
            "train", "val" and "test").

    Returns:
        A ``Dataset`` with a ``text`` column and an int64 ``labels`` column.

    Raises:
        ValueError: If any row's label is missing or not a key of
            ``label2id``.
    """
    csv_path = CSV_DIR / f"cpc_cls_{name}.csv"
    df = pd.read_csv(csv_path)

    # Series.map yields NaN for values that are not dict keys; left unchecked
    # that silently turns the label column into floats, so fail loudly here.
    mapped = df["label"].map(label2id)
    unmapped = mapped.isna()
    if unmapped.any():
        bad_values = sorted(df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"{csv_path}: {int(unmapped.sum())} row(s) have labels outside "
            f"{LABELS}: {bad_values}"
        )

    df["labels"] = mapped.astype("int64")
    return Dataset.from_pandas(df[["text", "labels"]], preserve_index=False)

# Split name used by the trainer -> suffix used in the CSV file names.
ds = DatasetDict({
    split: load_split(file_suffix)
    for split, file_suffix in (
        ("train", "train"),
        ("validation", "val"),
        ("test", "test"),
    )
})

# ---- tokenize ----
tok = AutoTokenizer.from_pretrained(MODEL_NAME)


def enc(batch):
    """Tokenize the ``text`` entries of a batch, truncating to ``MAX_LEN`` tokens.

    No padding is requested here, so sequences keep their own lengths
    (presumably padded later, per batch, by the Trainer's collator - confirm).
    """
    texts = batch["text"]
    return tok(texts, max_length=MAX_LEN, truncation=True)


# batched=True hands `enc` many rows per call instead of one row at a time.
ds = ds.map(enc, batched=True)

# ---- model ----
# Sequence-classification model with one output per entry in LABELS; the
# id<->label maps are passed along so they are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(LABELS), label2id=label2id, id2label=id2label
)

# ---- metrics ----
def compute_metrics(p):
    """Compute accuracy, macro-F1 and weighted-F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, with classes
            along axis 1) and ``label_ids`` (the reference class ids).

    Returns:
        Dict with the keys ``accuracy``, ``macro_f1`` and ``weighted_f1``.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)

    acc = accuracy_score(y_true, y_pred)
    macro = f1_score(y_true, y_pred, average="macro")
    weighted = f1_score(y_true, y_pred, average="weighted")
    return {"accuracy": acc, "macro_f1": macro, "weighted_f1": weighted}

# ---- train ----
args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="/kaggle/working",
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # NOTE(review): mixed precision is hard-coded on; presumably this needs a
    # GPU runtime - confirm before running on CPU-only hardware.
    fp16=True,
    # Evaluate, log and checkpoint once per epoch so every checkpoint has a
    # matching validation score.
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Keep the checkpoint with the highest "macro_f1" from compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
)

# Wire the model, the tokenized train/validation splits and the metric
# function into a Trainer, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    processing_class=tok,
    compute_metrics=compute_metrics,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
)

trainer.train()

# ---- evaluate & save ----
# Score the trained model on validation first, then test; results are stored
# under the short keys "val" and "test".
metrics = {
    key: trainer.evaluate(ds[split])
    for key, split in (("val", "validation"), ("test", "test"))
}

# Serialise once and reuse the same text for the file and for stdout.
metrics_json = json.dumps(metrics, indent=2)
OUT_METRICS.parent.mkdir(parents=True, exist_ok=True)
OUT_METRICS.write_text(metrics_json)
print(metrics_json)


Map:   0%|          | 0/194656 [00:00<?, ? examples/s]

Map:   0%|          | 0/10814 [00:00<?, ? examples/s]

Map:   0%|          | 0/10814 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Weighted F1
1,0.5474,0.360738,0.880895,0.862315,0.881228
2,0.3457,0.321302,0.894396,0.881132,0.894296
3,0.2867,0.315601,0.896615,0.884094,0.896666






{
  "val": {
    "eval_loss": 0.3156011998653412,
    "eval_accuracy": 0.8966154984279637,
    "eval_macro_f1": 0.8840941387207217,
    "eval_weighted_f1": 0.8966655528326992,
    "eval_runtime": 86.2595,
    "eval_samples_per_second": 125.366,
    "eval_steps_per_second": 0.985,
    "epoch": 3.0
  },
  "test": {
    "eval_loss": 0.3366488218307495,
    "eval_accuracy": 0.8917144442389495,
    "eval_macro_f1": 0.8762084060294124,
    "eval_weighted_f1": 0.891731866323502,
    "eval_runtime": 86.3405,
    "eval_samples_per_second": 125.248,
    "eval_steps_per_second": 0.984,
    "epoch": 3.0
  }
}
