In [None]:
import time

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Columns of the dataset that are used as the multi-label targets.
label_cols = [
    "toxicity",
    "severe_toxicity",
    "obscene",
    "threat",
    "insult",
    "identity_attack",
    "sexual_explicit",
]
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 1
MODEL_NAME = "bert-base-uncased"
SEED = 42
# Fraction of every split that is kept for quicker runs: 0.03 == 3%.
SAMPLE_FRACTION = 0.03

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Dataset
ds = load_dataset("google/civil_comments")
train_ds = ds["train"]
val_ds = ds["validation"]
test_ds = ds.get("test")  # None when the dataset has no "test" split


def _subsample(split, fraction=SAMPLE_FRACTION, seed=SEED):
    """Return a seeded random subset containing ``fraction`` of ``split``'s rows."""
    keep = int(fraction * len(split))
    return split.shuffle(seed=seed).select(range(keep))


# Keep SAMPLE_FRACTION (3%) of each split for quicker runs
train_ds = _subsample(train_ds)
val_ds = _subsample(val_ds)
# Explicit None check, matching how the later cells test for a missing split.
if test_ds is not None:
    test_ds = _subsample(test_ds)

n_test = len(test_ds) if test_ds is not None else 0
print(f"Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {n_test}")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train: 54146, Val: 2919, Test: 2919


In [None]:
def preprocess(batch):
    """Tokenize a batch of comments and attach binarized multi-label targets.

    Args:
        batch: Column-oriented mapping with a ``"text"`` list and one list of
            scores for every name in ``label_cols``.

    Returns:
        The tokenizer output (truncated to ``MAX_LEN``, unpadded) with an
        extra ``"labels"`` entry: one list per row holding ``1.0`` where the
        score is at least 0.5 and ``0.0`` otherwise, ordered like
        ``label_cols``.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,  # padding is left to the collator at batch time
    )
    row_count = len(batch["text"])
    # float(True) == 1.0 and float(False) == 0.0, so the comparison result is
    # the label value itself.
    encoded["labels"] = [
        [float(float(batch[col][row]) >= 0.5) for col in label_cols]
        for row in range(row_count)
    ]
    return encoded


# Tokenize each split with `preprocess`, dropping the original columns so only
# the tokenizer output and the "labels" entry remain.
train_enc, val_enc = (
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)

# The test split is optional; keep None when it is absent.
test_enc = (
    test_ds.map(preprocess, batched=True, remove_columns=test_ds.column_names)
    if test_ds is not None
    else None
)

# Pads each batch at collation time (preprocess tokenizes with padding=False).
collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/2919 [00:00<?, ? examples/s]

In [None]:
# Sequence classifier with one output per entry of `label_cols`, configured
# for multi-label classification and placed on the selected device.
num_labels = len(label_cols)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=num_labels,
)
model = model.to(device)

# Record the index <-> label-name mapping on the model config.
id2label = dict(enumerate(label_cols))
label2id = {name: idx for idx, name in id2label.items()}
model.config.id2label = id2label
model.config.label2id = label2id


def compute_metrics(eval_pred):
    """Compute aggregate and per-label multi-label metrics from logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Both are indexed as
            ``[:, i]``, i.e. 2-D arrays with one column per entry of
            ``label_cols``. If ``logits`` is a tuple, its first element is
            used.

    Returns:
        dict with ``"f1_macro"`` and ``"exact_acc"`` (fraction of rows where
        every label is predicted correctly), followed by ``"<label>_auc"``,
        ``"<label>_f1"`` and ``"<label>_acc"`` for every label. An AUC is NaN
        when the label column holds a single class or ``roc_auc_score``
        rejects the input with ``ValueError``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs next to the logits; keep the logits.
        logits = logits[0]

    # exp() overflowing to inf still yields the correct limit (probability 0),
    # so the overflow warning carries no information here.
    with np.errstate(over="ignore"):
        probs = 1 / (1 + np.exp(-logits))
    preds = (probs >= 0.5).astype(int)

    metrics = {
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "exact_acc": (preds == labels).all(axis=1).mean(),
    }

    # Per-label metrics, flattened with the label name as prefix.
    for i, name in enumerate(label_cols):
        y = labels[:, i]
        p = preds[:, i]
        s = probs[:, i]

        auc = np.nan
        if len(np.unique(y)) > 1:  # AUC is undefined with a single class
            try:
                auc = roc_auc_score(y, s)
            except ValueError:
                # Only invalid-input errors are treated as "undefined";
                # anything else is a real bug and must surface.
                auc = np.nan

        metrics[f"{name}_auc"] = auc
        metrics[f"{name}_f1"] = f1_score(y, p, zero_division=0)
        metrics[f"{name}_acc"] = accuracy_score(y, p)

    return metrics

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best macro-F1.
training_args = TrainingArguments(
    output_dir="./bert_ckpt",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # key returned by compute_metrics
    greater_is_better=True,
    logging_steps=50,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_enc,
    eval_dataset=val_enc,
    # `tokenizer=` is deprecated on Trainer.__init__ and triggers a
    # FutureWarning; `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,Exact Acc,Toxicity Auc,Toxicity F1,Toxicity Acc,Severe Toxicity Auc,Severe Toxicity F1,Severe Toxicity Acc,Obscene Auc,Obscene F1,Obscene Acc,Threat Auc,Threat F1,Threat Acc,Insult Auc,Insult F1,Insult Acc,Identity Attack Auc,Identity Attack F1,Identity Attack Acc,Sexual Explicit Auc,Sexual Explicit F1,Sexual Explicit Acc
1,0.0534,0.042481,0.26654,0.935594,0.94899,0.641509,0.947927,,0.0,1.0,0.930599,0.555556,0.997259,0.860653,0.0,0.997945,0.955639,0.668712,0.963001,0.977699,0.0,0.993148,0.978302,0.0,0.99863


TrainOutput(global_step=3385, training_loss=0.058753080699081084, metrics={'train_runtime': 512.0412, 'train_samples_per_second': 105.745, 'train_steps_per_second': 6.611, 'total_flos': 3529705898473500.0, 'train_loss': 0.058753080699081084, 'epoch': 1.0})

In [None]:
def _probs_and_preds(logits, threshold=0.5):
    """Turn raw logits into sigmoid probabilities and thresholded 0/1 predictions."""
    probs = 1 / (1 + np.exp(-logits))
    return probs, (probs >= threshold).astype(int)


def _label_scores(y_true, y_score, y_pred, prefix):
    """AUC / F1 / accuracy for one label column, keyed as ``<prefix>_<metric>``.

    AUC is NaN when ``y_true`` holds a single class, where it is undefined.
    """
    auc = roc_auc_score(y_true, y_score) if len(np.unique(y_true)) > 1 else np.nan
    return {
        f"{prefix}_auc": auc,
        f"{prefix}_f1": f1_score(y_true, y_pred, zero_division=0),
        f"{prefix}_acc": accuracy_score(y_true, y_pred),
    }


# Validation predictions
val_raw = trainer.predict(val_enc)
val_logits, val_labels = val_raw.predictions, val_raw.label_ids
val_probs, val_pred = _probs_and_preds(val_logits)

# Test predictions (the test split is optional)
if test_enc is not None:
    test_raw = trainer.predict(test_enc)
    test_logits, test_labels = test_raw.predictions, test_raw.label_ids
    test_probs, test_pred = _probs_and_preds(test_logits)
else:
    test_logits = test_labels = test_probs = test_pred = None

# One row per label: validation metrics, then test metrics (NaN without a test split).
rows = []
for i, label in enumerate(label_cols):
    row = {"label": label}
    row.update(_label_scores(val_labels[:, i], val_probs[:, i], val_pred[:, i], "val"))
    if test_probs is not None:
        row.update(
            _label_scores(test_labels[:, i], test_probs[:, i], test_pred[:, i], "test")
        )
    else:
        row.update({"test_auc": np.nan, "test_f1": np.nan, "test_acc": np.nan})
    rows.append(row)

metrics_df = pd.DataFrame(
    rows,
    columns=[
        "label",
        "val_auc",
        "val_f1",
        "val_acc",
        "test_auc",
        "test_f1",
        "test_acc",
    ],
)

# Append an unweighted mean over labels; NaN entries are skipped by mean().
avg_vals = metrics_df.drop(columns=["label"]).mean(numeric_only=True)
avg_row = {"label": "AVG", **avg_vals.to_dict()}
metrics_df = pd.concat([metrics_df, pd.DataFrame([avg_row])], ignore_index=True)
metrics_df = metrics_df.round(3)
metrics_df

Unnamed: 0,label,val_auc,val_f1,val_acc,test_auc,test_f1,test_acc
0,toxicity,0.949,0.642,0.948,0.946,0.634,0.949
1,severe_toxicity,,0.0,1.0,,0.0,1.0
2,obscene,0.931,0.556,0.997,0.939,0.667,0.997
3,threat,0.861,0.0,0.998,0.834,0.0,0.997
4,insult,0.956,0.669,0.963,0.95,0.613,0.959
5,identity_attack,0.978,0.0,0.993,0.95,0.0,0.995
6,sexual_explicit,0.978,0.0,0.999,0.917,0.0,0.998
7,AVG,0.942,0.267,0.985,0.923,0.273,0.985


In [None]:
# CPU inference benchmarking
# (`time` and `DataLoader` are imported in the notebook's top import cell.)

# Batch entries that are forwarded to the model; anything else the collator
# produces (e.g. "labels") is left out.
MODEL_INPUT_KEYS = ("input_ids", "attention_mask", "token_type_ids")


def _model_inputs(batch):
    """Select the model input tensors from a collated batch, placed on the CPU."""
    return {k: v.to("cpu") for k, v in batch.items() if k in MODEL_INPUT_KEYS}


# NOTE: nn.Module.to() moves the module in place, so `model` itself (not a
# copy) lives on the CPU after this cell.
model_cpu = model.to("cpu").eval()

# Benchmark on the test split when it exists, otherwise on validation.
bench_split = "test" if test_enc is not None else "validation"
bench_ds = test_enc if test_enc is not None else val_enc
bench_loader = DataLoader(
    bench_ds,
    batch_size=BATCH_SIZE * 2,
    shuffle=False,
    collate_fn=collator,
)

# Warm-up: one untimed forward pass (skipped when the loader is empty).
with torch.no_grad():
    warmup_batch = next(iter(bench_loader), None)
    if warmup_batch is not None:
        _ = model_cpu(**_model_inputs(warmup_batch))

# Timed pass (the measured time includes batch collation by the loader).
n_samples = 0
start = time.perf_counter()
with torch.no_grad():
    for batch in bench_loader:
        inputs = _model_inputs(batch)
        _ = model_cpu(**inputs)
        n_samples += inputs["input_ids"].size(0)
end = time.perf_counter()

total_seconds = end - start
throughput = n_samples / total_seconds if total_seconds > 0 else float("inf")
# Guard the empty case the same way throughput is guarded.
per_sample_ms_all = (
    (total_seconds / n_samples) * 1000.0 if n_samples > 0 else float("nan")
)

# One forward pass scores every label at once, so each label row repeats the
# same whole-model timing; these are not independent per-label measurements.
per_label_timings = [
    {
        "label": label,
        "test_infer_seconds": total_seconds,
        "per_sample_ms": per_sample_ms_all,
    }
    for label in label_cols
]

time_metrics_df = pd.DataFrame(per_label_timings)

# Append AVG row
avg_vals = time_metrics_df.drop(columns=["label"]).mean(numeric_only=True)
avg_row = {"label": "AVG", **avg_vals.to_dict()}
time_metrics_df = pd.concat(
    [time_metrics_df, pd.DataFrame([avg_row])], ignore_index=True
)

# Round for readability
time_metrics_df["test_infer_seconds"] = time_metrics_df["test_infer_seconds"].round(6)
time_metrics_df["per_sample_ms"] = time_metrics_df["per_sample_ms"].round(6)

# Name the split that was actually benchmarked.
print(f"Benchmark (CPU) on {bench_split} set:")
print(f" - samples: {n_samples}")
print(f" - total_inference_seconds_all_labels: {total_seconds:.6f}")
print(f" - throughput_samples_per_sec_all_labels: {throughput:.2f}")
print(f" - avg_per_sample_latency_ms_all_labels: {per_sample_ms_all:.6f}")

time_metrics_df

Benchmark (CPU) on test set:
 - samples: 2919
 - total_inference_seconds_all_labels: 737.766411
 - throughput_samples_per_sec_all_labels: 3.96
 - avg_per_sample_latency_ms_all_labels: 252.746287


Unnamed: 0,label,test_infer_seconds,per_sample_ms
0,toxicity,737.766411,252.746287
1,severe_toxicity,737.766411,252.746287
2,obscene,737.766411,252.746287
3,threat,737.766411,252.746287
4,insult,737.766411,252.746287
5,identity_attack,737.766411,252.746287
6,sexual_explicit,737.766411,252.746287
7,AVG,737.766411,252.746287
