## **2.3 Training and Evaluation on the Test Set**

This section trains the multi-task model and then presents the final, unbiased evaluation of the trained system.

The **Test Set** is a strict hold-out: it is never used during training or validation.

### **2.3.1 Unseen Data Benchmark**

The final model is evaluated on the held-out test split to measure generalization to previously unseen faces.

### **2.3.2 Multi-Task Performance Breakdown**

Performance is reported independently for:

* **Age Classification**
* **Gender Classification**
* **Race Classification**

This analysis verifies whether the shared backbone supports all tasks equally well.

### **2.3.3 Fairness & Slice-Based Metrics**

Accuracy is evaluated across demographic slices (e.g., per-race accuracy) to detect residual bias.

### **2.3.4 Error Analysis via Confusion Matrices**

Confusion matrices are used to visualize systematic errors, such as confusion between neighboring age bins.

### **2.3.5 Identity Leakage Verification**

Final checks confirm that no subject identities overlap between training/validation and test sets.

### **2.3.6 Comparison to Commercial Baselines**

Results are benchmarked against industry-standard commercial APIs to contextualize performance.

In [1]:
from IPython.utils.io import capture_output

# Execute the architecture notebook inside this kernel while capture_output()
# collects whatever it prints, keeping this notebook's output clean.
# NOTE(review): FairFaceMultiTaskModel and the train/valid/test loaders used in
# later cells are not defined in this notebook - presumably they come from the
# notebook run here; confirm.
with capture_output() as cap:
    %run "2.2-Multi-TaskModelArchitecture.ipynb"

In [2]:
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Literal

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode
import torch.nn.functional as F

In [3]:
# Normalizes the two target layouts accepted by the loss/metric helpers:
#     flat:    {"age": t, "gender": t, "race": t}
#     wrapped: {"y": {"age": t, "gender": t, "race": t}}
# Returns a new dict with exactly the keys "age", "gender", "race"; each tensor
# is converted with .long() and flattened with .view(-1).
def _extract_targets(targets: dict) -> dict:
    # Only unwrap when "y" really holds the nested label dict.
    is_wrapped = "y" in targets and isinstance(targets["y"], dict)
    labels = targets["y"] if is_wrapped else targets

    return {head: labels[head].long().view(-1) for head in ("age", "gender", "race")}

# Weighted sum of per-head cross-entropy losses for FairFace.
# preds:   {"age": (B, A), "gender": (B, G), "race": (B, R)} logits
# targets: {"age": (B,), "gender": (B,), "race": (B,)} (or wrapped under targets["y"])
# weights: optional per-head multipliers; a missing head (or weights=None) counts as 1.0
# Returns (total, loss_parts): total is the weighted sum and stays attached to
# the graph; loss_parts holds the detached, unweighted per-head losses.
def multitask_loss(preds: dict, targets: dict, *, weights: dict | None = None, label_smoothing: float = 0.0) -> tuple[torch.Tensor, dict]:
    labels = _extract_targets(targets)
    heads = ("age", "gender", "race")

    head_weight = {
        head: 1.0 if weights is None else float(weights.get(head, 1.0))
        for head in heads
    }

    losses = {
        head: F.cross_entropy(preds[head], labels[head], label_smoothing=label_smoothing)
        for head in heads
    }

    total = (
        head_weight["age"] * losses["age"]
        + head_weight["gender"] * losses["gender"]
        + head_weight["race"] * losses["race"]
    )

    # Detached copies are for logging only.
    loss_parts = {head: value.detach() for head, value in losses.items()}
    return total, loss_parts

# Top-1 accuracy of each head plus their unweighted mean.
# Returns {"age", "gender", "race", "mean"} -> 0-dim tensors in [0, 1].
@torch.inference_mode()
def multitask_accuracies(preds: dict, targets: dict) -> dict:
    labels = _extract_targets(targets)

    accs = {}
    for head in ("age", "gender", "race"):
        hits = preds[head].argmax(dim=1) == labels[head]
        accs[head] = hits.float().mean()

    accs["mean"] = (accs["age"] + accs["gender"] + accs["race"]) / 3.0
    return accs

In [4]:
from contextlib import nullcontext


# Splits a dataloader batch into (imgs, targets, meta) and moves the tensors
# to `device`.
# Accepted layouts:
#     A) a 3-element tuple/list: (imgs, targets, meta)
#     B) a dict: {"img_t": imgs, "y": targets, ...}; without a "y" entry the
#        batch dict itself is used as the targets, and meta is batch.get("meta")
# Returns:
#     imgs:    moved to device
#     targets: new dict with only "age"/"gender"/"race", each moved to device
#     meta:    passed through untouched (never moved)
# Raises TypeError for any other batch layout.
def _unpack_batch(batch, device):
    if isinstance(batch, dict):
        imgs = batch["img_t"]
        targets = batch.get("y", batch)  # labels may live at the top level
        meta = batch.get("meta", None)
    elif isinstance(batch, (tuple, list)) and len(batch) == 3:
        imgs, targets, meta = batch
    else:
        raise TypeError(f"Unsupported batch type: {type(batch)}")

    imgs = imgs.to(device, non_blocking=True)

    # Only the three label heads are forwarded; any extra entries are dropped.
    targets = {
        head: targets[head].to(device, non_blocking=True)
        for head in ("age", "gender", "race")
    }

    return imgs, targets, meta


# Runs one training epoch and returns sample-weighted average metrics.
#
# Args:
#     model:           called as model(imgs); must return a dict of logits with
#                      "age", "gender" and "race" entries.
#     optimizer:       zeroed before and stepped after every batch (through the
#                      scaler when AMP is active).
#     train_dl:        iterable of batches in a layout _unpack_batch accepts.
#     label_smoothing: forwarded to multitask_loss.
#     device:          device the batch tensors are moved to.
#     scaler:          gradient scaler used when AMP is active. If AMP is active
#                      and none is given, a scaler living for this call only is
#                      created; pass a persistent one to keep its scale across
#                      epochs.
#     amp:             request float16 autocast; only honoured when
#                      device.type == "cuda".
#
# Returns:
#     (losses, accs): losses has keys "age", "gender", "race", "total";
#     accs has keys "age", "gender", "race", "mean". All values are floats
#     averaged over the number of samples seen.
#
# Raises:
#     ValueError: if train_dl yields no samples.
def train_one_epoch_fairface(model, optimizer, train_dl, *, label_smoothing: float = 0.0,
                             device: torch.device = torch.device("cuda"), scaler=None, amp: bool = True):
    model.train()

    use_amp = amp and (device.type == "cuda")
    autocast_cm = torch.amp.autocast(device_type="cuda", dtype=torch.float16) if use_amp else nullcontext()
    if use_amp and scaler is None:
        # float16 autocast without loss scaling lets small gradients underflow
        # to zero, so never run the AMP path unscaled.
        scaler = torch.amp.GradScaler(enabled=True)

    total = 0
    running_loss = {"age": 0.0, "gender": 0.0, "race": 0.0, "total": 0.0}
    running_acc = {"age": 0.0, "gender": 0.0, "race": 0.0, "mean": 0.0}

    for batch in train_dl:
        optimizer.zero_grad(set_to_none=True)

        imgs, targets, _ = _unpack_batch(batch, device)

        with autocast_cm:
            preds = model(imgs)  # {"age": logits, "gender": logits, "race": logits}
            total_loss, loss_parts = multitask_loss(preds, targets, label_smoothing=label_smoothing)

        if use_amp:
            scaler.scale(total_loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            total_loss.backward()
            optimizer.step()

        bs = imgs.size(0)
        total += bs

        with torch.inference_mode():
            accs = multitask_accuracies(preds, targets)

        # Weight every batch mean by its size so a smaller last batch does not
        # skew the epoch average.
        running_loss["total"] += total_loss.item() * bs
        for head in ("age", "gender", "race"):
            running_loss[head] += loss_parts[head].item() * bs
        for name in running_acc:
            running_acc[name] += accs[name].item() * bs

    if total == 0:
        raise ValueError("train_dl yielded no samples; cannot compute epoch metrics")

    train_losses = {k: v / total for k, v in running_loss.items()}
    train_accs = {k: v / total for k, v in running_acc.items()}
    return train_losses, train_accs


# Evaluates the model on one pass over eval_dl (no gradients, model.eval())
# and returns sample-weighted average metrics.
#
# Args:
#     model:           called as model(imgs); must return a dict of logits with
#                      "age", "gender" and "race" entries. Left in eval mode.
#     eval_dl:         iterable of batches in a layout _unpack_batch accepts.
#     label_smoothing: forwarded to multitask_loss.
#     device:          device the batch tensors are moved to.
#     amp:             request float16 autocast; only honoured when
#                      device.type == "cuda".
#
# Returns:
#     (losses, accs): losses has keys "age", "gender", "race", "total";
#     accs has keys "age", "gender", "race", "mean". All values are floats
#     averaged over the number of samples seen.
#
# Raises:
#     ValueError: if eval_dl yields no samples.
@torch.inference_mode()
def eval_one_epoch_fairface(model, eval_dl, *, label_smoothing: float = 0.0,
                            device: torch.device = torch.device("cuda"), amp: bool = True):
    model.eval()

    use_amp = amp and (device.type == "cuda")
    autocast_cm = torch.amp.autocast(device_type="cuda", dtype=torch.float16) if use_amp else nullcontext()

    total = 0
    running_loss = {"age": 0.0, "gender": 0.0, "race": 0.0, "total": 0.0}
    running_acc = {"age": 0.0, "gender": 0.0, "race": 0.0, "mean": 0.0}

    for batch in eval_dl:
        imgs, targets, _ = _unpack_batch(batch, device)

        with autocast_cm:
            preds = model(imgs)
            total_loss, loss_parts = multitask_loss(preds, targets, label_smoothing=label_smoothing)
            accs = multitask_accuracies(preds, targets)

        bs = imgs.size(0)
        total += bs

        # Weight every batch mean by its size so a smaller last batch does not
        # skew the overall average.
        running_loss["total"] += total_loss.item() * bs
        for head in ("age", "gender", "race"):
            running_loss[head] += loss_parts[head].item() * bs
        for name in running_acc:
            running_acc[name] += accs[name].item() * bs

    if total == 0:
        raise ValueError("eval_dl yielded no samples; cannot compute evaluation metrics")

    valid_losses = {k: v / total for k, v in running_loss.items()}
    valid_accs = {k: v / total for k, v in running_acc.items()}

    return valid_losses, valid_accs


# Renders a metrics dict as "key:value, key:value". Each value is converted to
# float, multiplied by `factor` and printed in fixed-point with `precision`
# decimals (minimum field width 3).
def fmt_metrics(d, factor=1.0, precision=4):
    number_spec = f"3.{precision}f"
    parts = []
    for name, value in d.items():
        scaled = factor * float(value)
        parts.append(f"{name}:{format(scaled, number_spec)}")
    return ", ".join(parts)


# Full training loop: for every epoch run one training pass, one validation
# pass, step the LR scheduler, record the metrics and print a summary.
#
# Args:
#     model, optimizer: forwarded to train_one_epoch_fairface / eval_one_epoch_fairface.
#     sched:            optional LR scheduler. ReduceLROnPlateau instances (and,
#                       for backward compatibility, any scheduler whose class
#                       name contains "plateau") are stepped with the validation
#                       total loss; every other scheduler is stepped without
#                       arguments. Stepped once per epoch.
#     train_dl:         training batches (required despite the None default).
#     valid_dl:         validation batches (required despite the None default).
#     epochs:           number of epochs to run.
#     label_smoothing:  forwarded to both the training and the validation pass.
#     device:           device used for both passes.
#     amp:              request mixed precision; only active when
#                       device.type == "cuda".
#
# Returns:
#     history dict with one entry per epoch in each list:
#     "train_loss", "train_acc", "valid_loss", "valid_acc" (metric dicts)
#     and "lr" (learning rate of the first param group after the scheduler step).
#
# Raises:
#     ValueError: if train_dl or valid_dl is None.
def fit_fairface(model, optimizer, *, sched=None, train_dl=None, valid_dl=None, epochs: int = 25,
                 label_smoothing: float = 0.0, device: torch.device = torch.device("cuda"), amp: bool = True):
    # Fail before any work is done instead of after a full training epoch.
    if train_dl is None or valid_dl is None:
        raise ValueError("fit_fairface requires both train_dl and valid_dl")

    scaler = torch.amp.GradScaler(enabled=(amp and device.type == "cuda"))

    # Decide once how the scheduler wants to be stepped. isinstance also covers
    # subclasses whose name does not contain "plateau"; the name check keeps
    # the previous behaviour for look-alike schedulers.
    steps_on_metric = sched is not None and (
        isinstance(sched, torch.optim.lr_scheduler.ReduceLROnPlateau)
        or "plateau" in sched.__class__.__name__.lower()
    )

    history = {
        "train_loss": [],
        "train_acc": [],
        "valid_loss": [],
        "valid_acc": [],
        "lr": [],
    }

    for ep in range(1, epochs + 1):
        train_losses, train_accs = train_one_epoch_fairface(
            model, optimizer, train_dl,
            label_smoothing=label_smoothing,
            device=device, scaler=scaler, amp=amp)

        valid_losses, valid_accs = eval_one_epoch_fairface(
            model, valid_dl,
            label_smoothing=label_smoothing,
            device=device, amp=amp
        )

        if sched is not None:
            if steps_on_metric:
                sched.step(valid_losses["total"])
            else:
                sched.step()

        # Read the LR after the scheduler step, i.e. the value the next epoch uses.
        curr_lr = optimizer.param_groups[0]["lr"]

        history["train_loss"].append(train_losses)
        history["train_acc"].append(train_accs)
        history["valid_loss"].append(valid_losses)
        history["valid_acc"].append(valid_accs)
        history["lr"].append(curr_lr)

        print(
            f"[Epoch {ep:02d}/{epochs:02d}]:\n"
            f"Train loss: {fmt_metrics(train_losses)} | "
            f"Train acc: {fmt_metrics(train_accs, factor=100, precision=2)}\n"
            f"Valid loss: {fmt_metrics(valid_losses)} | "
            f"Valid acc: {fmt_metrics(valid_accs, factor=100, precision=2)} | "
            f"lr: {curr_lr:.8f}"
        )

    return history


In [5]:
# --- device (single source of truth) ---
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# --- model (build first, then move to the chosen device) ---
model = FairFaceMultiTaskModel(pretrained=True, freeze_backbone=False)
model = model.to(device)

# --- loss config ---
label_smoothing = 0.05  # use 0.0 to disable label smoothing

# --- optimizer ---
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# --- scheduler (factor=0.5 halves the LR on each reduction) ---
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=2, threshold=1e-3, min_lr=1e-6,
)

# --- backend flags ---
torch.backends.cudnn.benchmark = True  # faster for fixed image sizes
torch.set_float32_matmul_precision("high")  # can help on Ampere+

In [None]:
# --- train ---
# Runs the full training loop and keeps the per-epoch metrics in `history`.
# `amp` is not passed, so fit_fairface's default (amp=True) applies.
# NOTE(review): train_loader / valid_loader are not defined in this notebook -
# presumably they come from the notebook executed via %run; confirm.
epochs = 15
history = fit_fairface(
    model,
    optimizer,
    sched=sched,
    train_dl=train_loader,
    valid_dl=valid_loader,
    epochs=epochs,
    label_smoothing=label_smoothing,
    device=device,
)

[Epoch 01/15]:
Train loss: age:1.3828, gender:0.3504, race:1.2657, total:2.9989 | Train acc: age:47.83, gender:86.81, race:55.10, mean:63.25
Valid loss: age:1.2742, gender:0.2805, race:1.0694, total:2.6242 | Valid acc: age:51.84, gender:91.01, race:64.22, mean:69.02 | lr: 0.00010000
[Epoch 02/15]:
Train loss: age:1.1665, gender:0.2573, race:0.9882, total:2.4120 | Train acc: age:56.94, gender:92.80, race:68.03, mean:72.59
Valid loss: age:1.1666, gender:0.2597, race:1.0160, total:2.4423 | Valid acc: age:56.38, gender:92.32, race:66.54, mean:71.75 | lr: 0.00010000
[Epoch 03/15]:
Train loss: age:1.0950, gender:0.2300, race:0.8893, total:2.2142 | Train acc: age:60.56, gender:94.34, race:72.73, mean:75.88
Valid loss: age:1.1629, gender:0.2485, race:0.9826, total:2.3940 | Valid acc: age:56.78, gender:93.05, race:68.11, mean:72.65 | lr: 0.00010000
[Epoch 04/15]:
Train loss: age:1.0322, gender:0.2099, race:0.8097, total:2.0518 | Train acc: age:63.95, gender:95.54, race:76.57, mean:78.69
Valid l

In [None]:
# Plots per-task training curves: one column per key, losses in the top row
# and accuracies (in %) in the bottom row, train vs. validation in each panel.
#
# Expects:
#     history["train_loss"], history["valid_loss"] : list[dict], one dict per epoch
#     history["train_acc"],  history["valid_acc"]  : list[dict], one dict per epoch
#     Each dict has per-task keys like: "age", "gender", "race"
#
# A key is plotted when the first-epoch dicts contain it for the loss and/or
# the accuracy histories. A panel whose metric is not tracked for that key
# (e.g. "total" has no accuracy, "mean" has no loss) is hidden.
#
# Raises:
#     ValueError: if history["train_loss"] is empty.
#     KeyError:   if none of `keys` is found in the history dicts.
def plot_fairface_target_metrics(history, keys=("age", "gender", "race")):
    n_epochs = len(history["train_loss"])
    if n_epochs == 0:
        raise ValueError("history contains no epochs; nothing to plot")
    epochs = range(1, n_epochs + 1)

    # True when both the train and the valid history of `metric` track `key`.
    def _tracked(metric, key):
        train = history[f"train_{metric}"]
        valid = history[f"valid_{metric}"]
        return bool(train) and bool(valid) and key in train[0] and key in valid[0]

    # keep only keys that actually exist
    existing = [k for k in keys if _tracked("loss", k) or _tracked("acc", k)]

    if not existing:
        raise KeyError(f"None of these keys found in history dicts: {keys}")

    n = len(existing)
    # squeeze=False keeps `axes` two-dimensional even for a single column.
    fig, axes = plt.subplots(2, n, figsize=(6 * n, 8), constrained_layout=True, squeeze=False)

    # (history suffix / legend word, title word, y-axis label, display scale)
    rows = (
        ("loss", "loss", "Loss", 1),
        ("acc", "accuracy", "Accuracy (%)", 100),  # accuracies are shown as %
    )

    for row, (metric, title_word, ylabel, scale) in enumerate(rows):
        for col, k in enumerate(existing):
            ax = axes[row][col]
            if not _tracked(metric, k):
                ax.set_visible(False)
                continue

            tr = [d[k] * scale for d in history[f"train_{metric}"]]
            va = [d[k] * scale for d in history[f"valid_{metric}"]]
            ax.plot(epochs, tr, label=f"Train {metric}")
            ax.plot(epochs, va, label=f"Valid {metric}")
            ax.set_title(f"{k} {title_word}")
            ax.set_xlabel("Epoch")
            ax.set_ylabel(ylabel)
            ax.grid(True)
            ax.legend()

    plt.show()

# usage: draw the per-task loss/accuracy curves for the run stored in `history`
plot_fairface_target_metrics(history)


In [None]:
# Final evaluation on the test loader.
model.eval()
test_losses, test_accs = eval_one_epoch_fairface(
    model=model.to(device), eval_dl=test_loader, device=device,
    label_smoothing=0.0,  # keep 0.0 for eval/test
)

# headline numbers
print(f"Test total loss : {test_losses['total']:.4f}")
print(f"Test mean acc   : {test_accs['mean']*100:.2f}%")

# per-head accuracy
print(f"Age acc         : {test_accs['age']*100:.2f}%")
print(f"Gender acc      : {test_accs['gender']*100:.2f}%")
print(f"Race acc        : {test_accs['race']*100:.2f}%")

# complete metric dicts, rounded for display (accuracies as %)
print("Losses:", dict(zip(test_losses, (round(v, 4) for v in test_losses.values()))))
print("Accs  :", dict(zip(test_accs, (round(v*100, 2) for v in test_accs.values()))))
