In [1]:
import os
import numpy as np

import torch

from sklearn.calibration import calibration_curve

from source.constants import RESULTS_PATH, PLOTS_PATH
from source.data.face_detection import get_fair_face, get_utk

# Create the plot directory up front; exist_ok=True makes re-running this cell a no-op.
os.makedirs(PLOTS_PATH, exist_ok=True)

In [2]:
# Seeds that appear in the run directory names: one `mseed` per run that is
# evaluated, and a single `dseed` shared by all of them.
dseed = 42
method_seeds = [42, 142, 242, 342, 442]

# Architecture whose results are evaluated; change the trailing index to switch.
model = (
    "resnet18",
    "resnet34",
    "resnet50",
    "regnet",
    "efficientnet",
    "efficientnet_mcdropout",
)[2]

# Attribute names; `target` and `pa` below take values 0, 1, 2, 3 as indices into this list.
targets = ["age", "gender", "race (old)", "race"]

# Notes from earlier experiments:
# - predicting race does not give high unfairness (with either pa) for eod and aod
# - predicting gender is also not ideal (unfairness only with age)
pa = 0  # protected attribute: 0, 1, 2, 3
target = 3  # prediction target: 0, 1, 2, 3

In [None]:
# The loaders are not told which attribute is the target or the protected one;
# both are selected from `.targets` further down via `target` and `pa`.
ff_train_ds, ff_test_ds = get_fair_face(binarize=True, augment=False)
utk_test_ds = get_utk(binarize=True)

# The index files are read from the run directory of the first method seed.
run_path = os.path.join(
    RESULTS_PATH,
    f"fairface_target{target}_{model}_mseed{method_seeds[0]}_dseed{dseed}",
)
fair_inds, val_inds = (
    torch.load(os.path.join(run_path, file_name))
    for file_name in ("fair_inds.pt", "val_inds.pt")
)

print(len(fair_inds), len(val_inds), len(ff_test_ds), len(utk_test_ds))

# Row `target` of `.targets` is used as the label, row `pa` as the protected attribute.
y_fair_t, a_fair_t = (ff_train_ds.targets[row, fair_inds] for row in (target, pa))
y_val_t, a_val_t = (ff_train_ds.targets[row, val_inds] for row in (target, pa))
y_ff_test_t, a_ff_test_t = (ff_test_ds.targets[row] for row in (target, pa))
y_utk_test_t, a_utk_test_t = (utk_test_ds.targets[row] for row in (target, pa))

# Mean of the protected attribute on each test set, expressed in percent.
p_a_ff_test, p_a_utk_test = (
    attr.float().mean().item() * 100 for attr in (a_ff_test_t, a_utk_test_t)
)
print(p_a_ff_test, p_a_utk_test)

# Mean of the target on each test set, expressed in percent.
p_y_ff_test, p_y_utk_test = (
    label.float().mean().item() * 100 for label in (y_ff_test_t, y_utk_test_t)
)
print(p_y_ff_test, p_y_utk_test)

In [4]:
# Load the stored probits of every split, one entry per method seed.
fair_probits, val_probits, ff_test_probits, utk_test_probits = [], [], [], []
for mseed in method_seeds:
    path = os.path.join(RESULTS_PATH, f"fairface_target{target}_{model}_mseed{mseed}_dseed{dseed}")

    # The four files of a run differ only in their split prefix.
    for split_prefix, collected in (
        ("fair", fair_probits),
        ("val", val_probits),
        ("ff_test", ff_test_probits),
        ("utk_test", utk_test_probits),
    ):
        collected.append(torch.load(os.path.join(path, f"{split_prefix}_probits_t{target}.pt")))

In [5]:
def ece(y_probs, y_trues, n_bins):
    """Expected Calibration Error (ECE) of binary predictions with equal-width bins.

    The interval [0, 1] is split into ``n_bins`` bins of equal width. For every
    non-empty bin the absolute gap between the fraction of positive labels and
    the mean predicted probability is computed; the ECE is the sum of these
    gaps weighted by the share of samples in each bin.

    Empty bins are skipped explicitly. The earlier implementation multiplied
    the output of ``calibration_curve`` (which only contains non-empty bins)
    with weights for all ``n_bins`` bins, which broke whenever a bin was empty.

    Parameters
    ----------
    y_probs : array-like
        Predicted probability of the positive class, values in [0, 1].
        Anything ``np.asarray`` can convert is accepted.
    y_trues : array-like
        Binary labels with the same number of elements as ``y_probs``.
        The value 1 (or ``True``) marks the positive class.
    n_bins : int
        Number of equal-width bins, must be >= 1.

    Returns
    -------
    numpy.float64
        The expected calibration error.

    Raises
    ------
    ValueError
        If ``n_bins`` < 1, the inputs are empty or differ in length, the
        probabilities lie outside [0, 1], or more than two distinct labels
        are present.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1.")

    probs = np.asarray(y_probs, dtype=np.float64).reshape(-1)
    labels = np.asarray(y_trues).reshape(-1)

    if probs.size == 0:
        raise ValueError("y_probs must not be empty.")
    if probs.size != labels.size:
        raise ValueError("y_probs and y_trues must have the same number of elements.")
    if probs.min() < 0.0 or probs.max() > 1.0:
        raise ValueError("y_probs must lie in the interval [0, 1].")
    if np.unique(labels).size > 2:
        raise ValueError("Only binary labels are supported.")

    positives = (labels == 1).astype(np.float64)

    # Bins are (lower, upper]; a probability of exactly 0 yields index -1 and is
    # clipped into the first bin.
    bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    bin_indices = np.digitize(probs, bins=bin_edges, right=True) - 1
    bin_indices = np.clip(bin_indices, 0, n_bins - 1)

    # Per-bin sample count, summed probabilities and summed positives.
    bin_counts = np.bincount(bin_indices, minlength=n_bins)
    prob_sums = np.bincount(bin_indices, weights=probs, minlength=n_bins)
    positive_sums = np.bincount(bin_indices, weights=positives, minlength=n_bins)

    # Only non-empty bins contribute (and this avoids dividing by zero).
    occupied = bin_counts > 0
    counts = bin_counts[occupied]
    fraction_of_positives = positive_sums[occupied] / counts
    mean_predicted_value = prob_sums[occupied] / counts
    bin_weights = counts / probs.size

    return np.sum(bin_weights * np.abs(fraction_of_positives - mean_predicted_value))

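# Example (uncomment to run). The result is stored under its own name so that
# the `ece` function is not overwritten by a float:
# y_true = np.asarray([0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1])
# y_prob = np.asarray([0.1, 0.4, 0.35, 0.8, 0.1, 0.4, 0.25, 0.5, 0.1, 0.4, 0.35, 0.9])
# example_ece = ece(y_prob, y_true, n_bins=5)
# print(f"Expected Calibration Error: {example_ece:.4f}")  # prints 0.1625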

In [6]:
# ECE on both test sets, once per individual prediction and once for the
# prediction averaged over the first dimension of each run's probits.
# NOTE(review): each entry of `*_test_probits` is presumably stacked as
# (members, samples, classes), with column 1 the positive class - confirm.
ensemble_members = list(range(1, len(ff_test_probits[0]) + 1))

n_bins = 10  # number of equal-width bins used for every ECE below

ff_test_eces, utk_test_eces = list(), list()
ff_test_m_eces, utk_test_m_eces = list(), list()

for ff_run_probits, utk_run_probits in zip(ff_test_probits, utk_test_probits):
    # One ECE per slice along the first dimension.
    ff_test_eces.extend(ece(p[:, 1], y_ff_test_t, n_bins=n_bins) for p in ff_run_probits)
    utk_test_eces.extend(ece(p[:, 1], y_utk_test_t, n_bins=n_bins) for p in utk_run_probits)

    # One ECE for the mean over the first dimension.
    ff_test_m_eces.append(ece(torch.mean(ff_run_probits, dim=0)[:, 1], y_ff_test_t, n_bins=n_bins))
    utk_test_m_eces.append(ece(torch.mean(utk_run_probits, dim=0)[:, 1], y_utk_test_t, n_bins=n_bins))

ff_test_eces = np.asarray(ff_test_eces)
ff_test_m_eces = np.asarray(ff_test_m_eces)
utk_test_eces = np.asarray(utk_test_eces)
utk_test_m_eces = np.asarray(utk_test_m_eces)

# "\\pm" keeps the printed text "$\pm$" while avoiding the invalid escape
# sequence "\p" (a SyntaxWarning on Python 3.12+).
for eces in (ff_test_eces, ff_test_m_eces, utk_test_eces, utk_test_m_eces):
    print(f"{eces.mean():.3f} $\\pm$ {eces.std():.3f}")

0.091 $\pm$ 0.024
0.046 $\pm$ 0.006
0.136 $\pm$ 0.023
0.102 $\pm$ 0.006
