# Tables for FairCal based on Salvador et al., 2022
## Reproduced by Group 42 of FACT-AI 2022/23 @ UvA

### Tables Reproduced

1. Fairness Calibration
2. Global Accuracy
3. Predictive Equality ('fpr at fpr')
4. Equal Opportunity ('fnr at fnr')

### Imports

In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.precision", 2)
import seaborn as sns
import sklearn.metrics
import matplotlib.pyplot as plt
import os

### Fairness Calibration

In [2]:
title_approaches = {
    'baseline':'Naive',
    # 'fsn':'Fair Score',
    'faircal':'FairCal (Ours)',
    # 'oracle':'Oracle (Ours)'
    }
title_calibration_methods = {
    'beta': 'Beta Calibration'
}
title_features = {
    'facenet':'FaceNet (VGGFace2)',
    'facenet-webface':'FaceNet (Webface)',
    'arcface': 'ArcFace'}
title_metrics = {
    'mean': 'Mean',
    'aad': 'AAD',
    'mad': 'MAD',
    'std': 'STD'}
caption_metrics = {
     'mean': 'Mean',
     'aad': 'AAD (Average Absolute Deviation)',
     'mad': 'MAD (Maximum Absolute Deviation)',
     'std': 'STD (Standard Deviation)',
}
title_keys = {
    'baseline': 'Baseline',
    # 'agenda': 'AGENDA',
    # 'ftc': 'FTC',
    # 'fsn': 'FSN',
    'faircal': 'FairCal (Ours)',
    # 'oracle': 'Oracle (Ours)'
    }
header_titles = {
    'African': 'Af',
    'Asian': 'As',
    'Caucasian': 'Ca',
    'Indian': 'In',
    'asian_females': 'AsF',
    'asian_males': 'AsM',
    'black_females': 'AfF',
    'black_males': 'AfM',
    'indian_females': 'IF',
    'indian_males': 'IM',
    'white_females': 'CF',
    'white_males': 'CM',
    'Global': 'Gl',
    'B': 'Af',
    'A': 'As',
    'W': 'C',
    'I': 'I',
    'F': 'F',
    'M': 'M'
}
title_datasets = {
    'rfw': 'RFW',
    'bfw': 'BFW'
}
caption_calibration_methods = {
    'beta': 'beta calibration'
}
caption_measures = {
    'ks': 'KS'
}
features_datasets = {
    'rfw': ['facenet', 'facenet-webface'],
    'bfw': ['facenet-webface']
}
attributes_datasets = {
    'rfw': 'ethnicity',
    'bfw': 'att',
}

In [3]:
def load_measures(dataset, feature, approach, subgroups, att, measure, calibration_method, nbins, n_clusters):
    filename = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
    if approach == 'faircal':
        filename += f'_nclusters_{n_clusters}'
    if approach == 'fsn':
        filename += f'_nclusters_{n_clusters}_fpr_1e-03'

    results = np.load(f'{filename}.npy', allow_pickle=True).item()

    data = pd.DataFrame()
    data['folds'] = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
    data = data.set_index('folds')
    for fold in range(1, 6):
        for j, subgroup in enumerate(subgroups[att]):
            data.loc[f'fold{str(fold)}', f'{subgroup}'] = results[f'fold{str(fold)}'][measure][att][subgroup]
    return data

In [4]:
subgroups = {
            'e':['B', 'A', 'W', 'I'],
            'g':['F','M'],
            'att': ['black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males']
        }
att = 'e'   
test = load_measures('bfw','facenet-webface','faircal',subgroups, att,'ks','beta', 25, 100)
print(test)

              B     A     W         I
folds                                
fold1  9.21e-03  0.01  0.02  1.64e-02
fold2  2.28e-02  0.03  0.02  2.35e-02
fold3  4.09e-02  0.02  0.03  4.95e-02
fold4  2.20e-02  0.04  0.03  6.00e-03
fold5  2.71e-02  0.03  0.03  3.93e-02


In [5]:
def get_sensitive_attributes_subgroups(dataset):
    if dataset == 'rfw':
        sensitive_attributes = ['ethnicity']
        subgroups = {'ethnicity':['African', 'Asian', 'Caucasian', 'Indian']}
    elif 'bfw' in dataset:
        sensitive_attributes = ['e', 'g', 'att']
        subgroups = {
            'e':['B', 'A', 'W', 'I'],
            'g':['F','M'],
            'att': ['black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males']
        }
    return sensitive_attributes, subgroups

In [6]:
# Fairness calibration table

ks = np.array([5,10,15,20,25,50,75,100])
folds = [1,2,3,4,5]
data = pd.DataFrame()

measure = 'ks'
calibration = 'beta'

indices = {
    'rfw' : {
        'facenet': ['African', 'Asian', 'Caucasian', 'Indian'],
        'facenet-webface': ['African', 'Asian', 'Caucasian', 'Indian'],
    },
    'bfw' : {
        'facenet-webface': ['B', 'A', 'W', 'I', 'F','M', 
            'black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males']
        }
    }

# Create tuples from multi-indices
approaches = ['baseline', 'faircal']
tuples = []
for dataset in indices:
    for feature, sens in indices[dataset].items():
        for att in sens:
            for approach in approaches:
                tuples.append((dataset, feature, att, approach))

index = pd.MultiIndex.from_tuples(tuples, names=['dataset', 'feature', 'attribute', 'approach'])

data = pd.DataFrame(index=index)
for metric in ['mean', 'aad', 'mad', 'std']:
    data[metric] = np.nan

# For now, because we only have one experiment
for dataset in indices:
    for feature in indices[dataset]:
        for approach in approaches:
            sensitive_attributes, subgroups = get_sensitive_attributes_subgroups(dataset)
            for att in sensitive_attributes:
                nbins = 25 if dataset == 'bfw' else 10
                data_work = load_measures(dataset, feature, approach, subgroups, att, 'ks', 'beta', nbins=nbins, n_clusters=100)
                data_work = data_work * 100
                for subgroup in data_work.columns:
                    group_mean = data_work[subgroup].mean()
                    data.loc[dataset, feature, subgroup, approach]['mean'] = group_mean
                    data.loc[dataset, feature, subgroup, approach]['aad'] = np.abs(data_work[subgroup] - group_mean).mean()
                    data.loc[dataset, feature, subgroup, approach]['mad'] = np.abs(data_work[subgroup] - group_mean).max()
                    data.loc[dataset, feature, subgroup, approach]['std'] = np.std(data_work[subgroup])
print(data)

                                                  mean   aad   mad   std
dataset feature         attribute      approach                         
rfw     facenet         African        baseline   6.29  0.56  1.00  0.61
                                       faircal    1.72  0.50  0.73  0.55
                        Asian          baseline   5.66  0.40  0.63  0.43
                                       faircal    1.70  0.48  0.86  0.56
                        Caucasian      baseline  11.22  0.85  1.09  0.91
                                       faircal    1.53  0.35  0.57  0.40
                        Indian         baseline   1.99  0.91  1.63  0.98
                                       faircal    1.99  0.71  1.52  0.89
        facenet-webface African        baseline   4.32  0.59  1.46  0.76
                                       faircal    1.67  0.21  0.35  0.23
                        Asian          baseline   5.34  0.33  0.80  0.44
                                       faircal    1

### Global accuracy measures

In [7]:
def get_overall_stats(calibration_method, nbins, dataset, feature, approach, att, n_clusters, fpr_def):
    filename = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
    if approach == 'faircal':
        filename += f'_nclusters_{n_clusters}'
    if approach == 'fsn':
        filename += f'_nclusters_{n_clusters}_fpr_1e-03'
    key = 'calibration' if approach in ['faircal', 'baseline'] else 'pre_calibration'
    
    results = np.load(f'{filename}.npy', allow_pickle=True).item()

    data = pd.DataFrame()
    data['folds'] = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
    data['auc'] = np.nan
    data['fpr_1e-3'] = np.nan
    data['fpr_1e-2'] = np.nan
    data = data.set_index('folds')

    for fold in range(1,6):
        fpr = results['fold'+str(fold)]['fpr'][att]['Global'][key]
        tpr = results['fold'+str(fold)]['tpr'][att]['Global'][key]
        data.loc[f'fold{str(fold)}', 'auc'] = sklearn.metrics.auc(fpr,tpr)
        inter = np.interp(fpr_def, fpr, tpr)
        data.iloc[fold-1, 1:] = inter
    return data

In [8]:
# Accuracy table
keys = ['baseline', 'faircal']
error = [1e-3, 1e-2]
title_stat = ['AUROC', '0.1\% FPR', '1\% FPR']
n_clusters = 100
calibration = 'beta'
datasets = ['bfw', 'rfw']

indices = {
    'rfw' : ['facenet', 'facenet-webface'],
    'bfw' : ['facenet-webface']
}
approaches = ['baseline', 'faircal']

tuples = []
for dataset in indices:
    for feature in indices[dataset]:
        for approach in approaches:
            tuples.append((dataset, feature, approach))

index = pd.MultiIndex.from_tuples(tuples, names=['dataset', 'feature', 'approach'])
data = pd.DataFrame(index=index)
metrics = ['auc', 'fpr_1e-3', 'fpr_1e-2']
for metric in metrics:
    data[metric] = ''

for dataset in indices:
    for feature in indices[dataset]:
        for approach in approaches:
            nbins = 25 if dataset == 'bfw' else 10
            att = 'att' if dataset == 'bfw' else 'ethnicity'
            data_work = get_overall_stats(calibration, nbins, dataset, feature, approach, att, n_clusters,error)
            data_work *= 100
            for metric in metrics:
                mean = round(data_work[metric].mean(), 2)
                std = round(data_work[metric].std(), 2)
                data.loc[dataset, feature, approach][metric] = f'{str(mean)} ({str(std)})'
print(data)

                                           auc      fpr_1e-3      fpr_1e-2
dataset feature         approach                                          
rfw     facenet         baseline  89.97 (0.58)  25.27 (6.51)   39.92 (2.4)
                        faircal   92.16 (0.43)  29.46 (4.65)  49.95 (4.08)
        facenet-webface baseline  84.46 (0.47)  11.14 (5.34)   26.45 (4.9)
                        faircal    86.9 (0.73)  19.54 (4.38)  35.05 (4.55)
bfw     facenet-webface baseline  95.17 (0.16)  30.23 (1.63)  55.11 (1.25)
                        faircal   96.05 (0.13)  40.24 (0.26)  62.59 (0.66)


In [9]:
# Comparing Ryan's old vs new versions of an experiment

# def get_overall_stats_temp(calibration_method, nbins, dataset,feature,approach,att,n_clusters,fpr_def, new):
#     filename = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
#     if approach == 'faircal':
#         filename += f'_nclusters_{n_clusters} ({new})'
#     if approach == 'fsn':
#         filename += f'_nclusters_{n_clusters}_fpr_1e-03'
#     key = 'calibration' if approach in ['faircal', 'baseline'] else 'pre_calibration'
    
#     results = np.load(f'{filename}.npy', allow_pickle=True).item()

#     data = pd.DataFrame()
#     data['folds'] = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
#     data['auc'] = np.nan
#     data['fpr_1e-3'] = np.nan
#     data['fpr_1e-2'] = np.nan
#     data = data.set_index('folds')

#     for fold in range(1,6):
#         fpr = results['fold'+str(fold)]['fpr'][att]['Global'][key]
#         tpr = results['fold'+str(fold)]['tpr'][att]['Global'][key]
#         data.loc[f'fold{str(fold)}', 'auc'] = sklearn.metrics.auc(fpr,tpr)
#         inter = np.interp(fpr_def, fpr, tpr)
#         data.iloc[fold-1, 1:] = inter
#     return data

# # Accuracy table
# keys = ['baseline', 'faircal']
# error = [1e-3, 1e-2]
# title_stat = ['AUROC', '0.1\% FPR', '1\% FPR']
# n_clusters = 100
# calibration = 'beta'

# version = ['new', 'old']
# data = pd.DataFrame(index=version)
# metrics = ['auc', 'fpr_1e-3', 'fpr_1e-2']
# for metric in metrics:
#     data[metric] = ''

# dataset = 'bfw'
# feature = 'facenet-webface'
# approach = 'faircal'

# for current in version:
#     nbins = 25 if dataset == 'bfw' else 10
#     att = 'att' if dataset == 'bfw' else 'ethnicity'
#     data_work = get_overall_stats_temp(calibration, nbins, dataset, feature, approach, att ,n_clusters,error, current)
#     data_work *= 100
#     for metric in metrics:
#         mean = round(data_work[metric].mean(), 2)
#         std = round(data_work[metric].std(), 2)
#         data.loc[current][metric] = f'{str(mean)} ({str(std)})'

# print(data)

### Predictive Equality ('fpr at fpr') and Equal Opportunity ('fnr at fnr')

In [10]:
def get_error1_at_error2(calibration_method, dataset, feature, approach, subgroup, n_clusters, errors, at_error):
    nbins = 25 if dataset == 'bfw' else 10
    att = 'att' if dataset == 'bfw' else 'ethnicity'

    filename = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
    if approach == 'faircal':
        filename += f'_nclusters_{n_clusters}'
    if approach == 'fsn':
        filename += f'_nclusters_{n_clusters}_fpr_1e-03'
    key = 'calibration' if approach in ['faircal', 'oracle'] else 'pre_calibration' 
    
    results = np.load(f'{filename}.npy', allow_pickle=True).item()
    
    data = pd.DataFrame()
    data['folds'] = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']

    data[at_error] = np.nan
    data = data.set_index('folds')

    for fold in range(1,6):
        if subgroup == 'Global':
            fpr = results['fold'+str(fold)]['fpr'][att][subgroup][key]
            tpr = results['fold'+str(fold)]['tpr'][att][subgroup][key]

            if errors == 'fnr at fpr':
                data.iloc[fold-1,:] = 1-np.interp(at_error, fpr, tpr)
            elif errors == 'fpr at fpr' or errors == 'fnr at fnr':
                data.iloc[fold-1,:] = at_error
            elif errors == 'fpr at fnr':
                data.iloc[fold-1,:] = np.interp(1-np.array(at_error), tpr, fpr)
        else:
            fpr_global = results['fold'+str(fold)]['fpr'][att]['Global'][key]
            tpr_global = results['fold'+str(fold)]['tpr'][att]['Global'][key]
            thr_global = np.fmin(results['fold'+str(fold)]['thresholds'][att]['Global'][key], 1)

            fpr = results['fold'+str(fold)]['fpr'][att][subgroup][key]
            tpr = results['fold'+str(fold)]['tpr'][att][subgroup][key]
            thr = results['fold'+str(fold)]['thresholds'][att][subgroup][key]
            thr = np.fmin(thr, 1)
            
            if errors == 'fnr at fpr':
                thr_at_error = np.interp(at_error,fpr_global,thr_global)
                data.iloc[fold-1,:] = 1-np.interp(thr_at_error,thr[::-1],tpr[::-1])
            elif errors == 'fpr at fpr':
                thr_at_error = np.interp(at_error,fpr_global,thr_global)
                data.iloc[fold-1,:] = np.interp(thr_at_error,thr[::-1],fpr[::-1])
            elif errors == 'fpr at fnr':
                thr_at_error = np.interp(1-np.array(at_error),tpr_global,thr_global)
                data.iloc[fold-1,:] = np.interp(thr_at_error,thr[::-1],fpr[::-1])
            elif errors == 'fnr at fnr':
                thr_at_error = np.interp(1-np.array(at_error),tpr_global,thr_global)
                data.iloc[fold-1,:] = 1-np.interp(thr_at_error,thr[::-1],tpr[::-1])
    return data

In [11]:
# Example test

calibration_method = 'beta' 
dataset = 'rfw' 
feature = 'facenet-webface' 
approach = 'baseline' 
# subgroup = 'Global' 
subgroup = 'Asian'
n_clusters = 100
errors = 'fpr at fpr'
at_fpr = 1e-3

data = get_error1_at_error2(calibration_method, dataset, feature, approach, subgroup, n_clusters, errors, at_fpr)

print(data)

          0.001
folds          
fold1  4.10e-03
fold2  5.02e-03
fold3  1.86e-03
fold4  6.53e-04
fold5  9.56e-04


In [12]:
def produce_df_for(objective, global_error):
    calibration_method = 'beta'
    approaches = ['baseline', 'faircal']
    n_clusters = 100
    
    errors = 'fpr at fpr' if objective == 'predictive_equality' else 'fnr at fnr'

    datasets = ['bfw', 'rfw']
    indices = {
        'rfw' : ['facenet', 'facenet-webface'],
        'bfw' : ['facenet-webface', 'arcface']
    }

    tuples = []
    for dataset in indices:
        for feature in indices[dataset]:
            for approach in approaches:
                tuples.append((dataset, feature, approach))

    index = pd.MultiIndex.from_tuples(tuples, names=['dataset', 'feature', 'approach'])
    data = pd.DataFrame(index=index, dtype="string")
    metrics = ['aad', 'mad', 'std']
    for metric in metrics:
        data[metric] = ''

    for dataset in indices:
        att = attributes_datasets[dataset]
        _, subgroups = get_sensitive_attributes_subgroups(dataset)    
        for feature in indices[dataset]:
            for approach in approaches:
                folds_x_subgroups = pd.DataFrame() 
                for i, subgroup in enumerate(subgroups[att]):
                    folds_x_subgroups[subgroup] = get_error1_at_error2(calibration_method, dataset, feature, approach, subgroup, n_clusters, errors, global_error)  

                # computing metrics for averaged over each subgroup
                mean = folds_x_subgroups.mean(axis=1)
                data_work = {'aad': folds_x_subgroups.sub(mean, axis=0).abs().mean(axis=1),
                             'mad': folds_x_subgroups.sub(mean, axis=0).abs().max(axis=1),
                             'std': folds_x_subgroups.std(axis=1)
                            }

                # putting values in the data table
                for metric in metrics:
                    mean = round(data_work[metric].mean() * 100, 2) 
                    std = round(data_work[metric].std() * 100, 2)
                    data.loc[dataset, feature, approach][metric] = f'{mean} ({std})'
        
    return data

### Visualising example results for predictive equality and equal opportunity

In [13]:
objectives = ['predictive equality', 'equal opportunity']
global_errors = [1e-3, 1e-2]

for objective in objectives:
    for global_error in global_errors:
        current_df = produce_df_for(objective, global_error)
        print(f'\n{objective} at a global error rate = {global_error * 100}%')                
        print(current_df)


predictive equality at a global error rate = 0.1%
                                          aad          mad          std
dataset feature         approach                                       
rfw     facenet         baseline  0.14 (0.01)  0.27 (0.02)  0.18 (0.02)
                        faircal   0.13 (0.01)  0.27 (0.03)  0.18 (0.02)
        facenet-webface baseline  0.09 (0.03)  0.18 (0.07)  0.12 (0.04)
                        faircal   0.13 (0.01)  0.26 (0.02)  0.17 (0.02)
bfw     facenet-webface baseline  0.09 (0.02)  0.24 (0.08)  0.12 (0.03)
                        faircal    0.1 (0.01)  0.27 (0.09)  0.13 (0.02)
        arcface         baseline   0.1 (0.03)  0.29 (0.15)  0.14 (0.05)
                        faircal   0.11 (0.03)  0.36 (0.18)  0.16 (0.06)

predictive equality at a global error rate = 1.0%
                                          aad          mad          std
dataset feature         approach                                       
rfw     facenet         baseline  

### Example conversion from pandas to latex

In [14]:
txt = current_df.to_latex()
print(txt)

\begin{tabular}{llllll}
\toprule
    &         &         &          aad &          mad &          std \\
dataset & feature & approach &              &              &              \\
\midrule
rfw & facenet & baseline &  0.53 (0.13) &  0.97 (0.36) &  0.72 (0.23) \\
    &         & faircal &  0.37 (0.13) &  0.61 (0.22) &  0.48 (0.16) \\
    & facenet-webface & baseline &  0.44 (0.23) &  0.79 (0.51) &  0.58 (0.33) \\
    &         & faircal &  0.44 (0.09) &  0.81 (0.16) &   0.6 (0.11) \\
bfw & facenet-webface & baseline &   0.5 (0.09) &  0.98 (0.21) &  0.61 (0.11) \\
    &         & faircal &  0.44 (0.16) &   0.89 (0.3) &  0.55 (0.17) \\
    & arcface & baseline &  0.56 (0.22) &   1.6 (0.77) &  0.79 (0.31) \\
    &         & faircal &  0.74 (0.13) &  2.01 (0.73) &  1.02 (0.21) \\
\bottomrule
\end{tabular}



  txt = current_df.to_latex()
