In [14]:
import numpy as np
import pandas as pd
pd.set_option("display.precision", 2)
import seaborn as sns
import sklearn.metrics
import matplotlib.pyplot as plt
import os

In [15]:
# ---------------------------------------------------------------------------
# Display names and lookup tables used when building tables and captions.
# Keys are the identifiers used in experiment paths / result dictionaries;
# values are the human-readable labels.
# ---------------------------------------------------------------------------

# Approach identifier -> title shown in tables.
title_approaches = {
    'baseline': 'Naive',
    # 'fsn':'Fair Score',
    'faircal': 'FairCal (Ours)',
    # 'oracle':'Oracle (Ours)'
}

# Calibration method identifier -> title.
title_calibration_methods = {
    'beta': 'Beta Calibration',
}

# Feature (embedding model) identifier -> title.
title_features = {
    'facenet': 'FaceNet (VGGFace2)',
    'facenet-webface': 'FaceNet (Webface)',
    'arcface': 'ArcFace',
}

# Summary statistic identifier -> short column title.
title_metrics = {
    'mean': 'Mean',
    'aad': 'AAD',
    'mad': 'MAD',
    'std': 'STD',
}

# Summary statistic identifier -> long form used in captions.
caption_metrics = {
    'mean': 'Mean',
    'aad': 'AAD (Average Absolute Deviation)',
    'mad': 'MAD (Maximum Absolute Deviation)',
    'std': 'STD (Standard Deviation)',
}

# Approach identifier -> row label.
title_keys = {
    'baseline': 'Baseline',
    # 'agenda': 'AGENDA',
    # 'ftc': 'FTC',
    # 'fsn': 'FSN',
    'faircal': 'FairCal (Ours)',
    # 'oracle': 'Oracle (Ours)'
}

# Subgroup identifier -> abbreviated column header.
header_titles = {
    'African': 'Af',
    'Asian': 'As',
    'Caucasian': 'Ca',
    'Indian': 'In',
    'asian_females': 'AsF',
    'asian_males': 'AsM',
    'black_females': 'AfF',
    'black_males': 'AfM',
    'indian_females': 'IF',
    'indian_males': 'IM',
    'white_females': 'CF',
    'white_males': 'CM',
    'Global': 'Gl',
    'B': 'Af',
    'A': 'As',
    'W': 'C',
    'I': 'I',
    'F': 'F',
    'M': 'M',
}

# Dataset identifier -> title.
title_datasets = {
    'rfw': 'RFW',
    'bfw': 'BFW',
}

# Calibration method identifier -> lower-case name used in captions.
caption_calibration_methods = {
    'beta': 'beta calibration',
}

# Calibration-error measure identifier -> name used in captions.
caption_measures = {
    'ks': 'KS',
}

# Dataset identifier -> feature identifiers evaluated on it.
# Every entry must be a key of `title_features`; the previous 'arcnet' entry
# was not (the ArcFace key is 'arcface'), so a title lookup would KeyError.
features_datasets = {
    'rfw': ['facenet', 'facenet-webface'],
    'bfw': ['facenet-webface', 'arcface'],
}

# Dataset identifier -> name of the sensitive attribute stored in the results.
attributes_datasets = {
    'rfw': 'ethnicity',
    'bfw': 'att',
}

In [16]:
def load_measures(dataset, feature, approach, subgroups, att, measure, calibration_method, nbins, n_clusters):
    """Load one per-subgroup measure for each of the five folds of a saved experiment.

    The results file is read from
    ``../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}[suffix].npy``
    where the suffix is ``_nclusters_{n_clusters}`` for ``'faircal'`` and
    ``_nclusters_{n_clusters}_fpr_1e-03`` for ``'fsn'`` (no suffix otherwise).
    The file must contain a pickled dict indexed as
    ``results['fold<k>'][measure][att][subgroup]`` for k = 1..5.

    Parameters
    ----------
    dataset, feature, approach, calibration_method : str
        Path components identifying the experiment.
    subgroups : dict
        Maps an attribute name to the list of its subgroup names.
    att : str
        Attribute whose subgroups are loaded (a key of ``subgroups``).
    measure : str
        Measure name to read from the results (e.g. ``'ks'``).
    nbins, n_clusters : int
        Used only to build the file name.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'fold1'``..``'fold5'`` (index name ``'folds'``), one
        column per subgroup in ``subgroups[att]``.

    Raises
    ------
    FileNotFoundError
        If the results file does not exist.
    KeyError
        If a fold, measure, attribute or subgroup is missing from the results.
    """
    filename = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
    if approach == 'faircal':
        filename += f'_nclusters_{n_clusters}'
    if approach == 'fsn':
        filename += f'_nclusters_{n_clusters}_fpr_1e-03'

    # SECURITY: allow_pickle=True unpickles arbitrary objects; only load
    # result files produced by trusted experiment runs.
    results = np.load(f'{filename}.npy', allow_pickle=True).item()

    fold_names = [f'fold{fold}' for fold in range(1, 6)]
    # Build every column up front instead of enlarging the frame one cell at
    # a time through .loc.
    columns = {
        f'{subgroup}': [results[fold_name][measure][att][subgroup] for fold_name in fold_names]
        for subgroup in subgroups[att]
    }
    return pd.DataFrame(columns, index=pd.Index(fold_names, name='folds'))

In [17]:
# Smoke test: load the per-ethnicity KS values of FairCal on BFW
# (facenet-webface features, beta calibration, 25 bins, 100 clusters).
subgroups = dict(
    e=['B', 'A', 'W', 'I'],
    g=['F', 'M'],
    att=[
        'black_females', 'black_males',
        'asian_females', 'asian_males',
        'white_females', 'white_males',
        'indian_females', 'indian_males',
    ],
)
att = 'e'
test = load_measures(
    dataset='bfw',
    feature='facenet-webface',
    approach='faircal',
    subgroups=subgroups,
    att=att,
    measure='ks',
    calibration_method='beta',
    nbins=25,
    n_clusters=100,
)
print(test)

          B     A     W     I
folds                        
fold1  0.01  0.02  0.03  0.01
fold2  0.03  0.03  0.02  0.02
fold3  0.04  0.02  0.02  0.06
fold4  0.03  0.04  0.03  0.01
fold5  0.02  0.03  0.02  0.04


In [18]:
def get_sensitive_attributes_subgroups(dataset):
    """Return the sensitive attributes of a dataset and the subgroups of each.

    Parameters
    ----------
    dataset : str
        ``'rfw'``, or any name containing ``'bfw'``.

    Returns
    -------
    tuple(list, dict)
        ``(sensitive_attributes, subgroups)`` where ``subgroups`` maps each
        attribute name to the list of its subgroup names.

    Raises
    ------
    ValueError
        If ``dataset`` is neither ``'rfw'`` nor a name containing ``'bfw'``.
        (Previously this fell through and failed with ``UnboundLocalError``
        at the return statement.)
    """
    if dataset == 'rfw':
        sensitive_attributes = ['ethnicity']
        subgroups = {'ethnicity': ['African', 'Asian', 'Caucasian', 'Indian']}
    elif 'bfw' in dataset:
        sensitive_attributes = ['e', 'g', 'att']
        subgroups = {
            'e': ['B', 'A', 'W', 'I'],
            'g': ['F', 'M'],
            'att': ['black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males']
        }
    else:
        raise ValueError(
            f"Unknown dataset {dataset!r}: expected 'rfw' or a name containing 'bfw'"
        )
    return sensitive_attributes, subgroups

In [22]:
# NOTE(review): `ks` and `folds` are not used in this cell; they are kept in
# case other cells rely on them.
ks = np.array([5,10,15,20,25,50,75,100])
folds = [1,2,3,4,5]

measure = 'ks'
calibration = 'beta'

# dataset -> feature -> every subgroup name that gets a row in the table.
indices = {
    'rfw' : {
        'facenet': ['African', 'Asian', 'Caucasian', 'Indian'],
        'facenet-webface': ['African', 'Asian', 'Caucasian', 'Indian'],
    },
    'bfw' : {
        'facenet-webface': ['B', 'A', 'W', 'I', 'F','M', 
            'black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males']
        }
    }

# Create tuples from multi-indices
approaches = ['baseline', 'faircal']
tuples = []
for dataset in indices:
    for feature, sens in indices[dataset].items():
        for att in sens:
            for approach in approaches:
                tuples.append((dataset, feature, att, approach))

index = pd.MultiIndex.from_tuples(tuples, names=['dataset', 'feature', 'attribute', 'approach'])

data = pd.DataFrame(index=index)
for metric in ['mean', 'aad', 'mad', 'std']:
    data[metric] = np.nan

# For now, because we only have one experiment
for dataset in indices:
    # Both values depend only on the dataset, so compute them once per dataset.
    sensitive_attributes, subgroups = get_sensitive_attributes_subgroups(dataset)
    nbins = 25 if dataset == 'bfw' else 10
    for feature in indices[dataset]:
        for approach in approaches:
            for att in sensitive_attributes:
                data_work = load_measures(dataset, feature, approach, subgroups, att, measure, calibration, nbins=nbins, n_clusters=100)
                data_work = data_work * 100  # report as percentages
                for subgroup in data_work.columns:
                    fold_values = data_work[subgroup]
                    group_mean = fold_values.mean()
                    abs_deviation = np.abs(fold_values - group_mean)
                    # Write with a single .loc[row, column] call. The previous
                    # `data.loc[...][col] = value` was chained assignment: it
                    # sets the value on an intermediate object and does not
                    # update `data` under pandas Copy-on-Write.
                    row = (dataset, feature, subgroup, approach)
                    data.loc[row, 'mean'] = group_mean
                    data.loc[row, 'aad'] = abs_deviation.mean()
                    data.loc[row, 'mad'] = abs_deviation.max()
                    # np.std is called with NumPy's defaults (ddof=0).
                    data.loc[row, 'std'] = np.std(fold_values)
print(data)

                                                  mean   aad   mad   std
dataset feature         attribute      approach                         
rfw     facenet         African        baseline   6.29  0.56  1.00  0.61
                                       faircal    1.71  0.67  0.83  0.69
                        Asian          baseline   5.66  0.40  0.63  0.43
                                       faircal    1.58  0.47  0.75  0.53
                        Caucasian      baseline  11.22  0.85  1.09  0.91
                                       faircal    1.36  0.31  0.49  0.35
                        Indian         baseline   1.99  0.91  1.63  0.98
                                       faircal    2.07  0.68  1.24  0.79
        facenet-webface African        baseline   4.32  0.59  1.46  0.76
                                       faircal    1.57  0.19  0.46  0.24
                        Asian          baseline   5.34  0.33  0.80  0.44
                                       faircal    1

In [23]:
def get_overall_stats(calibration_method, nbins, dataset,feature,approach,att,n_clusters,fpr_def):
    """Compute per-fold AUC and interpolated TPR values from saved ROC curves.

    The results file is read from
    ``../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}[suffix].npy``
    where the suffix is ``_nclusters_{n_clusters}`` for ``'faircal'`` and
    ``_nclusters_{n_clusters}_fpr_1e-03`` for ``'fsn'`` (no suffix otherwise).
    For each fold the ROC curve is taken from
    ``results['fold<k>']['fpr' | 'tpr'][att]['Global'][key]`` with
    ``key = 'calibration'`` for ``'faircal'``/``'baseline'`` and
    ``'pre_calibration'`` for every other approach.

    Parameters
    ----------
    calibration_method, dataset, feature, approach : str
        Path components identifying the experiment.
    nbins, n_clusters : int
        Used only to build the file name.
    att : str
        Sensitive-attribute key under which the global ROC curve is stored.
    fpr_def : sequence of two floats
        False-positive rates at which the TPR is linearly interpolated; the
        results fill the ``'fpr_1e-3'`` and ``'fpr_1e-2'`` columns in order.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'fold1'``..``'fold5'`` (index name ``'folds'``) with
        columns ``'auc'``, ``'fpr_1e-3'`` and ``'fpr_1e-2'``.

    Raises
    ------
    FileNotFoundError
        If the results file does not exist.
    """
    filename = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
    if approach == 'faircal':
        filename += f'_nclusters_{n_clusters}'
    if approach == 'fsn':
        filename += f'_nclusters_{n_clusters}_fpr_1e-03'
    key = 'calibration' if approach in ['faircal', 'baseline'] else 'pre_calibration'

    # SECURITY: allow_pickle=True unpickles arbitrary objects; only load
    # result files produced by trusted experiment runs.
    results = np.load(f'{filename}.npy', allow_pickle=True).item()

    fold_names = [f'fold{fold}' for fold in range(1, 6)]
    tpr_columns = ['fpr_1e-3', 'fpr_1e-2']
    data = pd.DataFrame(
        np.nan,
        index=pd.Index(fold_names, name='folds'),
        columns=['auc'] + tpr_columns,
    )

    for fold_name in fold_names:
        fpr = results[fold_name]['fpr'][att]['Global'][key]
        tpr = results[fold_name]['tpr'][att]['Global'][key]
        data.loc[fold_name, 'auc'] = sklearn.metrics.auc(fpr, tpr)
        # Label-based write: does not depend on the position of the columns,
        # unlike the previous `iloc[fold-1, 1:]`.
        data.loc[fold_name, tpr_columns] = np.interp(fpr_def, fpr, tpr)
    return data

In [24]:
# Accuracy table: mean (std) over the five folds of AUROC and of the TPR at
# 0.1% and 1% FPR, for every dataset / feature / approach combination.
keys = ['baseline', 'faircal']
error = [1e-3, 1e-2]
# Raw strings: '\%' is an invalid escape sequence in a normal string literal.
# The resulting text (a LaTeX-escaped percent sign) is unchanged.
title_stat = ['AUROC', r'0.1\% FPR', r'1\% FPR']
n_clusters = 100
calibration = 'beta'
datasets = ['bfw', 'rfw']

# dataset -> features evaluated on it.
indices = {
    'rfw' : ['facenet', 'facenet-webface'],
    'bfw' : ['facenet-webface']
}
approaches = ['baseline', 'faircal']

tuples = []
for dataset in indices:
    for feature in indices[dataset]:
        for approach in approaches:
            tuples.append((dataset, feature, approach))

index = pd.MultiIndex.from_tuples(tuples, names=['dataset', 'feature', 'approach'])
data = pd.DataFrame(index=index)
metrics = ['auc', 'fpr_1e-3', 'fpr_1e-2']
for metric in metrics:
    data[metric] = ''

for dataset in indices:
    # Both values depend only on the dataset.
    nbins = 25 if dataset == 'bfw' else 10
    att = 'att' if dataset == 'bfw' else 'ethnicity'
    for feature in indices[dataset]:
        for approach in approaches:
            data_work = get_overall_stats(calibration, nbins, dataset, feature, approach, att ,n_clusters,error)
            data_work *= 100  # report as percentages
            for metric in metrics:
                mean = data_work[metric].mean()
                std = data_work[metric].std()
                # Single .loc[row, column] write instead of the chained
                # `data.loc[...][metric] = ...`, which does not update `data`
                # under pandas Copy-on-Write. Fixed two-decimal formatting
                # keeps trailing zeros ("2.40" rather than "2.4").
                data.loc[(dataset, feature, approach), metric] = f'{mean:.2f} ({std:.2f})'
print(data)

                                           auc      fpr_1e-3      fpr_1e-2
dataset feature         approach                                          
rfw     facenet         baseline  89.97 (0.58)  25.27 (6.51)   39.92 (2.4)
                        faircal   92.15 (0.45)  29.65 (2.57)  50.57 (3.59)
        facenet-webface baseline  84.46 (0.47)  11.14 (5.34)   26.45 (4.9)
                        faircal   86.96 (0.72)  23.16 (5.54)  33.94 (4.37)
bfw     facenet-webface baseline  94.62 (0.17)  27.93 (2.02)  52.79 (1.74)
                        faircal   95.65 (0.15)  38.07 (0.89)  60.08 (1.09)


In [25]:
def get_overall_stats_temp(calibration_method, nbins, dataset,feature,approach,att,n_clusters,fpr_def, new):
    """Variant of the overall-stats loader that reads a tagged FairCal results file.

    Identical to the untagged loader except that, for ``approach == 'faircal'``,
    the file name ends with ``_nclusters_{n_clusters} ({new})``. For ``'fsn'``
    the suffix is ``_nclusters_{n_clusters}_fpr_1e-03`` and for any other
    approach there is no suffix.

    Returns a DataFrame indexed by ``'fold1'``..``'fold5'`` (index name
    ``'folds'``) with columns ``'auc'``, ``'fpr_1e-3'`` and ``'fpr_1e-2'``:
    the area under the stored global ROC curve and the TPR linearly
    interpolated at the rates given in ``fpr_def``.
    """
    path_stem = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
    if approach == 'faircal':
        path_stem = path_stem + f'_nclusters_{n_clusters} ({new})'
    if approach == 'fsn':
        path_stem = path_stem + f'_nclusters_{n_clusters}_fpr_1e-03'

    # FairCal and the baseline store their curves under 'calibration';
    # every other approach uses 'pre_calibration'.
    if approach in ['faircal', 'baseline']:
        key = 'calibration'
    else:
        key = 'pre_calibration'

    results = np.load(f'{path_stem}.npy', allow_pickle=True).item()

    fold_labels = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
    data = pd.DataFrame({
        'folds': fold_labels,
        'auc': np.nan,
        'fpr_1e-3': np.nan,
        'fpr_1e-2': np.nan,
    }).set_index('folds')

    for position, fold_number in enumerate(range(1, 6)):
        fold_results = results['fold' + str(fold_number)]
        fpr = fold_results['fpr'][att]['Global'][key]
        tpr = fold_results['tpr'][att]['Global'][key]
        data.loc[f'fold{fold_number}', 'auc'] = sklearn.metrics.auc(fpr, tpr)
        # Columns after 'auc' receive the interpolated TPR values, in order.
        data.iloc[position, 1:] = np.interp(fpr_def, fpr, tpr)
    return data

# Accuracy table comparing two tagged FairCal result files ('new' vs 'old')
# on BFW / facenet-webface.
# This cell needs both
#   ../experiments/bfw/facenet-webface/faircal/beta/nbins_25_nclusters_100 (new).npy
#   ../experiments/bfw/facenet-webface/faircal/beta/nbins_25_nclusters_100 (old).npy
# to exist; it raises FileNotFoundError otherwise.
keys = ['baseline', 'faircal']
error = [1e-3, 1e-2]
# Raw strings: '\%' is an invalid escape sequence in a normal string literal.
# The resulting text (a LaTeX-escaped percent sign) is unchanged.
title_stat = ['AUROC', r'0.1\% FPR', r'1\% FPR']
n_clusters = 100
calibration = 'beta'

version = ['new', 'old']
data = pd.DataFrame(index=version)
metrics = ['auc', 'fpr_1e-3', 'fpr_1e-2']
for metric in metrics:
    data[metric] = ''

dataset = 'bfw'
feature = 'facenet-webface'
approach = 'faircal'

# The dataset is fixed, so these do not change between versions.
nbins = 25 if dataset == 'bfw' else 10
att = 'att' if dataset == 'bfw' else 'ethnicity'

for current in version:
    data_work = get_overall_stats_temp(calibration, nbins, dataset, feature, approach, att ,n_clusters,error, current)
    data_work *= 100  # report as percentages
    for metric in metrics:
        mean = data_work[metric].mean()
        std = data_work[metric].std()
        # Single .loc[row, column] write instead of the chained
        # `data.loc[current][metric] = ...`, which does not update `data`
        # under pandas Copy-on-Write. Fixed two-decimal formatting keeps
        # trailing zeros ("2.40" rather than "2.4").
        data.loc[current, metric] = f'{mean:.2f} ({std:.2f})'

print(data)

FileNotFoundError: [Errno 2] No such file or directory: '../experiments/bfw/facenet-webface/faircal/beta/nbins_25_nclusters_100 (new).npy'