# Tables for FairCal based on Salvador et al., 2022
## Reproduced by Group 42 of FACT-AI 2022/23 @ UvA

### Tables Reproduced

1. Global Accuracy Measures
2. Fairness Calibration
3. Predictive Equality ('fpr at fpr')
4. Equal Opportunity ('fnr at fnr')

### Imports

In [19]:
import numpy as np
import pandas as pd
pd.set_option("display.precision", 2)
import seaborn as sns
import sklearn.metrics
import matplotlib.pyplot as plt
import os

### Data Manager to load pickle files and provide data based on specific table of interest

In [62]:
class DataManager():
    '''
    This class stores the information about the data corresponding to a specific dataset, feature, approach, and objective combination, among other factors.
    Input: 
        dataset (str), e.g. 'rfw'
        feature (str), e.g. 'facenet'
        approach (str), e.g. 'faircal'
        objective (str), e.g. 'accuracy'
        ...
        
    Methods
        load_pickle: loads a pickle file into a results dictionary
        provide_data: returns a dataframe based on pre-specified objective and other self attributes
        get_sensitive_attributes_subgroups: returns a list of sensitive attributes and a dictionary with keys = sensitive attributes and values = subgroups imposed by that attribute

    '''
    def __init__(self, dataset, feature, approach, objective, calibration_method='beta', n_clusters=100, measure='ece', at_error='1e-2', subgroup='African', fpr_def=[1e-3, 1e-2]):
        self.dataset = dataset
        self.feature = feature
        self.approach = approach
        self.objective = objective
        self.calibration_method = calibration_method
        self.n_clusters = n_clusters
        self.measure = measure
        self.at_error = at_error
        self.subgroup = subgroup
        self.fpr_def = fpr_def
        self.errors = 'fpr at fpr' if objective == 'predictive_equality' else 'fnr at fnr'
        self.nbins = 25 if dataset == 'bfw' else 10
        self.att = 'att' if dataset == 'bfw' else 'ethnicity'
        self.key = 'calibration' if approach in ['faircal', 'baseline', 'gmm-discrete', 'oracle'] else 'pre_calibration'
        self.sensitive_attributes, self.subgroups = self.get_sensitive_attributes_subgroups(self.dataset)
        self.load_pickle()        

    def load_pickle(self):
        filename = f'../experiments/{self.dataset}/{self.feature}/{self.approach}/{self.calibration_method}/nbins_{self.nbins}'
        if self.approach in ['faircal', 'gmm-discrete'] :
            filename += f'_nclusters_{self.n_clusters}'
        if self.approach == 'fsn':
            filename += f'_nclusters_{self.n_clusters}_fpr_1e-03'
        self.results = np.load(f'{filename}.npy', allow_pickle=True).item()

    def provide_data(self, objective):
        data = pd.DataFrame()
        data['folds'] = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
        data = data.set_index('folds')
        
        if self.objective == 'accuracy':
            data['AUROC'] = np.nan
            data['TPR @ 0.1% FPR'] = np.nan
            data['TPR @ 1% FPR'] = np.nan
            for fold in range(1,6):
                fpr = self.results['fold'+str(fold)]['fpr'][self.att]['Global'][self.key]
                tpr = self.results['fold'+str(fold)]['tpr'][self.att]['Global'][self.key]
                data.loc[f'fold{str(fold)}', 'AUROC'] = sklearn.metrics.auc(fpr,tpr)
                inter = np.interp(self.fpr_def, fpr, tpr)
                data.iloc[fold-1, 1:] = inter
        
        elif self.objective == 'fairness_calibration':
            for fold in range(1, 6):
                for j, subgroup in enumerate(self.subgroups[self.att]):
                    data.loc[f'fold{str(fold)}', f'{subgroup}'] = self.results[f'fold{str(fold)}'][self.measure][self.att][subgroup]
      
        elif self.objective == 'predictive_equality' or objective == 'equal_opportunity':
            data[self.at_error] = np.nan
            
            for fold in range(1,6):
                if self.subgroup == 'Global':
                    data.iloc[fold-1,:] = self.at_error
                else:
                    fpr_global = self.results['fold'+str(fold)]['fpr'][self.att]['Global'][self.key]
                    tpr_global = self.results['fold'+str(fold)]['tpr'][self.att]['Global'][self.key]
                    thr_global = np.fmin(self.results['fold'+str(fold)]['thresholds'][self.att]['Global'][self.key], 1)

                    fpr = self.results['fold'+str(fold)]['fpr'][self.att][self.subgroup][self.key]
                    tpr = self.results['fold'+str(fold)]['tpr'][self.att][self.subgroup][self.key]
                    thr = np.fmin(self.results['fold'+str(fold)]['thresholds'][self.att][self.subgroup][self.key], 1)

                    if self.errors == 'fpr at fpr':
                        thr_at_error = np.interp(self.at_error,fpr_global,thr_global)
                        data.iloc[fold-1,:] = np.interp(thr_at_error,thr[::-1],fpr[::-1])
                    elif self.errors == 'fnr at fnr':
                        thr_at_error = np.interp(1-np.array(self.at_error),tpr_global,thr_global)
                        data.iloc[fold-1,:] = 1-np.interp(thr_at_error,thr[::-1],tpr[::-1])  
        else:
            print('Please specify a valid objective.')
        return data

    @staticmethod
    def get_sensitive_attributes_subgroups(dataset):
        '''
        A helper function to get sensitive attributes per subgroup, depending on the dataset.
        
        '''
        if dataset == 'rfw':
            sensitive_attributes = ['ethnicity']
            subgroups = {'ethnicity':['African', 'Asian', 'Caucasian', 'Indian']}
        else:
            sensitive_attributes = ['e', 'g', 'att']
            subgroups = {
                'e':['B', 'A', 'W', 'I'],
                'g':['F','M'],
                'att': ['black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males']
            }
        return sensitive_attributes, subgroups
        

### Tables

#### A general multi-index table

In [54]:
def create_multi_table(objective):
    approaches = ['baseline', 'agenda', 'fsn', 'faircal', 'oracle']
    tuples = []
    
    if objective == 'fairness_calibration':
        indices = {
        'rfw' : {
            'facenet': ['African', 'Asian', 'Caucasian', 'Indian'],
            'facenet-webface': ['African', 'Asian', 'Caucasian', 'Indian'],
        },
        'bfw' : {
            'facenet-webface': ['B', 'A', 'W', 'I', 'F','M', 
                'black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males'],
            'arcface': ['B', 'A', 'W', 'I', 'F','M', 
                'black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males'],
            },
        }
        for dataset in indices:
            for feature, sens in indices[dataset].items():
                for att in sens:
                    for approach in approaches:
                        tuples.append((dataset, feature, att, approach))
        index = pd.MultiIndex.from_tuples(tuples, names=['dataset', 'feature', 'subgroup', 'approach'])
    
    else:
        indices = {
        'rfw' : ['facenet', 'facenet-webface'],
        'bfw' : ['facenet-webface', 'arcface']
        }
        for dataset in indices:
            for feature in indices[dataset]:
                for approach in approaches:
                    tuples.append((dataset, feature, approach))
        index = pd.MultiIndex.from_tuples(tuples, names=['dataset', 'feature', 'approach'])
    
    data = pd.DataFrame(index=index)
    return approaches, indices, data

In [55]:
def get_table_for(objective, global_error=1e-2):
    approaches, indices, data = create_multi_table(objective)

    if objective == 'accuracy':
        metrics = ['AUROC', 'TPR @ 0.1% FPR', 'TPR @ 1% FPR']
        for metric in metrics:
            data[metric] = ''
        error = [1e-3, 1e-2]
    
        for dataset in indices:
            for feature in indices[dataset]:
                for approach in approaches:
                    data_object = DataManager(dataset, feature, approach, objective, fpr_def=error)
                    data_work = data_object.provide_data(objective)
                    data_work *= 100
                    for metric in metrics:
                        mean = '%.2f' % data_work[metric].mean()
                        std = '%.2f' % data_work[metric].std()
                        data.loc[dataset, feature, approach][metric] = f'{str(mean)} ({str(std)})'
    
    elif objective == 'fairness_calibration':
        metrics = ['mean', 'aad', 'mad', 'std']
        for metric in metrics:
            data[metric] = np.nan

        for dataset in indices:
            for feature in indices[dataset]:
                for approach in approaches:
                    sensitive_attributes = indices[dataset][feature]
                    for att in sensitive_attributes:
                        data_object = DataManager(dataset, feature, approach, objective)
                        data_work = data_object.provide_data(objective)
                        data_work = data_work * 100
                        for subgroup in data_work.columns:
                            group_mean = data_work[subgroup].mean()
                            data.loc[dataset, feature, subgroup, approach]['mean'] = group_mean
                            data.loc[dataset, feature, subgroup, approach]['aad'] = np.abs(data_work[subgroup] - group_mean).mean()
                            data.loc[dataset, feature, subgroup, approach]['mad'] = np.abs(data_work[subgroup] - group_mean).max()
                            data.loc[dataset, feature, subgroup, approach]['std'] = np.std(data_work[subgroup])
    
    elif objective == 'predictive_equality' or objective == 'equal_opportunity':
        metrics = ['aad', 'mad', 'std']
        for metric in metrics:
            data[metric] = ''

        for dataset in indices:
            att = 'att' if dataset == 'bfw' else 'ethnicity'
            _, subgroups = DataManager.get_sensitive_attributes_subgroups(dataset) 
            for feature in indices[dataset]:
                for approach in approaches:
                    folds_x_subgroups = pd.DataFrame() 
                    for i, subgroup in enumerate(subgroups[att]):
                        data_object = DataManager(dataset, feature, approach, objective, subgroup=subgroup, at_error=global_error)
                        folds_x_subgroups[subgroup] = data_object.provide_data(objective)

                    # computing metrics for averaged over each subgroup
                    mean = folds_x_subgroups.mean(axis=1)
                    data_work = {'aad': folds_x_subgroups.sub(mean, axis=0).abs().mean(axis=1),
                                 'mad': folds_x_subgroups.sub(mean, axis=0).abs().max(axis=1),
                                 'std': folds_x_subgroups.std(axis=1)
                                }

                    for metric in metrics:
                        mean = '%.2f' % (data_work[metric].mean() * 100)
                        std = '%.2f' % (data_work[metric].std() * 100)
                        data.loc[dataset, feature, approach][metric] = f'{mean} ({std})'
    else:
        print('Please specify a valid objective.')
    
    return data

#### 1. Global accuracy measures

In [150]:
accuracy_table = get_table_for('accuracy')
accuracy_table = accuracy_table.reset_index()
accuracy_table = accuracy_table.pivot(index='approach', columns=['dataset', 'feature']).reorder_levels(['dataset', 'feature', None], axis=1)

good_order = []
for tup in [('rfw', 'facenet'), ('rfw', 'facenet-webface'), ('bfw', 'facenet-webface'), ('bfw', 'arcface')]:
    for metric in ['AUROC', 'TPR @ 0.1% FPR', 'TPR @ 1% FPR']: 
        good_order.append(tup + (metric,))
accuracy_table = accuracy_table[good_order]

print(accuracy_table.to_latex())


TypeError: can only concatenate tuple (not "str") to tuple

#### 2. Fairness Calibration

In [48]:
fairness_cal_table = get_table_for('fairness_calibration')
from IPython.display import HTML, display
HTML(fairness_cal_table.to_html())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean,aad,mad,std
dataset,feature,subgroup,approach,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rfw,facenet,African,baseline,6.87,0.97,1.28,1.01
rfw,facenet,African,faircal,3.87,0.96,1.91,1.08
rfw,facenet,African,gmm-discrete,3.75,0.91,1.86,1.08
rfw,facenet,Asian,baseline,6.06,0.38,0.95,0.49
rfw,facenet,Asian,faircal,3.32,0.57,0.79,0.6
rfw,facenet,Asian,gmm-discrete,3.5,1.13,2.67,1.44
rfw,facenet,Caucasian,baseline,11.84,0.53,0.71,0.56
rfw,facenet,Caucasian,faircal,2.6,0.87,1.34,0.98
rfw,facenet,Caucasian,gmm-discrete,2.71,0.47,1.04,0.6
rfw,facenet,Indian,baseline,4.33,0.61,1.38,0.75


#### 3. Predictive Equality ('fpr at fpr') and 4. Equal Opportunity ('fnr at fnr')

In [52]:
objectives = ['predictive_equality', 'equal_opportunity']
global_errors = [1e-3, 1e-2]


objectives = ['equal_opportunity']
for objective in objectives:
    for global_error in global_errors:
        current_df = get_table_for(objective, global_error=global_error)
        print(f'\n{objective} at a global error rate = {global_error * 100}%')                
        display(HTML(current_df.to_html()))


equal_opportunity at a global error rate = 0.1%


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,aad,mad,std
dataset,feature,approach,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rfw,facenet,baseline,0.14 (0.01),0.27 (0.02),0.18 (0.02)
rfw,facenet,faircal,0.12 (0.03),0.24 (0.05),0.16 (0.03)
rfw,facenet,gmm-discrete,0.12 (0.01),0.25 (0.01),0.17 (0.01)
rfw,facenet-webface,baseline,0.09 (0.03),0.18 (0.08),0.12 (0.05)
rfw,facenet-webface,faircal,0.12 (0.01),0.24 (0.02),0.16 (0.01)
rfw,facenet-webface,gmm-discrete,0.10 (0.05),0.21 (0.09),0.14 (0.06)
bfw,facenet-webface,baseline,0.09 (0.02),0.24 (0.08),0.12 (0.03)
bfw,facenet-webface,faircal,0.10 (0.01),0.24 (0.08),0.13 (0.02)
bfw,facenet-webface,gmm-discrete,0.09 (0.03),0.22 (0.09),0.12 (0.03)
bfw,arcface,baseline,0.10 (0.03),0.29 (0.15),0.14 (0.05)



equal_opportunity at a global error rate = 1.0%


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,aad,mad,std
dataset,feature,approach,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rfw,facenet,baseline,0.53 (0.13),0.97 (0.36),0.72 (0.23)
rfw,facenet,faircal,0.39 (0.26),0.55 (0.31),0.47 (0.30)
rfw,facenet,gmm-discrete,0.39 (0.08),0.75 (0.15),0.54 (0.11)
rfw,facenet-webface,baseline,0.44 (0.23),0.79 (0.51),0.58 (0.33)
rfw,facenet-webface,faircal,0.43 (0.14),0.64 (0.25),0.54 (0.18)
rfw,facenet-webface,gmm-discrete,0.47 (0.18),0.86 (0.43),0.64 (0.29)
bfw,facenet-webface,baseline,0.50 (0.09),0.98 (0.21),0.61 (0.11)
bfw,facenet-webface,faircal,0.43 (0.15),0.89 (0.44),0.55 (0.19)
bfw,facenet-webface,gmm-discrete,0.58 (0.14),1.62 (0.66),0.82 (0.22)
bfw,arcface,baseline,0.56 (0.22),1.60 (0.77),0.79 (0.31)


### Visualising example results for predictive equality and equal opportunity

### Example conversion from dataframe to latex

In [26]:
txt = current_df.to_latex()
print(txt)

\begin{tabular}{llllll}
\toprule
    &         &         &          aad &          mad &          std \\
dataset & feature & approach &              &              &              \\
\midrule
rfw & facenet & baseline &  0.53 (0.13) &  0.97 (0.36) &  0.72 (0.23) \\
    &         & faircal &  0.39 (0.26) &  0.55 (0.31) &  0.47 (0.30) \\
    & facenet-webface & baseline &  0.44 (0.23) &  0.79 (0.51) &  0.58 (0.33) \\
    &         & faircal &  0.43 (0.14) &  0.64 (0.25) &  0.54 (0.18) \\
bfw & facenet-webface & baseline &  0.50 (0.09) &  0.98 (0.21) &  0.61 (0.11) \\
    &         & faircal &  0.43 (0.15) &  0.89 (0.44) &  0.55 (0.19) \\
    & arcface & baseline &  0.56 (0.22) &  1.60 (0.77) &  0.79 (0.31) \\
    &         & faircal &  0.59 (0.22) &  1.61 (0.85) &  0.81 (0.33) \\
\bottomrule
\end{tabular}



  txt = current_df.to_latex()


### Useful title/label conversions for latex

In [27]:
title_approaches = {
    'baseline':'Naive',
    # 'fsn':'Fair Score',
    'faircal':'FairCal (Ours)',
    # 'oracle':'Oracle (Ours)'
    }
title_calibration_methods = {
    'beta': 'Beta Calibration'
}
title_features = {
    'facenet':'FaceNet (VGGFace2)',
    'facenet-webface':'FaceNet (Webface)',
    'arcface': 'ArcFace'}
title_metrics = {
    'mean': 'Mean',
    'aad': 'AAD',
    'mad': 'MAD',
    'std': 'STD'}
caption_metrics = {
     'mean': 'Mean',
     'aad': 'AAD (Average Absolute Deviation)',
     'mad': 'MAD (Maximum Absolute Deviation)',
     'std': 'STD (Standard Deviation)',
}
title_keys = {
    'baseline': 'Baseline',
    # 'agenda': 'AGENDA',
    # 'ftc': 'FTC',
    # 'fsn': 'FSN',
    'faircal': 'FairCal (Ours)',
    # 'oracle': 'Oracle (Ours)'
    }
header_titles = {
    'African': 'Af',
    'Asian': 'As',
    'Caucasian': 'Ca',
    'Indian': 'In',
    'asian_females': 'AsF',
    'asian_males': 'AsM',
    'black_females': 'AfF',
    'black_males': 'AfM',
    'indian_females': 'IF',
    'indian_males': 'IM',
    'white_females': 'CF',
    'white_males': 'CM',
    'Global': 'Gl',
    'B': 'Af',
    'A': 'As',
    'W': 'C',
    'I': 'I',
    'F': 'F',
    'M': 'M'
}
title_datasets = {
    'rfw': 'RFW',
    'bfw': 'BFW'
}
caption_calibration_methods = {
    'beta': 'beta calibration'
}
caption_measures = {
    'ks': 'KS'
}
features_datasets = {
    'rfw': ['facenet', 'facenet-webface'],
    'bfw': ['facenet-webface']
}
attributes_datasets = {
    'rfw': 'ethnicity',
    'bfw': 'att',
}