In [65]:
import numpy as np
import pandas as pd
pd.set_option("display.precision", 2)
import seaborn as sns
import sklearn.metrics
import matplotlib.pyplot as plt
import os

In [66]:
title_approaches = {
    'baseline':'Naive',
    # 'fsn':'Fair Score',
    'faircal':'FairCal (Ours)',
    # 'oracle':'Oracle (Ours)',
    'gmm-discrete':'GMM-Discrete (Ours)'
    }
title_calibration_methods = {
    'beta': 'Beta Calibration'
}
title_features = {
    'facenet':'FaceNet (VGGFace2)',
    'facenet-webface':'FaceNet (Webface)',
    'arcface': 'ArcFace'}
title_metrics = {
    'mean': 'Mean',
    'aad': 'AAD',
    'mad': 'MAD',
    'std': 'STD'}
caption_metrics = {
     'mean': 'Mean',
     'aad': 'AAD (Average Absolute Deviation)',
     'mad': 'MAD (Maximum Absolute Deviation)',
     'std': 'STD (Standard Deviation)',
}
title_keys = {
    'baseline': 'Baseline',
    # 'agenda': 'AGENDA',
    # 'ftc': 'FTC',
    # 'fsn': 'FSN',
    'faircal': 'FairCal (Ours)',
    # 'oracle': 'Oracle (Ours)',
    'gmm-discrete': 'GMM-Discrete (Ours)'
    }
header_titles = {
    'African': 'Af',
    'Asian': 'As',
    'Caucasian': 'Ca',
    'Indian': 'In',
    'asian_females': 'AsF',
    'asian_males': 'AsM',
    'black_females': 'AfF',
    'black_males': 'AfM',
    'indian_females': 'IF',
    'indian_males': 'IM',
    'white_females': 'CF',
    'white_males': 'CM',
    'Global': 'Gl',
    'B': 'Af',
    'A': 'As',
    'W': 'C',
    'I': 'I',
    'F': 'F',
    'M': 'M'
}
title_datasets = {
    'rfw': 'RFW',
    'bfw': 'BFW'
}
caption_calibration_methods = {
    'beta': 'beta calibration'
}
caption_measures = {
    'ks': 'KS'
}
features_datasets = {
    'rfw': ['facenet', 'facenet-webface'],
    'bfw': ['facenet-webface', 'arcnet']
}
attributes_datasets = {
    'rfw': 'ethnicity',
    'bfw': 'att',
}

In [67]:
def load_measures(dataset, feature, approach, subgroups, att, measure, calibration_method, nbins, n_clusters):
    filename = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
    if approach == 'faircal':
        filename += f'_nclusters_{n_clusters}'
    if approach == 'fsn':
        filename += f'_nclusters_{n_clusters}_fpr_1e-03'

    results = np.load(f'{filename}.npy', allow_pickle=True).item()

    data = pd.DataFrame()
    data['folds'] = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
    data = data.set_index('folds')
    for fold in range(1, 6):
        for j, subgroup in enumerate(subgroups[att]):
            data.loc[f'fold{str(fold)}', f'{subgroup}'] = results[f'fold{str(fold)}'][measure][att][subgroup]
    return data

In [68]:
subgroups = {
            'e':['B', 'A', 'W', 'I'],
            'g':['F','M'],
            'att': ['black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males']
        }
att = 'e'   
test = load_measures('bfw','facenet-webface','gmm-discrete',subgroups,att,'ks','beta',25,100)
print(test)

          B     A     W     I
folds                        
fold1  0.02  0.02  0.05  0.02
fold2  0.02  0.01  0.05  0.02
fold3  0.04  0.02  0.05  0.07
fold4  0.03  0.04  0.06  0.02
fold5  0.03  0.04  0.03  0.04


In [69]:
def get_sensitive_attributes_subgroups(dataset):
    if dataset == 'rfw':
        sensitive_attributes = ['ethnicity']
        subgroups = {'ethnicity':['African', 'Asian', 'Caucasian', 'Indian']}
    elif 'bfw' in dataset:
        sensitive_attributes = ['e', 'g', 'att']
        subgroups = {
            'e':['B', 'A', 'W', 'I'],
            'g':['F','M'],
            'att': ['black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males']
        }
    return sensitive_attributes, subgroups

In [87]:
ks = np.array([5,10,15,20,25,50,75,100])
folds = [1,2,3,4,5]
data = pd.DataFrame()

measure = 'ks'
calibration = 'beta'

indices = {
    'rfw' : {
        'facenet': ['African', 'Asian', 'Caucasian', 'Indian'],
        'facenet-webface': ['African', 'Asian', 'Caucasian', 'Indian'],
    },
    'bfw' : {
        'facenet-webface': ['B', 'A', 'W', 'I', 'F','M', 
            'black_females', 'black_males', 'asian_females', 'asian_males', 'white_females', 'white_males', 'indian_females', 'indian_males']
        }
    }

# Create tuples from multi-indices
approaches = ['baseline', 'faircal', 'gmm-discrete']
tuples = []
for dataset in indices:
    for feature, sens in indices[dataset].items():
        for att in sens:
            for approach in approaches:
                tuples.append((dataset, feature, att, approach))

index = pd.MultiIndex.from_tuples(tuples, names=['dataset', 'feature', 'attribute', 'approach'])

data = pd.DataFrame(index=index)
for metric in ['mean', 'aad', 'mad', 'std']:
    data[metric] = np.nan

# For now, because we only have one experiment
for dataset in indices:
    for feature in indices[dataset]:
        for approach in approaches:
            sensitive_attributes, subgroups = get_sensitive_attributes_subgroups(dataset)
            for att in sensitive_attributes:
                nbins = 25 if dataset == 'bfw' else 10
                data_work = load_measures(dataset, feature, approach, subgroups, att, 'ks', 'beta', nbins=nbins, n_clusters=100)
                data_work = data_work * 100
                for subgroup in data_work.columns:
                    group_mean = data_work[subgroup].mean()
                    data.loc[dataset, feature, subgroup, approach]['mean'] = group_mean
                    data.loc[dataset, feature, subgroup, approach]['aad'] = np.abs(data_work[subgroup] - group_mean).mean()
                    data.loc[dataset, feature, subgroup, approach]['mad'] = np.abs(data_work[subgroup] - group_mean).max()
                    data.loc[dataset, feature, subgroup, approach]['std'] = np.std(data_work[subgroup])

print(data.to_html())

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th>mean</th>
      <th>aad</th>
      <th>mad</th>
      <th>std</th>
    </tr>
    <tr>
      <th>dataset</th>
      <th>feature</th>
      <th>attribute</th>
      <th>approach</th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th rowspan="24" valign="top">rfw</th>
      <th rowspan="12" valign="top">facenet</th>
      <th rowspan="3" valign="top">African</th>
      <th>baseline</th>
      <td>6.29</td>
      <td>0.56</td>
      <td>1.00</td>
      <td>0.61</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>1.71</td>
      <td>0.67</td>
      <td>0.83</td>
      <td>0.69</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>1.77</td>
      <td>0.50</td>
      <td>0.81</td>
      <td>0.53</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">Asian</th

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th>mean</th>
      <th>aad</th>
      <th>mad</th>
      <th>std</th>
    </tr>
    <tr>
      <th>dataset</th>
      <th>feature</th>
      <th>attribute</th>
      <th>approach</th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th rowspan="24" valign="top">rfw</th>
      <th rowspan="12" valign="top">facenet</th>
      <th rowspan="3" valign="top">African</th>
      <th>baseline</th>
      <td>6.29</td>
      <td>0.56</td>
      <td>1.00</td>
      <td>0.61</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>1.71</td>
      <td>0.67</td>
      <td>0.83</td>
      <td>0.69</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>1.77</td>
      <td>0.50</td>
      <td>0.81</td>
      <td>0.53</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">Asian</th>
      <th>baseline</th>
      <td>5.66</td>
      <td>0.40</td>
      <td>0.63</td>
      <td>0.43</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>1.58</td>
      <td>0.47</td>
      <td>0.75</td>
      <td>0.53</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>1.63</td>
      <td>0.39</td>
      <td>0.87</td>
      <td>0.50</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">Caucasian</th>
      <th>baseline</th>
      <td>11.22</td>
      <td>0.85</td>
      <td>1.09</td>
      <td>0.91</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>1.36</td>
      <td>0.31</td>
      <td>0.49</td>
      <td>0.35</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>1.46</td>
      <td>0.15</td>
      <td>0.29</td>
      <td>0.18</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">Indian</th>
      <th>baseline</th>
      <td>1.99</td>
      <td>0.91</td>
      <td>1.63</td>
      <td>0.98</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>2.07</td>
      <td>0.68</td>
      <td>1.24</td>
      <td>0.79</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>2.24</td>
      <td>1.13</td>
      <td>1.99</td>
      <td>1.21</td>
    </tr>
    <tr>
      <th rowspan="12" valign="top">facenet-webface</th>
      <th rowspan="3" valign="top">African</th>
      <th>baseline</th>
      <td>4.32</td>
      <td>0.59</td>
      <td>1.46</td>
      <td>0.76</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>1.57</td>
      <td>0.19</td>
      <td>0.46</td>
      <td>0.24</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>1.50</td>
      <td>0.31</td>
      <td>0.78</td>
      <td>0.41</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">Asian</th>
      <th>baseline</th>
      <td>5.34</td>
      <td>0.33</td>
      <td>0.80</td>
      <td>0.44</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>1.50</td>
      <td>0.16</td>
      <td>0.29</td>
      <td>0.18</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>1.38</td>
      <td>0.23</td>
      <td>0.47</td>
      <td>0.28</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">Caucasian</th>
      <th>baseline</th>
      <td>10.16</td>
      <td>0.42</td>
      <td>0.80</td>
      <td>0.50</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>1.74</td>
      <td>0.47</td>
      <td>0.66</td>
      <td>0.51</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>2.59</td>
      <td>0.52</td>
      <td>0.87</td>
      <td>0.58</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">Indian</th>
      <th>baseline</th>
      <td>2.39</td>
      <td>1.07</td>
      <td>2.03</td>
      <td>1.26</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>2.19</td>
      <td>0.92</td>
      <td>1.50</td>
      <td>1.06</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>2.84</td>
      <td>1.06</td>
      <td>1.65</td>
      <td>1.17</td>
    </tr>
    <tr>
      <th rowspan="42" valign="top">bfw</th>
      <th rowspan="42" valign="top">facenet-webface</th>
      <th rowspan="3" valign="top">B</th>
      <th>baseline</th>
      <td>1.61</td>
      <td>0.55</td>
      <td>1.37</td>
      <td>0.72</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>2.48</td>
      <td>0.49</td>
      <td>1.05</td>
      <td>0.61</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>2.69</td>
      <td>0.60</td>
      <td>1.50</td>
      <td>0.85</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">A</th>
      <th>baseline</th>
      <td>3.73</td>
      <td>0.53</td>
      <td>1.27</td>
      <td>0.69</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>2.65</td>
      <td>0.56</td>
      <td>0.92</td>
      <td>0.64</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>2.72</td>
      <td>0.89</td>
      <td>1.40</td>
      <td>0.99</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">W</th>
      <th>baseline</th>
      <td>8.96</td>
      <td>0.45</td>
      <td>0.77</td>
      <td>0.50</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>2.66</td>
      <td>0.56</td>
      <td>0.84</td>
      <td>0.63</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>4.71</td>
      <td>0.61</td>
      <td>1.52</td>
      <td>0.86</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">I</th>
      <th>baseline</th>
      <td>2.33</td>
      <td>1.22</td>
      <td>2.18</td>
      <td>1.31</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>2.83</td>
      <td>1.42</td>
      <td>2.64</td>
      <td>1.61</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>3.22</td>
      <td>1.54</td>
      <td>3.57</td>
      <td>1.89</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">F</th>
      <th>baseline</th>
      <td>1.45</td>
      <td>0.41</td>
      <td>0.78</td>
      <td>0.46</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>1.74</td>
      <td>0.53</td>
      <td>0.83</td>
      <td>0.56</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>2.65</td>
      <td>0.75</td>
      <td>1.06</td>
      <td>0.77</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">M</th>
      <th>baseline</th>
      <td>1.77</td>
      <td>0.53</td>
      <td>1.22</td>
      <td>0.73</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>1.38</td>
      <td>0.39</td>
      <td>0.64</td>
      <td>0.45</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>1.66</td>
      <td>0.68</td>
      <td>1.33</td>
      <td>0.78</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">black_females</th>
      <th>baseline</th>
      <td>5.04</td>
      <td>1.26</td>
      <td>1.97</td>
      <td>1.41</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>4.30</td>
      <td>1.26</td>
      <td>1.93</td>
      <td>1.38</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>5.29</td>
      <td>1.43</td>
      <td>3.14</td>
      <td>1.69</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">black_males</th>
      <th>baseline</th>
      <td>1.83</td>
      <td>0.34</td>
      <td>0.59</td>
      <td>0.37</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>2.45</td>
      <td>0.73</td>
      <td>1.30</td>
      <td>0.87</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>2.72</td>
      <td>1.46</td>
      <td>2.37</td>
      <td>1.56</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">asian_females</th>
      <th>baseline</th>
      <td>7.85</td>
      <td>1.84</td>
      <td>2.65</td>
      <td>2.00</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>3.55</td>
      <td>1.14</td>
      <td>1.56</td>
      <td>1.25</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>3.59</td>
      <td>1.14</td>
      <td>2.06</td>
      <td>1.27</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">asian_males</th>
      <th>baseline</th>
      <td>3.00</td>
      <td>0.82</td>
      <td>2.04</td>
      <td>1.05</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>4.75</td>
      <td>0.81</td>
      <td>2.03</td>
      <td>1.05</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>5.17</td>
      <td>1.41</td>
      <td>3.39</td>
      <td>1.78</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">white_females</th>
      <th>baseline</th>
      <td>12.34</td>
      <td>0.50</td>
      <td>0.97</td>
      <td>0.60</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>4.15</td>
      <td>1.15</td>
      <td>1.54</td>
      <td>1.18</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>7.38</td>
      <td>1.51</td>
      <td>3.77</td>
      <td>2.12</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">white_males</th>
      <th>baseline</th>
      <td>8.22</td>
      <td>0.83</td>
      <td>1.94</td>
      <td>1.09</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>2.63</td>
      <td>0.56</td>
      <td>1.22</td>
      <td>0.70</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>3.91</td>
      <td>0.48</td>
      <td>0.84</td>
      <td>0.56</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">indian_females</th>
      <th>baseline</th>
      <td>3.28</td>
      <td>0.30</td>
      <td>0.51</td>
      <td>0.34</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>3.91</td>
      <td>1.82</td>
      <td>2.89</td>
      <td>1.93</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>4.87</td>
      <td>2.31</td>
      <td>3.72</td>
      <td>2.52</td>
    </tr>
    <tr>
      <th rowspan="3" valign="top">indian_males</th>
      <th>baseline</th>
      <td>4.68</td>
      <td>1.78</td>
      <td>3.16</td>
      <td>2.05</td>
    </tr>
    <tr>
      <th>faircal</th>
      <td>3.28</td>
      <td>1.82</td>
      <td>2.96</td>
      <td>2.03</td>
    </tr>
    <tr>
      <th>gmm-discrete</th>
      <td>3.38</td>
      <td>2.07</td>
      <td>3.98</td>
      <td>2.29</td>
    </tr>
  </tbody>
</table>

In [74]:
def get_overall_stats(calibration_method, nbins, dataset,feature,approach,att,n_clusters,fpr_def):
    filename = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
    if approach == 'faircal':
        filename += f'_nclusters_{n_clusters}'
    if approach == 'fsn':
        filename += f'_nclusters_{n_clusters}_fpr_1e-03'
    key = 'calibration' if approach in ['faircal', 'baseline', 'gmm-discrete'] else 'pre_calibration'
    
    results = np.load(f'{filename}.npy', allow_pickle=True).item()
    data = pd.DataFrame()
    data['folds'] = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
    data['auc'] = np.nan
    data['fpr_1e-3'] = np.nan
    data['fpr_1e-2'] = np.nan
    data = data.set_index('folds')

    for fold in range(1,6):
        fpr = results['fold'+str(fold)]['fpr'][att]['Global'][key]
        tpr = results['fold'+str(fold)]['tpr'][att]['Global'][key]
        data.loc[f'fold{str(fold)}', 'auc'] = sklearn.metrics.auc(fpr,tpr)
        inter = np.interp(fpr_def, fpr, tpr)
        data.iloc[fold-1, 1:] = inter
    return data

In [77]:
# Accuracy table
keys = ['baseline', 'faircal', 'gmm-discrete']
error = [1e-3, 1e-2]
title_stat = ['AUROC', '0.1\% FPR', '1\% FPR']
n_clusters = 100
calibration = 'beta'
datasets = ['bfw', 'rfw']

indices = {
    'rfw' : ['facenet', 'facenet-webface'],
    'bfw' : ['facenet-webface', 'arcface']
}
approaches = ['baseline', 'faircal', 'gmm-discrete']

tuples = []
for dataset in indices:
    for feature in indices[dataset]:
        for approach in approaches:
            tuples.append((dataset, feature, approach))

index = pd.MultiIndex.from_tuples(tuples, names=['dataset', 'feature', 'approach'])
data = pd.DataFrame(index=index)
metrics = ['auc', 'fpr_1e-3', 'fpr_1e-2']
for metric in metrics:
    data[metric] = ''

for dataset in indices:
    for feature in indices[dataset]:
        for approach in approaches:
            nbins = 25 if dataset == 'bfw' else 10
            att = 'att' if dataset == 'bfw' else 'ethnicity'
            data_work = get_overall_stats(calibration, nbins, dataset, feature, approach, att ,n_clusters,error)
            data_work *= 100
            for metric in metrics:
                mean = round(data_work[metric].mean(), 2)
                std = round(data_work[metric].std(), 2)
                data.loc[dataset, feature, approach][metric] = f'{str(mean)} ({str(std)})'
print(data)

                                               auc      fpr_1e-3      fpr_1e-2
dataset feature         approach                                              
rfw     facenet         baseline      89.97 (0.58)  25.27 (6.51)   39.92 (2.4)
                        faircal       92.15 (0.45)  29.65 (2.57)  50.57 (3.59)
                        gmm-discrete   92.2 (0.37)  30.49 (3.46)  49.99 (4.16)
        facenet-webface baseline      84.46 (0.47)  11.14 (5.34)   26.45 (4.9)
                        faircal       86.96 (0.72)  23.16 (5.54)  33.94 (4.37)
                        gmm-discrete  86.87 (0.74)   18.5 (5.29)  32.25 (3.12)
bfw     facenet-webface baseline      94.62 (0.17)  27.93 (2.02)  52.79 (1.74)
                        faircal       95.65 (0.15)  38.07 (0.89)  60.08 (1.09)
                        gmm-discrete  95.26 (0.11)   33.68 (0.7)  57.33 (1.05)
        arcface         baseline      96.68 (0.38)  86.65 (1.44)  89.65 (1.19)
                        faircal       96.99 (0.41)  

In [25]:
def get_overall_stats_temp(calibration_method, nbins, dataset,feature,approach,att,n_clusters,fpr_def, new):
    filename = f'../experiments/{dataset}/{feature}/{approach}/{calibration_method}/nbins_{nbins}'
    if approach == 'faircal' or approach == 'gmm-discrete':
        filename += f'_nclusters_{n_clusters} ({new})'
    if approach == 'fsn':
        filename += f'_nclusters_{n_clusters}_fpr_1e-03'
    key = 'calibration' if approach in ['faircal', 'baseline', 'gmm-discrete'] else 'pre_calibration'
    
    results = np.load(f'{filename}.npy', allow_pickle=True).item()

    data = pd.DataFrame()
    data['folds'] = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
    data['auc'] = np.nan
    data['fpr_1e-3'] = np.nan
    data['fpr_1e-2'] = np.nan
    data = data.set_index('folds')

    for fold in range(1,6):
        fpr = results['fold'+str(fold)]['fpr'][att]['Global'][key]
        tpr = results['fold'+str(fold)]['tpr'][att]['Global'][key]
        data.loc[f'fold{str(fold)}', 'auc'] = sklearn.metrics.auc(fpr,tpr)
        inter = np.interp(fpr_def, fpr, tpr)
        data.iloc[fold-1, 1:] = inter
    return data

# Accuracy table
keys = ['baseline', 'faircal', 'gmm-discrete']
error = [1e-3, 1e-2]
title_stat = ['AUROC', '0.1\% FPR', '1\% FPR']
n_clusters = 100
calibration = 'beta'

version = ['new', 'old']
data = pd.DataFrame(index=version)
metrics = ['auc', 'fpr_1e-3', 'fpr_1e-2']
for metric in metrics:
    data[metric] = ''

dataset = 'bfw'
feature = 'facenet-webface'
approach = 'faircal'

for current in version:
    nbins = 25 if dataset == 'bfw' else 10
    att = 'att' if dataset == 'bfw' else 'ethnicity'
    data_work = get_overall_stats_temp(calibration, nbins, dataset, feature, approach, att ,n_clusters,error, current)
    data_work *= 100
    for metric in metrics:
        mean = round(data_work[metric].mean(), 2)
        std = round(data_work[metric].std(), 2)
        data.loc[current][metric] = f'{str(mean)} ({str(std)})'

print(data)

FileNotFoundError: [Errno 2] No such file or directory: '../experiments/bfw/facenet-webface/faircal/beta/nbins_25_nclusters_100 (new).npy'