# Calibration

Evaluating calibration methods on convolutional neural networks.

In [1]:
import numpy as np
import pandas as pd
from betacal import BetaCalibration
from os.path import join
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from cal_methods import HistogramBinning, TemperatureScaling, evaluate, cal_results

Using TensorFlow backend.


## Calibration approaches

#### 1-vs-K calibration
Histogram binning, isotonic regression and beta calibration are calibrated in 1-vs-K fashion. This means K different models are trained for K classes, so 1 model for each class
#### Multiclass calibration
Temperature scaling calibrates one model for all the classes together.

## Calibration of Predictions.

Paths to files with logits.

In [17]:
PATH = join('..','..', 'logits')
files_10 = ('probs_resnet_wide32_c10_logits.p', 'probs_densenet40_c10_logits.p',
            'probs_lenet5_c10_logits.p', 'probs_resnet110_SD_c10_logits.p',
           'probs_resnet110_c10_logits.p', 'probs_resnet152_SD_SVHN_logits.p')
files_100 = ('probs_resnet_wide32_c100_logits.p', 'probs_densenet40_c100_logits.p',
             'probs_lenet5_c100_logits.p', 'probs_resnet110_SD_c100_logits.p')
files_200 = ('probs_resnet50_birds_logits.p',)
files_1k = ('probs_resnet152_imgnet_logits.p', 'probs_densenet161_imgnet_logits.p')

files = ('probs_resnet110_c10_logits.p', 'probs_resnet110_c100_logits.p', 
         'probs_densenet40_c10_logits.p', 'probs_densenet40_c100_logits.p',
        'probs_resnet_wide32_c10_logits.p', 'probs_resnet_wide32_c100_logits.p',
         'probs_resnet50_birds_logits.p', 'probs_resnet110_SD_c10_logits.p',
         'probs_resnet110_SD_c100_logits.p', 'probs_resnet152_SD_SVHN_logits.p',
        'probs_resnet152_imgnet_logits.p', 'probs_densenet161_imgnet_logits.p' ,
         'probs_sky_243_logits.p'# ImageNet calibration takes rather long time.
        )


### Isotonic Regression

In [18]:
df_iso = cal_results(IsotonicRegression, PATH, files, {'y_min':0, 'y_max':1}, approach = "single")

resnet110_c10
file: ../../logits/probs_resnet110_c10_logits.p


UnpicklingError: invalid load key, 'v'.

### Temperature scaling

In [7]:
df_temp_scale = cal_results(TemperatureScaling, PATH, files_sky, approach = "all")


file: ../logits/p


FileNotFoundError: [Errno 2] No such file or directory: '../logits/p'

### Beta methods

In [15]:
df_beta = cal_results(BetaCalibration, PATH, files, {'parameters':"abm"}, approach = "single")

resnet110_c10
Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734
Error 5.460000; ece 0.009282; mce 0.192622; loss 0.179388, brier 0.797178
Time taken: 3.0589025020599365 

resnet110_c100
Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955
brier: 0.661139586803
Error 27.440000; ece 0.037628; mce 0.131316; loss 1.055094, brier 0.531179
Time taken: 3.998060464859009 

densenet40_c10
Accuracy: 92.42
Error: 7.58
ECE: 0.055002704373
MCE: 0.333955179269
Loss: 0.42820705658
brier: 0.819165176082
Error 6.220000; ece 0.010305; mce 0.089124; loss 0.190338, brier 0.787227
Time taken: 3.4932668209075928 

densenet40_c100
Accuracy: 70.0
Error: 30.0
ECE: 0.211562852757
MCE: 0.45400331452
Loss: 2.0173981798
brier: 0.655000941434
Error 29.000000; ece 0.054667; mce 0.160630; loss 1.134922, brier 0.506427
Time taken: 4.3965606689453125 

resnet_wide32_c10
Accuracy: 93.93
Error: 6.07
ECE: 0.0450543206543
MCE: 

In [16]:
df_beta_am = cal_results(BetaCalibration, PATH, files, {'parameters':"am"}, approach = "single")

resnet110_c10
Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734
Error 5.400000; ece 0.009133; mce 0.805941; loss 0.181602, brier 0.800448
Time taken: 4.529136657714844 

resnet110_c100
Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955
brier: 0.661139586803
Error 27.540000; ece 0.039731; mce 0.153032; loss 1.067957, brier 0.544350
Time taken: 4.575221538543701 

densenet40_c10
Accuracy: 92.42
Error: 7.58
ECE: 0.055002704373
MCE: 0.333955179269
Loss: 0.42820705658
brier: 0.819165176082
Error 6.260000; ece 0.015303; mce 0.257938; loss 0.192059, brier 0.790721
Time taken: 3.3815395832061768 

densenet40_c100
Accuracy: 70.0
Error: 30.0
ECE: 0.211562852757
MCE: 0.45400331452
Loss: 2.0173981798
brier: 0.655000941434
Error 29.280000; ece 0.063273; mce 0.156157; loss 1.154323, brier 0.528591
Time taken: 5.6050074100494385 

resnet_wide32_c10
Accuracy: 93.93
Error: 6.07
ECE: 0.0450543206543
MCE: 0

In [17]:
df_beta_ab = cal_results(BetaCalibration, PATH, files, {'parameters':"ab"}, approach = "single")

resnet110_c10
Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734
Error 5.400000; ece 0.010646; mce 0.821793; loss 0.179115, brier 0.799107
Time taken: 3.7485249042510986 

resnet110_c100
Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955
brier: 0.661139586803
Error 27.920000; ece 0.047538; mce 0.125963; loss 1.063148, brier 0.560795
Time taken: 4.245985507965088 

densenet40_c10
Accuracy: 92.42
Error: 7.58
ECE: 0.055002704373
MCE: 0.333955179269
Loss: 0.42820705658
brier: 0.819165176082
Error 6.180000; ece 0.010143; mce 0.128621; loss 0.190937, brier 0.788985
Time taken: 2.7198073863983154 

densenet40_c100
Accuracy: 70.0
Error: 30.0
ECE: 0.211562852757
MCE: 0.45400331452
Loss: 2.0173981798
brier: 0.655000941434
Error 29.740000; ece 0.063487; mce 0.207877; loss 1.145645, brier 0.529303
Time taken: 4.053758144378662 

resnet_wide32_c10
Accuracy: 93.93
Error: 6.07
ECE: 0.0450543206543
MCE: 0

### Histogram binning

In [18]:
df_hb = cal_results(HistogramBinning, PATH, files, {'M':15}, approach = "single")

resnet110_c10
Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734
Error 4.840000; ece 0.008652; mce 0.751707; loss 0.250228, brier 0.790489
Time taken: 4.652876138687134 

resnet110_c100
Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955
brier: 0.661139586803
Error 23.380000; ece 0.074361; mce 0.128896; loss 1.380544, brier 0.555317
Time taken: 21.35610008239746 

densenet40_c10
Accuracy: 92.42
Error: 7.58
ECE: 0.055002704373
MCE: 0.333955179269
Loss: 0.42820705658
brier: 0.819165176082
Error 5.300000; ece 0.008776; mce 0.191703; loss 0.263463, brier 0.778683
Time taken: 5.367844104766846 

densenet40_c100
Accuracy: 70.0
Error: 30.0
ECE: 0.211562852757
MCE: 0.45400331452
Loss: 2.0173981798
brier: 0.655000941434
Error 25.060000; ece 0.143022; mce 0.206992; loss 1.558495, brier 0.517520
Time taken: 21.429028749465942 

resnet_wide32_c10
Accuracy: 93.93
Error: 6.07
ECE: 0.0450543206543
MCE: 0.

#### Calibrated scores for Imagenet datasets.

In [101]:
df_iso

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet152_imgnet,23.796,0.06543,0.142914,0.988479,0.647568
1,resnet152_imgnet_calib,28.156,0.070545,0.110293,2.852046,0.585573
2,densenet161_imgnet,22.952,0.057199,0.130711,0.943955,0.651801
3,densenet161_imgnet_calib,26.82,0.069016,0.133854,2.88574,0.597491


In [99]:
df_temp_scale

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet152_imgnet,23.796,0.06543,0.142914,0.988479,0.647568
1,resnet152_imgnet_calib,23.796,0.020775,0.067791,0.942067,0.594931
2,densenet161_imgnet,22.952,0.057199,0.130711,0.943955,0.651801
3,densenet161_imgnet_calib,22.952,0.019422,0.049666,0.909261,0.604853


In [105]:
df_beta

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet152_imgnet,23.796,0.06543,0.142914,0.988479,0.647568
1,resnet152_imgnet_calib,23.716,0.034685,0.088536,0.994656,0.596155
2,densenet161_imgnet,22.952,0.057199,0.130711,0.943955,0.651801
3,densenet161_imgnet_calib,22.768,0.030964,0.108879,0.973253,0.606716


In [108]:
df_beta_am

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet152_imgnet,23.796,0.06543,0.142914,0.988479,0.647568
1,resnet152_imgnet_calib,23.76,0.033151,0.066534,0.976855,0.603427
2,densenet161_imgnet,22.952,0.057199,0.130711,0.943955,0.651801
3,densenet161_imgnet_calib,22.82,0.03024,0.070295,0.94779,0.613146


In [109]:
df_beta_ab

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet152_imgnet,23.796,0.06543,0.142914,0.988479,0.647568
1,resnet152_imgnet_calib,23.84,0.032485,0.108533,0.973891,0.602902
2,densenet161_imgnet,22.952,0.057199,0.130711,0.943955,0.651801
3,densenet161_imgnet_calib,22.904,0.028832,0.071684,0.950764,0.60879


## Dataframe with results 

In [21]:
dfs = [df_hb, df_iso, df_temp_scale, df_beta, df_beta_am, df_beta_ab]
names = ["Name", "Uncalibrated", "Histogram Binning", "Isotonic Regression", "Temperature Scaling", "Beta Calibration",
        "BC am", "BC ab"]


def get_dataframe(dfs, column, names):

    df_res = pd.DataFrame(columns=names)

    for i in range(1, len(df_iso), 2):

        name = dfs[0].iloc[i-1]["Name"] # Get name of method
        uncalibrated = dfs[0].iloc[i-1][column]  # Get uncalibrated score

        row = [name, uncalibrated]  # Add scores to row

        for df in dfs:
            row.append(df.iloc[i][column])

        df_res.loc[(i-1)//2] = row
        #errordf.iloc[i]
    
    df_res.set_index('Name', inplace = True)
        
    return df_res

In [22]:
df_error = get_dataframe(dfs, "Error", names)
df_ece = get_dataframe(dfs, "ECE", names)
df_mce = get_dataframe(dfs, "MCE", names)
df_loss = get_dataframe(dfs, "Loss", names)
df_brier = get_dataframe(dfs, "Brier", names)

## Scores

In [23]:
def highlight_min(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_max]

## Error Rate

In [24]:
df_error.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,6.44,6.59,6.36,6.44,6.44,6.4,6.45
resnet110_c100,28.52,31.26,29.31,28.52,28.36,28.28,28.47
densenet40_c10,7.58,7.93,7.65,7.58,7.59,7.59,7.57
densenet40_c100,30.0,32.49,30.22,30.0,29.81,29.7,30.02
resnet_wide32_c10,6.07,6.18,5.98,6.07,5.94,5.85,6.06
resnet_wide32_c100,26.18,28.89,26.33,26.18,25.98,26.03,26.18
resnet50_birds,26.8554,43.7694,34.3114,26.8554,27.2006,26.9934,27.166
resnet110_SD_c10,5.96,6.2,5.91,5.96,5.86,5.91,5.96
resnet110_SD_c100,27.17,29.74,27.47,27.17,26.32,26.3,27.12
resnet152_SD_SVHN,1.84773,2.07437,1.95529,1.84773,1.82468,1.82084,1.84773


## ECE

In [25]:
df_ece.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,0.0475035,0.0125078,0.0147359,0.0113208,0.0142481,0.0128235,0.0137544
resnet110_c100,0.184805,0.0905575,0.0653516,0.0237971,0.0460063,0.0480109,0.0527756
densenet40_c10,0.0550027,0.021302,0.0168437,0.00946355,0.0170027,0.0155363,0.0156559
densenet40_c100,0.211563,0.119705,0.0525144,0.00902072,0.0603064,0.0633407,0.062958
resnet_wide32_c10,0.0450543,0.0101492,0.0118988,0.00783782,0.00966474,0.0102104,0.00935039
resnet_wide32_c100,0.187838,0.0763677,0.058183,0.0147224,0.0701321,0.0571588,0.0586123
resnet50_birds,0.023535,0.126325,0.0864541,0.0168713,0.0310897,0.0235877,0.0201728
resnet110_SD_c10,0.0411256,0.0118827,0.010278,0.00555243,0.00979727,0.00905688,0.00862628
resnet110_SD_c100,0.158609,0.0759842,0.0521188,0.0121431,0.0355159,0.0364592,0.0431665
resnet152_SD_SVHN,0.00862197,0.00527611,0.00245605,0.006071,0.00502064,0.00497707,0.00488229


## MCE

In [26]:
df_mce.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,0.295799,0.444502,0.245814,0.236393,0.262211,0.25012,0.244292
resnet110_c100,0.398817,0.316352,0.133803,0.0709914,0.11442,0.120193,0.127771
densenet40_c10,0.333955,0.451015,0.0849184,0.0992925,0.250324,0.146299,0.105473
densenet40_c100,0.454003,0.169395,0.121643,0.0221282,0.129234,0.143593,0.106531
resnet_wide32_c10,0.372155,0.32977,0.267203,0.0705982,0.745817,0.245661,0.23136
resnet_wide32_c100,0.456392,0.418677,0.140664,0.036059,0.151586,0.146253,0.146764
resnet50_birds,0.272882,0.428966,0.228927,0.271395,0.131822,0.0894848,0.0953204
resnet110_SD_c10,0.324843,0.490926,0.08144,0.0782316,0.100296,0.138632,0.106758
resnet110_SD_c100,0.482906,0.289727,0.115797,0.0409927,0.1083,0.115986,0.127269
resnet152_SD_SVHN,0.250317,0.222757,0.247784,0.182437,0.259019,0.216105,0.18475


## Loss

In [27]:
df_loss.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,0.358274,0.547192,0.270758,0.209261,0.21385,0.215255,0.212043
resnet110_c100,1.69371,4.21391,1.89263,1.09169,1.13182,1.12919,1.13673
densenet40_c10,0.428207,0.572461,0.277319,0.225086,0.239184,0.239227,0.238213
densenet40_c100,2.0174,4.18285,1.64908,1.05713,1.15317,1.16477,1.15021
resnet_wide32_c10,0.381704,0.513157,0.232681,0.191482,0.202013,0.203047,0.204758
resnet_wide32_c100,1.80215,3.9352,1.56074,0.944526,1.03864,1.04555,1.03759
resnet50_birds,0.985922,10.5162,4.36454,0.986208,1.29258,1.2689,1.18341
resnet110_SD_c10,0.303252,0.499434,0.254419,0.177605,0.185573,0.186039,0.185175
resnet110_SD_c100,1.3525,4.01872,1.63712,0.942142,0.964267,0.95699,0.981194
resnet152_SD_SVHN,0.0854228,0.288669,0.109272,0.0786087,0.0796386,0.0798676,0.0794824


## Brier

In [28]:
df_brier.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,0.831574,0.779163,0.790379,0.788059,0.786434,0.789272,0.788312
resnet110_c100,0.66114,0.510992,0.537771,0.524876,0.519578,0.534147,0.551702
densenet40_c10,0.819165,0.763132,0.778261,0.770248,0.774672,0.778086,0.776613
densenet40_c100,0.655001,0.471067,0.517046,0.491403,0.495494,0.522265,0.521963
resnet_wide32_c10,0.838772,0.781067,0.796844,0.78971,0.794616,0.797888,0.794589
resnet_wide32_c100,0.698676,0.534717,0.58487,0.563071,0.56257,0.586534,0.584154
resnet50_birds,0.526291,0.507185,0.530254,0.531263,0.568665,0.560904,0.538991
resnet110_SD_c10,0.833803,0.785381,0.797051,0.792032,0.795086,0.797334,0.796442
resnet110_SD_c100,0.661362,0.542829,0.566608,0.535649,0.552287,0.554653,0.577542
resnet152_SD_SVHN,0.907179,0.88591,0.888147,0.887131,0.884714,0.884732,0.886552
