# Calibration

Testing calibration method on neural networks on image data.

In [1]:
from sklearn.metrics import log_loss, brier_score_loss
from keras.losses import categorical_crossentropy
from scipy.optimize import minimize 
import numpy as np
from utility.unpickle_probs import unpickle_probs
from utility.calibration import ECE, MCE
from os.path import join
import sklearn.metrics as metrics
import pandas as pd
from betacal import BetaCalibration
from sklearn.isotonic import IsotonicRegression

Using TensorFlow backend.


## Load in Data

Some data for testing methods.

In [2]:
PATH = join('..', '..', 'Semester IV', 'NN_image_probs')
files = ('probs_resnet110_c10_logits.p', 'probs_resnet110_c100_logits.p', )

FILE_PATH = join(PATH, files[0])
#(y_probs_val, y_val), (y_probs_test, y_test) = unpickle_probs(FILE_PATH, True)
(y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH, True)


y_probs_val: (5000, 10)
y_true_val: (5000, 1)
y_probs_test: (10000, 10)
y_true_test: (10000, 1)


In [3]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=1, keepdims=1)

### Check out the Negative Log Likelihood (NLL).

In [4]:
y_probs_val = softmax(y_logits_val)
y_probs_test = softmax(y_logits_test)

In [5]:
y_probs_val

array([[  3.50495610e-09,   1.19858523e-09,   9.99999762e-01, ...,
          3.58365365e-10,   1.65589658e-08,   5.31787308e-08],
       [  1.00000000e+00,   3.42834561e-09,   1.12174181e-09, ...,
          3.91965106e-12,   5.52859358e-09,   1.04786575e-10],
       [  7.83461230e-12,   5.50028426e-11,   1.36148398e-10, ...,
          6.90314739e-09,   1.84315185e-11,   5.23301680e-10],
       ..., 
       [  9.51968539e-13,   1.73751768e-12,   4.28410606e-12, ...,
          1.00000000e+00,   4.82873968e-14,   4.22664204e-12],
       [  3.68825553e-10,   1.33624167e-09,   9.49588463e-11, ...,
          1.00000000e+00,   1.06427133e-11,   6.58591723e-11],
       [  2.29454624e-08,   3.61428859e-10,   2.38165171e-06, ...,
          2.24385076e-06,   4.55320764e-12,   2.68068068e-10]], dtype=float32)

In [6]:
log_loss(y_true=y_val, y_pred=y_probs_val)

0.30016390198023241

In [7]:
log_loss(y_true=y_test, y_pred=y_probs_test)

0.35827413497065691

The NLL is different from the actual output for some reason. Look more into it.

## Temperature Scaling

In [20]:
def temp_scale(y_probs, x):
    return y_probs/x

In [21]:
def loss_fun(x, y_probs, y_true):
    scaled_probs = softmax(temp_scale(y_probs, x))    
    loss = log_loss(y_true=y_true, y_pred=scaled_probs)
    
    return loss

In [17]:
def evaluate(y_probs, y_true, verbose = False, normalize = False, bins = 15):
    
    y_preds = np.argmax(y_probs, axis=1)  # Take maximum confidence as prediction
    
    if normalize:
        y_confs = np.max(y_probs, axis=1)/np.sum(y_probs, axis=1)
        # Check if everything below or equal to 1?
    else:
        y_confs = np.max(y_probs, axis=1)  # Take only maximum confidence
    
    accuracy = metrics.accuracy_score(y_true, y_preds) * 100
    error = 100 - accuracy
    
        # Calculate ECE
    ece = ECE(y_confs, y_preds, y_true, bin_size = 1/bins)
    # Calculate MCE
    mce = MCE(y_confs, y_preds, y_true, bin_size = 1/bins)
    
    loss = log_loss(y_true=y_true, y_pred=y_probs)
    
    y_prob_true = np.array([y_probs[i, idx] for i, idx in enumerate(y_true)])  # Probability of positive class
    brier = brier_score_loss(y_true=y_true, y_prob=y_prob_true)  # Brier Score (MSE)
    
    if verbose:
        print("Accuracy:", accuracy)
        print("Error:", error)
        print("ECE:", ece)
        print("MCE:", mce)
        print("Loss:", loss)
        print("brier:", brier)
    
    return (error, ece, mce, loss, brier)

In [18]:
evaluate(softmax(y_logits_test), y_test, verbose=True)

Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734


(6.4399999999999977,
 0.047503518640995043,
 0.29579895734786987,
 0.35827413497065691,
 0.83157410273384924)

In [22]:
evaluate(softmax(temp_scale(y_logits_test, 2.395011)), y_test, verbose=True)

Accuracy: 93.56
Error: 6.44
ECE: 0.011320804061
MCE: 0.236392512918
Loss: 0.20926125583
brier: 0.788059206672


(6.4399999999999977,
 0.011320804060995518,
 0.23639251291751862,
 0.20926125582950877,
 0.78805920667178175)

In [25]:
PATH = join('..', '..', 'Semester IV', 'NN_image_probs')
files = ('probs_resnet110_c10_logits.p', 'probs_resnet110_c100_logits.p', 
         'probs_densenet40_c10_logits.p', 'probs_densenet40_c100_logits.p',
        'probs_resnet_wide32_c10_logits.p', 'probs_resnet_wide32_c100_logits.p',
        'probs_resnet152_imgnet_logits.p', 'probs_densenet161_imgnet_logits.p')

def temp_results(path, files, method = "BFGS"):
    
    df = pd.DataFrame(columns=["Name", "Error", "ECE", "MCE", "Loss", "Brier", "Temperature"])
    
    
    for i, f in enumerate(files):
        
        name = "_".join(f.split("_")[1:-1])
        print(name)

        FILE_PATH = join(path, f)
        (y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH)
        
        opt = minimize(loss_fun, x0 = 1, args=(y_logits_val, y_val), options={'maxiter':200}, method = method)
        temp = opt.x[0]
        
        _, _, _, _, _ = evaluate(softmax(temp_scale(y_logits_val, temp)), y_val, verbose=True)
        error, ece, mce, loss, brier = evaluate(softmax(y_logits_test), y_test, verbose=False)  # Test before scaling
        error2, ece2, mce2, loss2, brier2 = evaluate(softmax(temp_scale(y_logits_test, temp)), y_test, verbose=False)
        
        df.loc[i*2] = [name, error, ece, mce, loss, brier, temp]
        df.loc[i*2+1] = [(name + "_temp"), error2, ece2, mce2, loss2, brier2, temp]

        
    return df
    

In [26]:
df_temp_scale = temp_results(PATH, files)

resnet110_c10
Accuracy: 94.62
Error: 5.38
ECE: 0.00728217680454
MCE: 0.816615432501
Loss: 0.180503754565
brier: 0.799735165234
resnet110_c100
Accuracy: 72.1
Error: 27.9
ECE: 0.0208565565631
MCE: 0.056080172789
Loss: 1.0636127493
brier: 0.531455581933
densenet40_c10
Accuracy: 93.82
Error: 6.18
ECE: 0.00594705242515
MCE: 0.10719075799
Loss: 0.186612238607
brier: 0.783365853767
densenet40_c100
Accuracy: 70.26
Error: 29.74
ECE: 0.0144420869678
MCE: 0.045564418529
Loss: 1.0773751477
brier: 0.493179245312
resnet_wide32_c10
Accuracy: 96.02
Error: 3.98
ECE: 0.00712747764289
MCE: 0.242319499453
Loss: 0.134110293426
brier: 0.826368418946
resnet_wide32_c100
Accuracy: 78.46
Error: 21.54
ECE: 0.0407689332046
MCE: 0.156267919276
Loss: 0.836059538963
brier: 0.641082268727
resnet152_imgnet
Accuracy: 75.36
Error: 24.64
ECE: 0.0245080424418
MCE: 0.0559367342452
Loss: 1.00401962933
brier: 0.574710464688
densenet161_imgnet
Accuracy: 77.26
Error: 22.74
ECE: 0.0208654103999
MCE: 0.0781966217362
Loss: 0.9092

In [27]:
df_temp_scale

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier,Temperature
0,resnet110_c10,6.44,0.047504,0.295799,0.358274,0.831574,2.395011
1,resnet110_c10_temp,6.44,0.011321,0.236393,0.209261,0.788059,2.395011
2,resnet110_c100,28.52,0.184805,0.398817,1.693712,0.66114,2.310216
3,resnet110_c100_temp,28.52,0.023797,0.070991,1.091691,0.524876,2.310216
4,densenet40_c10,7.58,0.055003,0.333955,0.428207,0.819165,2.88255
5,densenet40_c10_temp,7.58,0.009464,0.099293,0.225086,0.770248,2.88255
6,densenet40_c100,30.0,0.211563,0.454003,2.017398,0.655001,3.194026
7,densenet40_c100_temp,30.0,0.009021,0.022128,1.057131,0.491403,3.194026
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078,0.837849,1.518296
9,resnet_wide32_c10_temp,5.01,0.007963,0.254111,0.154391,0.814687,1.518296


## Extension of Binning Methods

## Isotonic  Regression

In [43]:
from sklearn.linear_model import LogisticRegression
from calmap import plot_calibration_map
from matplotlib import pyplot as plt


In [37]:
PATH = join('..', '..', 'Semester IV', 'NN_image_probs')
files = ('probs_resnet110_c10_logits.p', 'probs_resnet110_c100_logits.p', 
         'probs_densenet40_c10_logits.p', 'probs_densenet40_c100_logits.p',
        'probs_resnet_wide32_c10_logits.p', 'probs_resnet_wide32_c100_logits.p',
        'probs_resnet152_imgnet_logits.p', 'probs_densenet161_imgnet_logits.p')

# Calibration results for one-vs-all method
def cal_results(method, path, files, m_kwargs = {}, check_val = False):
    
    df = pd.DataFrame(columns=["Name", "Error", "ECE", "MCE", "Loss", "Brier"])
    
    for i, f in enumerate(files):
        
        name = "_".join(f.split("_")[1:-1])
        print("\n", name)

        FILE_PATH = join(path, f)
        (y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH)

        y_probs_val = softmax(y_logits_val)  # Softmax logits
        y_probs_test = softmax(y_logits_test)
        K = y_probs_test.shape[1]

        # Go through all the classes
        for k in range(K):
            # Prep class labels (1 fixed true class, 0 other classes)
            y_cal = np.array(y_val == k, dtype="int")[:, 0]

            # Train model
            model = method(**m_kwargs)
            model.fit(y_probs_val[:, k], y_cal) # Get only one column with probs for given class "k"

            y_probs_val[:, k] = model.predict(y_probs_val[:, k])  # Predict new values based on the fittting
            y_probs_test[:, k] = model.predict(y_probs_test[:, k])

            # Replace NaN with 0, as it should be close to zero  # TODO is it needed?
            idx_nan = np.where(np.isnan(y_probs_test))
            y_probs_test[idx_nan] = 0
            
            idx_nan = np.where(np.isnan(y_probs_val))
            y_probs_val[idx_nan] = 0

        if check_val:  # Check results on validation set
            _, _, _, _, _ = evaluate(softmax(y_logits_val), y_val, verbose=True, normalize=False)  #
            _, _, _, _, _ = evaluate(y_probs_val, y_val, verbose=True, normalize=True)
            
        # Get results for test set
        error, ece, mce, loss, brier = evaluate(softmax(y_logits_test), y_test, verbose=True, normalize=False)
        error2, ece2, mce2, loss2, brier2 = evaluate(y_probs_test, y_test, verbose=False, normalize=True)

        # Add results into DataFrame
        df.loc[i*2] = [name, error, ece, mce, loss, brier]
        df.loc[i*2+1] = [(name + "_calib"), error2, ece2, mce2, loss2, brier2]

        
    return df

In [34]:
params = {'y_min':0, 'y_max':1}
df_iso = cal_results(IsotonicRegression, PATH, files, params)


 resnet110_c10
Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734

 resnet110_c100
Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955
brier: 0.661139586803


  
  filtered_tuples = [x for x in zip(pred, true, conf) if x[2] > conf_thresh_lower and x[2] <= conf_thresh_upper]



 densenet40_c10
Accuracy: 92.42
Error: 7.58
ECE: 0.055002704373
MCE: 0.333955179269
Loss: 0.42820705658
brier: 0.819165176082

 densenet40_c100
Accuracy: 70.0
Error: 30.0
ECE: 0.211562852757
MCE: 0.45400331452
Loss: 2.0173981798
brier: 0.655000941434

 resnet_wide32_c10
Accuracy: 94.99
Error: 5.01
ECE: 0.0262912020743
MCE: 0.270675277616
Loss: 0.18078037455
brier: 0.837849477046

 resnet_wide32_c100
Accuracy: 77.2
Error: 22.8
ECE: 0.0824934380092
MCE: 0.205758135969
Loss: 0.924750990686
brier: 0.679727633367

 resnet152_imgnet
Accuracy: 74.708
Error: 25.292
ECE: 0.0665072336364
MCE: 0.133044166026
Loss: 1.06397533857
brier: 0.624490802257

 densenet161_imgnet
Accuracy: 76.6
Error: 23.4
ECE: 0.0589743148203
MCE: 0.138486335666
Loss: 0.962239300614
brier: 0.645603191194


In [35]:
df_iso

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet110_c10,6.44,0.047504,0.295799,0.358274,0.831574
1,resnet110_c10_iso,6.36,0.014736,0.245814,0.270758,0.790379
2,resnet110_c100,28.52,0.184805,0.398817,1.693712,0.66114
3,resnet110_c100_iso,29.31,0.065352,0.133803,1.892628,0.537771
4,densenet40_c10,7.58,0.055003,0.333955,0.428207,0.819165
5,densenet40_c10_iso,7.65,0.016844,0.084918,0.277319,0.778261
6,densenet40_c100,30.0,0.211563,0.454003,2.017398,0.655001
7,densenet40_c100_iso,30.22,0.052514,0.121643,1.64908,0.517046
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078,0.837849
9,resnet_wide32_c10_iso,5.01,0.010596,0.101016,0.234374,0.817886


In [38]:
df_beta = cal_results(BetaCalibration, PATH, files, {'parameters':"abm"})


 resnet110_c10
Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734

 resnet110_c100
Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955
brier: 0.661139586803

 densenet40_c10
Accuracy: 92.42
Error: 7.58
ECE: 0.055002704373
MCE: 0.333955179269
Loss: 0.42820705658
brier: 0.819165176082

 densenet40_c100
Accuracy: 70.0
Error: 30.0
ECE: 0.211562852757
MCE: 0.45400331452
Loss: 2.0173981798
brier: 0.655000941434

 resnet_wide32_c10
Accuracy: 94.99
Error: 5.01
ECE: 0.0262912020743
MCE: 0.270675277616
Loss: 0.18078037455
brier: 0.837849477046

 resnet_wide32_c100
Accuracy: 77.2
Error: 22.8
ECE: 0.0824934380092
MCE: 0.205758135969
Loss: 0.924750990686
brier: 0.679727633367

 resnet152_imgnet
Accuracy: 74.708
Error: 25.292
ECE: 0.0665072336364
MCE: 0.133044166026
Loss: 1.06397533857
brier: 0.624490802257

 densenet161_imgnet
Accuracy: 76.6
Error: 23.4
ECE: 0.0589743148203
MCE: 0.138486335666
Loss: 0.

In [39]:
df_beta

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet110_c10,6.44,0.047504,0.295799,0.358274,0.831574
1,resnet110_c10_calib,6.44,0.014248,0.262211,0.21385,0.786434
2,resnet110_c100,28.52,0.184805,0.398817,1.693712,0.66114
3,resnet110_c100_calib,28.36,0.046006,0.11442,1.131824,0.519578
4,densenet40_c10,7.58,0.055003,0.333955,0.428207,0.819165
5,densenet40_c10_calib,7.59,0.017003,0.250324,0.239184,0.774672
6,densenet40_c100,30.0,0.211563,0.454003,2.017398,0.655001
7,densenet40_c100_calib,29.81,0.060306,0.129234,1.153169,0.495494
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078,0.837849
9,resnet_wide32_c10_calib,5.0,0.009261,0.094009,0.154432,0.814365


In [40]:
df_beta_am = cal_results(BetaCalibration, PATH, files, {'parameters':"am"})


 resnet110_c10
Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734

 resnet110_c100
Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955
brier: 0.661139586803

 densenet40_c10
Accuracy: 92.42
Error: 7.58
ECE: 0.055002704373
MCE: 0.333955179269
Loss: 0.42820705658
brier: 0.819165176082

 densenet40_c100
Accuracy: 70.0
Error: 30.0
ECE: 0.211562852757
MCE: 0.45400331452
Loss: 2.0173981798
brier: 0.655000941434

 resnet_wide32_c10
Accuracy: 94.99
Error: 5.01
ECE: 0.0262912020743
MCE: 0.270675277616
Loss: 0.18078037455
brier: 0.837849477046

 resnet_wide32_c100
Accuracy: 77.2
Error: 22.8
ECE: 0.0824934380092
MCE: 0.205758135969
Loss: 0.924750990686
brier: 0.679727633367

 resnet152_imgnet
Accuracy: 74.708
Error: 25.292
ECE: 0.0665072336364
MCE: 0.133044166026
Loss: 1.06397533857
brier: 0.624490802257

 densenet161_imgnet
Accuracy: 76.6
Error: 23.4
ECE: 0.0589743148203
MCE: 0.138486335666
Loss: 0.

In [41]:
df_beta_am

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet110_c10,6.44,0.047504,0.295799,0.358274,0.831574
1,resnet110_c10_calib,6.4,0.012823,0.25012,0.215255,0.789272
2,resnet110_c100,28.52,0.184805,0.398817,1.693712,0.66114
3,resnet110_c100_calib,28.28,0.048011,0.120193,1.129195,0.534147
4,densenet40_c10,7.58,0.055003,0.333955,0.428207,0.819165
5,densenet40_c10_calib,7.59,0.015536,0.146299,0.239227,0.778086
6,densenet40_c100,30.0,0.211563,0.454003,2.017398,0.655001
7,densenet40_c100_calib,29.7,0.063341,0.143593,1.164765,0.522265
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078,0.837849
9,resnet_wide32_c10_calib,5.0,0.008012,0.081865,0.153624,0.814909


In [42]:
df_beta_ab = cal_results(BetaCalibration, PATH, files, {'parameters':"ab"})


 resnet110_c10
Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734

 resnet110_c100
Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955
brier: 0.661139586803

 densenet40_c10
Accuracy: 92.42
Error: 7.58
ECE: 0.055002704373
MCE: 0.333955179269
Loss: 0.42820705658
brier: 0.819165176082

 densenet40_c100
Accuracy: 70.0
Error: 30.0
ECE: 0.211562852757
MCE: 0.45400331452
Loss: 2.0173981798
brier: 0.655000941434

 resnet_wide32_c10
Accuracy: 94.99
Error: 5.01
ECE: 0.0262912020743
MCE: 0.270675277616
Loss: 0.18078037455
brier: 0.837849477046

 resnet_wide32_c100
Accuracy: 77.2
Error: 22.8
ECE: 0.0824934380092
MCE: 0.205758135969
Loss: 0.924750990686
brier: 0.679727633367

 resnet152_imgnet
Accuracy: 74.708
Error: 25.292
ECE: 0.0665072336364
MCE: 0.133044166026
Loss: 1.06397533857
brier: 0.624490802257

 densenet161_imgnet
Accuracy: 76.6
Error: 23.4
ECE: 0.0589743148203
MCE: 0.138486335666
Loss: 0.

In [43]:
df_beta_ab

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet110_c10,6.44,0.047504,0.295799,0.358274,0.831574
1,resnet110_c10_calib,6.45,0.013754,0.244292,0.212043,0.788312
2,resnet110_c100,28.52,0.184805,0.398817,1.693712,0.66114
3,resnet110_c100_calib,28.47,0.052776,0.127771,1.136727,0.551702
4,densenet40_c10,7.58,0.055003,0.333955,0.428207,0.819165
5,densenet40_c10_calib,7.57,0.015656,0.105473,0.238213,0.776613
6,densenet40_c100,30.0,0.211563,0.454003,2.017398,0.655001
7,densenet40_c100_calib,30.02,0.062958,0.106531,1.15021,0.521963
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078,0.837849
9,resnet_wide32_c10_calib,5.02,0.009303,0.146759,0.152603,0.815624


## Histogram Binning

In [48]:
class HistogramBinning():
    
    
    def __init__(self, M=15):
        self.bin_size = 1./M  # Calculate bin size
        self.conf = []  # Initiate confidence list
        self.upper_bounds = np.arange(self.bin_size, 1+self.bin_size, self.bin_size)  # Set bin bounds for intervals

    
    def _get_conf(self, conf_thresh_lower, conf_thresh_upper, probs, true):

        filtered = [x[0] for x in zip(true, probs) if x[1] > conf_thresh_lower and x[1] <= conf_thresh_upper]
        nr_elems = len(filtered)

        if nr_elems < 1:
            return 0
        else:
            conf = sum(filtered)/nr_elems
            return conf
    

    def fit(self, probs, true):

        conf = []

        # Got through intervals and add confidence to list
        for conf_thresh in self.upper_bounds:
            temp_conf = self._get_conf((conf_thresh - self.bin_size), conf_thresh, probs = probs, true = true)
            conf.append(temp_conf)

        self.conf = conf

            


    # Fit based on predicted confidence
    def predict(self, probs):

        # Go through all the probs and check what confidence is suitable for it.
        for i, prob in enumerate(probs):
            idx = np.searchsorted(self.upper_bounds, prob)
            probs[i] = self.conf[idx]

        return probs    

In [49]:
df_hb = cal_results(HistogramBinning, PATH, files, {'M':15})


 resnet110_c10
Accuracy: 93.56
Error: 6.44
ECE: 0.047503518641
MCE: 0.295798957348
Loss: 0.358274134971
brier: 0.831574102734

 resnet110_c100
Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955
brier: 0.661139586803

 densenet40_c10
Accuracy: 92.42
Error: 7.58
ECE: 0.055002704373
MCE: 0.333955179269
Loss: 0.42820705658
brier: 0.819165176082

 densenet40_c100
Accuracy: 70.0
Error: 30.0
ECE: 0.211562852757
MCE: 0.45400331452
Loss: 2.0173981798
brier: 0.655000941434

 resnet_wide32_c10
Accuracy: 94.99
Error: 5.01
ECE: 0.0262912020743
MCE: 0.270675277616
Loss: 0.18078037455
brier: 0.837849477046

 resnet_wide32_c100
Accuracy: 77.2
Error: 22.8
ECE: 0.0824934380092
MCE: 0.205758135969
Loss: 0.924750990686
brier: 0.679727633367

 resnet152_imgnet
Accuracy: 74.708
Error: 25.292
ECE: 0.0665072336364
MCE: 0.133044166026
Loss: 1.06397533857
brier: 0.624490802257

 densenet161_imgnet
Accuracy: 76.6
Error: 23.4
ECE: 0.0589743148203
MCE: 0.138486335666
Loss: 0.

In [50]:
df_hb

Unnamed: 0,Name,Error,ECE,MCE,Loss,Brier
0,resnet110_c10,6.44,0.047504,0.295799,0.358274,0.831574
1,resnet110_c10_calib,6.59,0.012508,0.444502,0.547192,0.779163
2,resnet110_c100,28.52,0.184805,0.398817,1.693712,0.66114
3,resnet110_c100_calib,31.26,0.090558,0.316352,4.213913,0.510992
4,densenet40_c10,7.58,0.055003,0.333955,0.428207,0.819165
5,densenet40_c10_calib,7.93,0.021302,0.451015,0.572461,0.763132
6,densenet40_c100,30.0,0.211563,0.454003,2.017398,0.655001
7,densenet40_c100_calib,32.49,0.119705,0.169395,4.182849,0.471067
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078,0.837849
9,resnet_wide32_c10_calib,5.31,0.014299,0.266247,0.54436,0.816282


## Dirilecht Calibration

In [None]:
class DirilechCalibration():
    
    def __init__(self, parameters)

## Dataframe of results 

In [51]:
# df_temp_scale, df_iso, df_beta, df_hb

dfs = [df_hb, df_iso, df_temp_scale, df_beta, df_beta_am, df_beta_ab]
names = ["Name", "Uncalibrated", "Histogram Binning", "Isotonic Regression", "Temperature Scaling", "Beta Calibration",
        "BC am", "BC ab"]


def get_dataframe(dfs, column, names):

    df_res = pd.DataFrame(columns=names)

    for i in range(1, len(df_iso), 2):

        name = dfs[0].iloc[i-1]["Name"] # Get name of method
        uncalibrated = dfs[0].iloc[i-1][column]  # Get uncalibrated score

        row = [name, uncalibrated]  # Add scores to row

        for df in dfs:
            row.append(df.iloc[i][column])

        df_res.loc[(i-1)//2] = row
        #errordf.iloc[i]
    
    df_res.set_index('Name', inplace = True)
        
    return df_res

In [52]:
df_error = get_dataframe(dfs, "Error", names)
df_ece = get_dataframe(dfs, "ECE", names)
df_mce = get_dataframe(dfs, "MCE", names)
df_loss = get_dataframe(dfs, "Loss", names)
df_brier = get_dataframe(dfs, "Brier", names)

## Scores

In [53]:
def highlight_min(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_max]

## Error

In [54]:
df_error.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,6.44,6.59,6.36,6.44,6.44,6.4,6.45
resnet110_c100,28.52,31.26,29.31,28.52,28.36,28.28,28.47
densenet40_c10,7.58,7.93,7.65,7.58,7.59,7.59,7.57
densenet40_c100,30.0,32.49,30.22,30.0,29.81,29.7,30.02
resnet_wide32_c10,5.01,5.31,5.01,5.01,5.0,5.0,5.02
resnet_wide32_c100,22.8,26.64,24.09,22.8,22.93,22.88,22.71
resnet152_imgnet,25.292,33.296,29.308,25.292,25.26,25.228,25.248
densenet161_imgnet,23.4,31.092,27.264,23.4,23.276,23.236,23.404


## ECE

In [55]:
df_ece.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,0.0475035,0.0125078,0.0147359,0.0113208,0.0142481,0.0128235,0.0137544
resnet110_c100,0.184805,0.0905575,0.0653516,0.0237971,0.0460063,0.0480109,0.0527756
densenet40_c10,0.0550027,0.021302,0.0168437,0.00946355,0.0170027,0.0155363,0.0156559
densenet40_c100,0.211563,0.119705,0.0525144,0.00902072,0.0603064,0.0633407,0.062958
resnet_wide32_c10,0.0262912,0.0142993,0.010596,0.00796326,0.00926128,0.00801195,0.0093031
resnet_wide32_c100,0.0824934,0.0942779,0.0533204,0.0422688,0.0484455,0.0457086,0.0462717
resnet152_imgnet,0.0665072,0.0871103,0.0738013,0.0222174,0.0333611,0.0322548,0.0332408
densenet161_imgnet,0.0589743,0.0879124,0.0722485,0.0209126,0.0334077,0.0300838,0.0320456


## MCE

In [56]:
df_mce.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,0.295799,0.444502,0.245814,0.236393,0.262211,0.25012,0.244292
resnet110_c100,0.398817,0.316352,0.133803,0.0709914,0.11442,0.120193,0.127771
densenet40_c10,0.333955,0.451015,0.0849184,0.0992925,0.250324,0.146299,0.105473
densenet40_c100,0.454003,0.169395,0.121643,0.0221282,0.129234,0.143593,0.106531
resnet_wide32_c10,0.270675,0.266247,0.101016,0.254111,0.0940089,0.0818646,0.146759
resnet_wide32_c100,0.205758,0.280764,0.141408,0.108004,0.138961,0.126308,0.138236
resnet152_imgnet,0.133044,0.278709,0.117242,0.0605335,0.0856277,0.0786869,0.0691434
densenet161_imgnet,0.138486,0.290375,0.271487,0.0509325,0.0706651,0.111615,0.0665525


## Loss

In [57]:
df_loss.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,0.358274,0.547192,0.270758,0.209261,0.21385,0.215255,0.212043
resnet110_c100,1.69371,4.21391,1.89263,1.09169,1.13182,1.12919,1.13673
densenet40_c10,0.428207,0.572461,0.277319,0.225086,0.239184,0.239227,0.238213
densenet40_c100,2.0174,4.18285,1.64908,1.05713,1.15317,1.16477,1.15021
resnet_wide32_c10,0.18078,0.54436,0.234374,0.154391,0.154432,0.153624,0.152603
resnet_wide32_c100,0.924751,4.07853,1.53304,0.871434,0.889862,0.879483,0.884919
resnet152_imgnet,1.06398,6.52519,2.99466,1.01687,1.06143,1.03939,1.05078
densenet161_imgnet,0.962239,6.28319,2.93137,0.926957,0.99954,0.968818,0.964951


## Brier

In [58]:
df_brier.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration,BC am,BC ab
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
resnet110_c10,0.831574,0.779163,0.790379,0.788059,0.786434,0.789272,0.788312
resnet110_c100,0.66114,0.510992,0.537771,0.524876,0.519578,0.534147,0.551702
densenet40_c10,0.819165,0.763132,0.778261,0.770248,0.774672,0.778086,0.776613
densenet40_c100,0.655001,0.471067,0.517046,0.491403,0.495494,0.522265,0.521963
resnet_wide32_c10,0.837849,0.816282,0.817886,0.814687,0.814365,0.814909,0.815624
resnet_wide32_c100,0.679728,0.613907,0.623213,0.63246,0.611984,0.627775,0.622837
resnet152_imgnet,0.624491,0.565937,0.565222,0.571829,0.573767,0.581115,0.579993
densenet161_imgnet,0.645603,0.593834,0.59419,0.599938,0.601961,0.60862,0.603064
