# Calibration

Testing calibration method on neural networks on image data.

In [1]:
from sklearn.metrics import log_loss
from keras.losses import categorical_crossentropy
from scipy.optimize import minimize 
import numpy as np
from utility.unpickle_probs import unpickle_probs
from utility.calibration import ECE, MCE
from os.path import join
import sklearn.metrics as metrics
import pandas as pd
from betacal import BetaCalibration
from sklearn.isotonic import IsotonicRegression

Using TensorFlow backend.


## Load in Data

In [18]:
PATH = join('..', '..', 'Semester IV', 'NN_image_probs')
files = ('probs_resnet110_c10_logits.p', 'probs_resnet110_c100_logits.p', )

FILE_PATH = join(PATH, files[0])
#(y_probs_val, y_val), (y_probs_test, y_test) = unpickle_probs(FILE_PATH, True)
(y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH, True)


y_probs_val: (5000, 10)
y_true_val: (5000, 1)
y_probs_test: (10000, 10)
y_true_test: (10000, 1)


In [3]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=1, keepdims=1)

### Check out the Negative Log Likelihood (NLL).

In [4]:
y_probs_val = softmax(y_logits_val)
y_probs_test = softmax(y_logits_test)

In [5]:
y_probs_val

array([[  8.40427012e-13,   1.42780250e-08,   3.99260486e-11, ...,
          9.59866611e-07,   1.91911340e-08,   1.83874263e-07],
       [  1.76513110e-10,   3.60708616e-11,   1.21648813e-09, ...,
          1.18096375e-11,   8.99881142e-11,   2.75708501e-11],
       [  7.99159494e-08,   4.21872346e-06,   2.94991787e-09, ...,
          7.87907517e-09,   1.10995479e-07,   9.70341682e-01],
       ..., 
       [  1.70235054e-07,   6.34476027e-09,   5.21218390e-05, ...,
          1.09700418e-08,   2.53646437e-08,   1.10628315e-08],
       [  2.51490473e-09,   1.28695821e-11,   7.32585312e-11, ...,
          9.52419654e-08,   8.75499239e-12,   1.06539312e-10],
       [  2.74159762e-09,   1.33526710e-08,   2.28795694e-09, ...,
          4.32334724e-11,   1.46111478e-09,   1.17422613e-10]], dtype=float32)

In [6]:
log_loss(y_true=y_val, y_pred=y_probs_val)

1.6219570767824834

In [7]:
log_loss(y_true=y_test, y_pred=y_probs_test)

1.693712109546009

The NLL is different from the actual output for some reason. Look more into it.

## Temperature Scaling

In [8]:
def temp_scale(y_probs, x):
    return y_probs/x

In [9]:
def loss_fun(x, y_probs, y_true):
    scaled_probs = softmax(temp_scale(y_probs, x))    
    loss = log_loss(y_true=y_true, y_pred=scaled_probs)
    
    return loss

In [10]:
opt = minimize(loss_fun, x0 = 1, args=(y_logits_val, y_val), options={'maxiter':50}, method = "BFGS")

In [11]:
opt

      fun: 1.0636127548279124
 hess_inv: array([[ 2.99245655]])
      jac: array([ -1.49011612e-08])
  message: 'Optimization terminated successfully.'
     nfev: 21
      nit: 6
     njev: 7
   status: 0
  success: True
        x: array([ 2.31021564])

In [111]:
def evaluate(y_probs, y_true, verbose = False, normalize = False, bins = 15):
    
    y_preds = np.argmax(y_probs, axis=1)  # Take maximum confidence as prediction
    
    if normalize:
        y_confs = np.max(y_probs, axis=1)/np.sum(y_probs, axis=1)
        # Check if everything below or equal to 1?
    else:
        y_confs = np.max(y_probs, axis=1)  # Take only maximum confidence
    
    accuracy = metrics.accuracy_score(y_true, y_preds) * 100
    error = 100 - accuracy
    
        # Calculate ECE
    ece = ECE(y_confs, y_preds, y_true, bin_size = 1/bins)
    # Calculate MCE
    mce = MCE(y_confs, y_preds, y_true, bin_size = 1/bins)
    
    loss = log_loss(y_true=y_true, y_pred=y_probs)
    
    if verbose:
        print("Accuracy:", accuracy)
        print("Error:", error)
        print("ECE:", ece)
        print("MCE:", mce)
        print("Loss:", loss)
    
    return (error, ece, mce, loss)

In [26]:
evaluate(softmax(y_logits_test), y_test, verbose=True)

Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148
Loss: 1.69371210955


(28.519999999999996,
 0.18480454394519324,
 0.39881712514822332,
 1.693712109546009)

In [27]:
evaluate(softmax(temp_scale(y_logits_test, temp)), y_test, verbose=True)

Accuracy: 71.48
Error: 28.52
ECE: 0.0237969729677
MCE: 0.0709915722652
Loss: 1.09169134071


(28.519999999999996,
 0.023796972967684245,
 0.070991572265217934,
 1.0916913407060644)

In [14]:
PATH = join('..', '..', 'Semester IV', 'NN_image_probs')
files = ('probs_resnet110_c10_logits.p', 'probs_resnet110_c100_logits.p', 
         'probs_densenet40_c10_logits.p', 'probs_densenet40_c100_logits.p',
        'probs_resnet_wide32_c10_logits.p', 'probs_resnet_wide32_c100_logits.p',
        'probs_resnet152_imgnet_logits.p', 'probs_densenet161_imgnet_logits.p')

def temp_results(path, files, method = "BFGS"):
    
    df = pd.DataFrame(columns=["Name", "Error", "ECE", "MCE", "Loss", "Temperature"])
    
    
    for i, f in enumerate(files):
        
        name = "_".join(f.split("_")[1:-1])
        print(name)

        FILE_PATH = join(path, f)
        (y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH)
        
        opt = minimize(loss_fun, x0 = 1, args=(y_logits_val, y_val), options={'maxiter':50}, method = method)
        temp = opt.x[0]
        
        _, _, _, _ = evaluate(softmax(temp_scale(y_logits_val, temp)), y_val, verbose=True)
        error, ece, mce, loss = evaluate(softmax(y_logits_test), y_test, verbose=False)  # Test before scaling
        error2, ece2, mce2, loss2 = evaluate(softmax(temp_scale(y_logits_test, temp)), y_test, verbose=False)
        
        df.loc[i*2] = [name, error, ece, mce, loss, temp]
        df.loc[i*2+1] = [(name + "_temp"), error2, ece2, mce2, loss2, temp]

        
    return df
    

In [15]:
df = temp_results(PATH, files)

resnet110_c10
Accuracy: 94.62
Error: 5.38
ECE: 0.00728217680454
MCE: 0.816615432501
Loss: 0.180503754565
resnet110_c100
Accuracy: 72.1
Error: 27.9
ECE: 0.0208565565631
MCE: 0.056080172789
Loss: 1.0636127493
densenet40_c10
Accuracy: 93.82
Error: 6.18
ECE: 0.00594705242515
MCE: 0.10719075799
Loss: 0.186612238607
densenet40_c100
Accuracy: 70.26
Error: 29.74
ECE: 0.0144420869678
MCE: 0.045564418529
Loss: 1.0773751477
resnet_wide32_c10
Accuracy: 96.02
Error: 3.98
ECE: 0.00712747764289
MCE: 0.242319499453
Loss: 0.134110293426
resnet_wide32_c100
Accuracy: 78.46
Error: 21.54
ECE: 0.0407689332046
MCE: 0.156267919276
Loss: 0.836059538963
resnet152_imgnet
Accuracy: 75.36
Error: 24.64
ECE: 0.0245080424418
MCE: 0.0559367342452
Loss: 1.00401962933
densenet161_imgnet
Accuracy: 77.26
Error: 22.74
ECE: 0.0208654103999
MCE: 0.0781966217362
Loss: 0.909294445006


In [94]:
df

Unnamed: 0,Name,Error,ECE,MCE,Loss,Temperature
0,resnet110_c10,6.44,0.047504,0.295799,0.358274,2.395011
1,resnet110_c10_temp,6.44,0.011321,0.236393,0.209261,2.395011
2,resnet110_c100,28.52,0.184805,0.398817,1.693712,2.310216
3,resnet110_c100_temp,28.52,0.023797,0.070991,1.091691,2.310216
4,densenet40_c10,7.58,0.055003,0.333955,0.428207,2.88255
5,densenet40_c10_temp,7.58,0.009464,0.099293,0.225086,2.88255
6,densenet40_c100,30.0,0.211563,0.454003,2.017398,3.194026
7,densenet40_c100_temp,30.0,0.009021,0.022128,1.057131,3.194026
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078,1.518296
9,resnet_wide32_c10_temp,5.01,0.007963,0.254111,0.154391,1.518296


## Extension of Binning Methods

## Isotonic  Regression

In [43]:
from sklearn.linear_model import LogisticRegression
from calmap import plot_calibration_map
from matplotlib import pyplot as plt


In [181]:

(y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH)

y_probs_val = softmax(y_logits_val)
y_probs_test = softmax(y_logits_test)
K = y_probs_test.shape[1]

# Go through all the classes
for i in range(K):
    # Prep class labels (1 fixed true class, 0 other classes)
    y_cal = np.array(y_val == i, dtype="int")[:, 0]
    
    # Train isotonic regression model
    iso = IsotonicRegression(y_min = 0, y_max = 1)
    iso.fit(y_probs_val[:, i], y_cal) # Get only one column with probs for given class "k"
    
    y_probs_val[:, i] = iso.predict(y_probs_val[:, i])
    y_probs_test[:, i] = iso.predict(y_probs_test[:, i])
    
    # Replace NaN with 0, as it should be close to zero
    idx_nan = np.where(np.isnan(y_probs_test))
    y_probs_test[idx_nan] = 0

_, _, _, _ = evaluate(softmax(y_logits_val), y_val, verbose=True, normalize=True)
_, _, _, _ = evaluate(y_probs_val, y_val, verbose=True, normalize=True)
print()
error, ece, mce, loss = evaluate(softmax(y_logits_test), y_test, verbose=True, normalize=True)  # Test before scaling
error2, ece2, mce2, loss2 = evaluate(y_probs_test, y_test, verbose=True, normalize=True)

Accuracy: 94.62
Error: 5.38
ECE: 0.0398645869374
MCE: 0.77392500639
Loss: 0.30016390198
Accuracy: 94.84
Error: 5.16
ECE: 0.00596762105823
MCE: 0.258956101206
Loss: 0.157976749619

Accuracy: 93.56
Error: 6.44
ECE: 0.0475035226107
MCE: 0.295798957348
Loss: 0.358274134971
Accuracy: 93.64
Error: 6.36
ECE: 0.0147359234944
MCE: 0.245814445118
Loss: 0.270758222537


In [196]:
PATH = join('..', '..', 'Semester IV', 'NN_image_probs')
files = ('probs_resnet110_c10_logits.p', 'probs_resnet110_c100_logits.p', 
         'probs_densenet40_c10_logits.p', 'probs_densenet40_c100_logits.p',
        'probs_resnet_wide32_c10_logits.p', 'probs_resnet_wide32_c100_logits.p',
        'probs_resnet152_imgnet_logits.p', 'probs_densenet161_imgnet_logits.p')

def iso_results(path, files):
    
    df = pd.DataFrame(columns=["Name", "Error", "ECE", "MCE", "Loss"])
    
    
    for i, f in enumerate(files):
        
        name = "_".join(f.split("_")[1:-1])
        print("\n", name)

        FILE_PATH = join(path, f)
        (y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH)

        y_probs_val = softmax(y_logits_val)  # Softmax logits
        y_probs_test = softmax(y_logits_test)
        K = y_probs_test.shape[1]

        # Go through all the classes
        for k in range(K):
            # Prep class labels (1 fixed true class, 0 other classes)
            y_cal = np.array(y_val == k, dtype="int")[:, 0]

            # Train isotonic regression model
            iso = IsotonicRegression(y_min = 0, y_max = 1)
            iso.fit(y_probs_val[:, k], y_cal) # Get only one column with probs for given class "k"

            y_probs_val[:, k] = iso.predict(y_probs_val[:, k])
            y_probs_test[:, k] = iso.predict(y_probs_test[:, k])

            # Replace NaN with 0, as it should be close to zero
            idx_nan = np.where(np.isnan(y_probs_test))
            y_probs_test[idx_nan] = 0
            
            idx_nan = np.where(np.isnan(y_probs_val))
            y_probs_val[idx_nan] = 0

        _, _, _, _ = evaluate(softmax(y_logits_val), y_val, verbose=True, normalize=False)
        _, _, _, _ = evaluate(y_probs_val, y_val, verbose=True, normalize=True)
        error, ece, mce, loss = evaluate(softmax(y_logits_test), y_test, verbose=False, normalize=False)  # Test before scaling
        error2, ece2, mce2, loss2 = evaluate(y_probs_test, y_test, verbose=False, normalize=True)

        df.loc[i*2] = [name, error, ece, mce, loss]
        df.loc[i*2+1] = [(name + "_iso"), error2, ece2, mce2, loss2]

        
    return df

In [197]:
df_iso = iso_results(PATH, files)


 resnet110_c10
Accuracy: 94.62
Error: 5.38
ECE: 0.0398645839274
MCE: 0.77392500639
Loss: 0.30016390198
Accuracy: 94.84
Error: 5.16
ECE: 0.00596762105823
MCE: 0.258956101206
Loss: 0.157976749619

 resnet110_c100
Accuracy: 72.1
Error: 27.9
ECE: 0.176872791523
MCE: 0.385346045511
Loss: 1.62195707678
Accuracy: 74.36
Error: 25.64
ECE: 0.0353931856498
MCE: 0.104217011713
Loss: 0.893552282339


  
  filtered_tuples = [x for x in zip(pred, true, conf) if x[2] > conf_thresh_lower and x[2] <= conf_thresh_upper]



 densenet40_c10
Accuracy: 93.82
Error: 6.18
ECE: 0.0435649270296
MCE: 0.452000847459
Loss: 0.331312119976
Accuracy: 94.16
Error: 5.84
ECE: 0.00565517083406
MCE: 0.264927625656
Loss: 0.16504376636

 densenet40_c100
Accuracy: 70.26
Error: 29.74
ECE: 0.208725975981
MCE: 0.456638015358
Loss: 2.0778621932
Accuracy: 72.86
Error: 27.14
ECE: 0.0228267120495
MCE: 0.0722426075851
Loss: 0.919950814755

 resnet_wide32_c10
Accuracy: 96.02
Error: 3.98
ECE: 0.0194082392514
MCE: 0.318583955367
Loss: 0.155429932939
Accuracy: 96.24
Error: 3.76
ECE: 0.00631881810129
MCE: 0.318580420954
Loss: 0.113946554589

 resnet_wide32_c100
Accuracy: 78.46
Error: 21.54
ECE: 0.0739752606288
MCE: 0.217305355811
Loss: 0.880842745036
Accuracy: 80.66
Error: 19.34
ECE: 0.0346710777164
MCE: 0.100182712495
Loss: 0.674024034348

 resnet152_imgnet
Accuracy: 75.36
Error: 24.64
ECE: 0.0600496283878
MCE: 0.945953991264
Loss: 1.04840690425
Accuracy: 79.276
Error: 20.724
ECE: 0.0207196526243
MCE: 0.186014785431
Loss: 0.739944031432

In [198]:
df_iso

Unnamed: 0,Name,Error,ECE,MCE,Loss
0,resnet110_c10,6.44,0.047504,0.295799,0.358274
1,resnet110_c10_temp,6.36,0.014736,0.245814,0.270758
2,resnet110_c100,28.52,0.184805,0.398817,1.693712
3,resnet110_c100_temp,29.31,0.065352,0.133803,1.892628
4,densenet40_c10,7.58,0.055003,0.333955,0.428207
5,densenet40_c10_temp,7.65,0.016844,0.084918,0.277319
6,densenet40_c100,30.0,0.211563,0.454003,2.017398
7,densenet40_c100_temp,30.22,0.052514,0.121643,1.64908
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078
9,resnet_wide32_c10_temp,5.01,0.010596,0.101016,0.234374


In [207]:
PATH = join('..', '..', 'Semester IV', 'NN_image_probs')
files = ('probs_resnet110_c10_logits.p', 'probs_resnet110_c100_logits.p', 
         'probs_densenet40_c10_logits.p', 'probs_densenet40_c100_logits.p',
        'probs_resnet_wide32_c10_logits.p', 'probs_resnet_wide32_c100_logits.p',
        'probs_resnet152_imgnet_logits.p', 'probs_densenet161_imgnet_logits.p')

def beta_results(path, files):
    
    df = pd.DataFrame(columns=["Name", "Error", "ECE", "MCE", "Loss"])
    
    
    for i, f in enumerate(files):
        
        name = "_".join(f.split("_")[1:-1])
        print("\n", name)

        FILE_PATH = join(path, f)
        (y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH)

        y_probs_val = softmax(y_logits_val)  # Softmax logits
        y_probs_test = softmax(y_logits_test)
        K = y_probs_test.shape[1]

        # Go through all the classes
        for k in range(K):
            # Prep class labels (1 fixed true class, 0 other classes)
            y_cal = np.array(y_val == k, dtype="int")[:, 0]

            # Train isotonic regression model
            bc = BetaCalibration(parameters="abm")
            bc.fit(y_probs_val[:, k], y_cal) # Get only one column with probs for given class "k"

            y_probs_val[:, k] = bc.predict(y_probs_val[:, k])
            y_probs_test[:, k] = bc.predict(y_probs_test[:, k])

            # Replace NaN with 0, as it should be close to zero
            idx_nan = np.where(np.isnan(y_probs_test))
            y_probs_test[idx_nan] = 0
            
            idx_nan = np.where(np.isnan(y_probs_val))
            y_probs_val[idx_nan] = 0

        print("Results on Validation Set:")
        _, _, _, _ = evaluate(softmax(y_logits_val), y_val, verbose=True, normalize=False)
        _, _, _, _ = evaluate(y_probs_val, y_val, verbose=True, normalize=True)
        error, ece, mce, loss = evaluate(softmax(y_logits_test), y_test, verbose=False, normalize=False)  # Test before scaling
        error2, ece2, mce2, loss2 = evaluate(y_probs_test, y_test, verbose=False, normalize=True)

        df.loc[i*2] = [name, error, ece, mce, loss]
        df.loc[i*2+1] = [(name + "_beta"), error2, ece2, mce2, loss2]
        
    return df

In [208]:
df_beta = beta_results(PATH, files)


 resnet110_c10
Results on Validation Set:
Accuracy: 94.62
Error: 5.38
ECE: 0.0398645839274
MCE: 0.77392500639
Loss: 0.30016390198
Accuracy: 94.54
Error: 5.46
ECE: 0.00928249204457
MCE: 0.192622438073
Loss: 0.179388467793

 resnet110_c100
Results on Validation Set:
Accuracy: 72.1
Error: 27.9
ECE: 0.176872791523
MCE: 0.385346045511
Loss: 1.62195707678
Accuracy: 72.56
Error: 27.44
ECE: 0.0376279561684
MCE: 0.131316392343
Loss: 1.05509424744

 densenet40_c10
Results on Validation Set:
Accuracy: 93.82
Error: 6.18
ECE: 0.0435649270296
MCE: 0.452000847459
Loss: 0.331312119976
Accuracy: 93.78
Error: 6.22
ECE: 0.0103049218595
MCE: 0.0891237031846
Loss: 0.190338227394

 densenet40_c100
Results on Validation Set:
Accuracy: 70.26
Error: 29.74
ECE: 0.208725975981
MCE: 0.456638015358
Loss: 2.0778621932
Accuracy: 71.0
Error: 29.0
ECE: 0.0546674773142
MCE: 0.160630202703
Loss: 1.13492212385

 resnet_wide32_c10
Results on Validation Set:
Accuracy: 96.02
Error: 3.98
ECE: 0.0194082392514
MCE: 0.31858395

Unnamed: 0,Name,Error,ECE,MCE,Loss
0,resnet110_c10,6.44,0.047504,0.295799,0.358274
1,resnet110_c10_beta,6.44,0.014248,0.262211,0.21385
2,resnet110_c100,28.52,0.184805,0.398817,1.693712
3,resnet110_c100_beta,28.36,0.046006,0.11442,1.131824
4,densenet40_c10,7.58,0.055003,0.333955,0.428207
5,densenet40_c10_beta,7.59,0.017003,0.250324,0.239184
6,densenet40_c100,30.0,0.211563,0.454003,2.017398
7,densenet40_c100_beta,29.81,0.060306,0.129234,1.153169
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078
9,resnet_wide32_c10_beta,5.0,0.009261,0.094009,0.154432


In [209]:
df_beta = beta_results(PATH, files)


 resnet110_c10
Results on Validation Set:
Accuracy: 94.62
Error: 5.38
ECE: 0.0398645839274
MCE: 0.77392500639
Loss: 0.30016390198
Accuracy: 94.54
Error: 5.46
ECE: 0.00928249204457
MCE: 0.192622438073
Loss: 0.179388467793

 resnet110_c100
Results on Validation Set:
Accuracy: 72.1
Error: 27.9
ECE: 0.176872791523
MCE: 0.385346045511
Loss: 1.62195707678
Accuracy: 72.56
Error: 27.44
ECE: 0.0376279561684
MCE: 0.131316392343
Loss: 1.05509424744

 densenet40_c10
Results on Validation Set:
Accuracy: 93.82
Error: 6.18
ECE: 0.0435649270296
MCE: 0.452000847459
Loss: 0.331312119976
Accuracy: 93.78
Error: 6.22
ECE: 0.0103049218595
MCE: 0.0891237031846
Loss: 0.190338227394

 densenet40_c100
Results on Validation Set:
Accuracy: 70.26
Error: 29.74
ECE: 0.208725975981
MCE: 0.456638015358
Loss: 2.0778621932
Accuracy: 71.0
Error: 29.0
ECE: 0.0546674773142
MCE: 0.160630202703
Loss: 1.13492212385

 resnet_wide32_c10
Results on Validation Set:
Accuracy: 96.02
Error: 3.98
ECE: 0.0194082392514
MCE: 0.31858395

## Histogram Binning

In [289]:
class HistogramBinning():
    
    
    def __init__(self, M=15):
        self.bin_size = 1./M  # Calculate bin size
        self.conf = []  # Initiate confidence list
        self.upper_bounds = np.arange(self.bin_size, 1+self.bin_size, self.bin_size)  # Set bin bounds for intervals

    

    def fit(self, probs, true):

        conf = []

        # Got through intervals and add confidence to list
        for conf_thresh in self.upper_bounds:
            temp_conf = get_conf((conf_thresh - self.bin_size), conf_thresh, probs = probs, true = true)
            conf.append(temp_conf)

        self.conf = conf

            
    def get_conf(self, conf_thresh_lower, conf_thresh_upper, probs, true):

        filtered = [x[0] for x in zip(true, probs) if x[1] > conf_thresh_lower and x[1] <= conf_thresh_upper]
        nr_elems = len(filtered)

        if nr_elems < 1:
            return 0
        else:
            conf = sum(filtered)/nr_elems
            return conf

    # Fit based on predicted confidence
    def predict(self, probs):

        # Go through all the probs and check what confidence is suitable for it.
        for i, prob in enumerate(probs):
            idx = np.searchsorted(self.upper_bounds, prob)
            probs[i] = self.conf[idx]

        return probs    

In [293]:
hb = HistogramBinning(M = 15)
hb.fit(softmax(y_logits_val)[:, 0], y_cal)
hb.predict(softmax(y_logits_val)[:, 0])[150:200]

array([ 0.00382883,  0.00382883,  0.00382883,  0.00382883,  0.00382883,
        0.00382883,  0.00382883,  0.00382883,  0.00382883,  0.00382883,
        0.96787149,  0.00382883,  0.00382883,  0.96787149,  0.00382883,
        0.00382883,  0.00382883,  0.00382883,  0.00382883,  0.00382883,
        0.00382883,  0.00382883,  0.00382883,  0.00382883,  0.96787149,
        0.00382883,  0.00382883,  0.00382883,  0.96787149,  0.00382883,
        0.00382883,  0.00382883,  0.00382883,  0.00382883,  0.00382883,
        0.00382883,  0.00382883,  0.00382883,  0.00382883,  0.00382883,
        0.00382883,  0.1       ,  0.00382883,  0.00382883,  0.96787149,
        0.00382883,  0.00382883,  0.00382883,  0.96787149,  0.00382883], dtype=float32)

In [294]:
PATH = join('..', '..', 'Semester IV', 'NN_image_probs')
files = ('probs_resnet110_c10_logits.p', 'probs_resnet110_c100_logits.p', 
         'probs_densenet40_c10_logits.p', 'probs_densenet40_c100_logits.p',
        'probs_resnet_wide32_c10_logits.p', 'probs_resnet_wide32_c100_logits.p',
        'probs_resnet152_imgnet_logits.p', 'probs_densenet161_imgnet_logits.p')

def hist_results(path, files):
    
    df = pd.DataFrame(columns=["Name", "Error", "ECE", "MCE", "Loss"])
    
    
    for i, f in enumerate(files):
        
        name = "_".join(f.split("_")[1:-1])
        print("\n", name)

        FILE_PATH = join(path, f)
        (y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH)

        y_probs_val = softmax(y_logits_val)  # Softmax logits
        y_probs_test = softmax(y_logits_test)
        K = y_probs_test.shape[1]

        # Go through all the classes
        for k in range(K):
            # Prep class labels (1 fixed true class, 0 other classes)
            y_cal = np.array(y_val == k, dtype="int")[:, 0]

            # Train isotonic regression model
            hb = HistogramBinning(M = 15)
            hb.fit(y_probs_val[:, k], y_cal) # Get only one column with probs for given class "k"

            y_probs_val[:, k] = hb.predict(y_probs_val[:, k])
            y_probs_test[:, k] = hb.predict(y_probs_test[:, k])

            # Replace NaN with 0, as it should be close to zero
            idx_nan = np.where(np.isnan(y_probs_test))
            y_probs_test[idx_nan] = 0
            
            idx_nan = np.where(np.isnan(y_probs_val))
            y_probs_val[idx_nan] = 0

        print("Results on Validation Set:")
        _, _, _, _ = evaluate(softmax(y_logits_val), y_val, verbose=True, normalize=False)
        _, _, _, _ = evaluate(y_probs_val, y_val, verbose=True, normalize=True)
        error, ece, mce, loss = evaluate(softmax(y_logits_test), y_test, verbose=False, normalize=False)  # Test before scaling
        error2, ece2, mce2, loss2 = evaluate(y_probs_test, y_test, verbose=False, normalize=True)

        df.loc[i*2] = [name, error, ece, mce, loss]
        df.loc[i*2+1] = [(name + "_beta"), error2, ece2, mce2, loss2]
        
    return df

In [295]:
df_hb = hist_results(PATH, files)


 resnet110_c10
Results on Validation Set:
Accuracy: 94.62
Error: 5.38
ECE: 0.0398645839274
MCE: 0.77392500639
Loss: 0.30016390198
Accuracy: 95.16
Error: 4.84
ECE: 0.00865248595774
MCE: 0.751706972718
Loss: 0.250228365459

 resnet110_c100
Results on Validation Set:
Accuracy: 72.1
Error: 27.9
ECE: 0.176872791523
MCE: 0.385346045511
Loss: 1.62195707678
Accuracy: 76.62
Error: 23.38
ECE: 0.0743607075159
MCE: 0.128895563238
Loss: 1.38054436247

 densenet40_c10
Results on Validation Set:
Accuracy: 93.82
Error: 6.18
ECE: 0.0435649270296
MCE: 0.452000847459
Loss: 0.331312119976
Accuracy: 94.7
Error: 5.3
ECE: 0.00877601345778
MCE: 0.191703160604
Loss: 0.263462564128

 densenet40_c100
Results on Validation Set:
Accuracy: 70.26
Error: 29.74
ECE: 0.208725975981
MCE: 0.456638015358
Loss: 2.0778621932
Accuracy: 74.94
Error: 25.06
ECE: 0.143021515149
MCE: 0.206991854032
Loss: 1.55849503173

 resnet_wide32_c10
Results on Validation Set:
Accuracy: 96.02
Error: 3.98
ECE: 0.0194082392514
MCE: 0.318583955

In [296]:
df_hb

Unnamed: 0,Name,Error,ECE,MCE,Loss
0,resnet110_c10,6.44,0.047504,0.295799,0.358274
1,resnet110_c10_beta,6.59,0.012508,0.444502,0.547192
2,resnet110_c100,28.52,0.184805,0.398817,1.693712
3,resnet110_c100_beta,31.26,0.090558,0.316352,4.213913
4,densenet40_c10,7.58,0.055003,0.333955,0.428207
5,densenet40_c10_beta,7.93,0.021302,0.451015,0.572461
6,densenet40_c100,30.0,0.211563,0.454003,2.017398
7,densenet40_c100_beta,32.49,0.119705,0.169395,4.182849
8,resnet_wide32_c10,5.01,0.026291,0.270675,0.18078
9,resnet_wide32_c10_beta,5.31,0.014299,0.266247,0.54436


In [317]:
# df, df_iso, df_beta, df_hb

dfs = [df, df_hb, df_iso, df_beta]
names = ["Name", "Uncalibrated", "Histogram Binning", "Isotonic Regression", "Temperature Scaling", "Beta Calibration"]


def get_dataframe(dfs, column, names):
    pass


column = "Error"

df_res = pd.DataFrame(columns=names)

for i in range(1, len(df_iso), 2):
    
    name = dfs[0].iloc[i-1]["Name"]
    uncalibrated = dfs[0].iloc[i-1][column]
    
    row = [name, uncalibrated]  # Add scores to row
    
    for df in dfs:
        row.append(df.iloc[i][column])
        
    df_res.loc[(i-1)//2] = row
    #errordf.iloc[i]

In [318]:
df_res

Unnamed: 0,Name,Uncalibrated,Histogram Binning,Isotonic Regression,Temperature Scaling,Beta Calibration
0,resnet110_c10,6.44,6.44,6.59,6.36,6.44
1,resnet110_c100,28.52,28.36,31.26,29.31,28.36
2,densenet40_c10,7.58,7.59,7.93,7.65,7.59
3,densenet40_c100,30.0,29.81,32.49,30.22,29.81
4,resnet_wide32_c10,5.01,5.0,5.31,5.01,5.0
5,resnet_wide32_c100,22.8,22.93,26.64,24.09,22.93
6,resnet152_imgnet,25.292,25.26,33.296,29.308,25.26
7,densenet161_imgnet,23.4,23.276,31.092,27.264,23.276
