# Temperature Scaling

Testing the temperature scaling calibration method on neural network outputs for image data.

In [29]:
from sklearn.metrics import log_loss
from keras.losses import categorical_crossentropy
from scipy.optimize import minimize 
import numpy as np
from utility.unpickle_probs import unpickle_probs
from utility.calibration import ECE, MCE
from os.path import join
import sklearn.metrics as metrics


## Load in Data

In [117]:
# Directory that holds the pickled network outputs (relative to this notebook).
PATH = join('..', '..', 'Semester IV', 'NN_image_probs')
# NOTE: parentheses without a trailing comma do not create a tuple; `files` is a plain str.
files = ('probs_resnet110_c100_logits.p')
# unpickle_probs needs the full path to the pickle, so build it from directory + file name.
FILE_PATH = join(PATH, files)

# The file name indicates logits (not probabilities), hence the variable names.
# The second positional argument is presumably a verbose flag that prints the shapes - TODO confirm.
(y_logits_val, y_val), (y_logits_test, y_test) = unpickle_probs(FILE_PATH, True)


y_probs_val: (5000, 100)
y_true_val: (5000, 1)
y_probs_test: (10000, 100)
y_true_test: (10000, 1)


### Check out the Negative Log Likelihood (NLL).

In [120]:
def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Defined here, at its first use, so that the notebook runs top-to-bottom on
    a fresh kernel (the same helper is also defined in the final cell).

    Args:
        x: array of scores with one sample per row (classes along axis 1).

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting each row's own maximum keeps exp() from overflowing and
    # guarantees at least one term per row equals exp(0) == 1.
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)


# Turn the raw logits of both splits into class probabilities.
y_probs_val = softmax(y_logits_val)
y_probs_test = softmax(y_logits_test)

### Sklearn method

In [173]:
# NLL of the uncalibrated probabilities on the validation split.
log_loss(y_val, y_probs_val)

1.999253023824082

In [174]:
# NLL of the uncalibrated probabilities on the test split.
log_loss(y_test, y_probs_test)

2.6449219236379986

**TODO:** The NLL computed here differs from the model's actual output, and the reason is not yet known. Investigate the discrepancy.

## Temperature Scaling

In [76]:
def temp_scale(y_probs, x):
    """Scale the model outputs by a temperature.

    Args:
        y_probs: scores to scale. Despite the name, the callers in this
            notebook pass raw logits, not probabilities.
        x: temperature to divide by.

    Returns:
        ``y_probs`` divided element-wise by ``x``.
    """
    scaled = y_probs / x
    return scaled

In [122]:
def loss(x, y_probs, y_true):
    """Objective for fitting the temperature: NLL of the temperature-scaled outputs.

    Args:
        x: candidate temperature (this is the variable the optimiser changes).
        y_probs: scores to be scaled; the notebook passes raw logits here.
        y_true: ground-truth labels matching the rows of ``y_probs``.

    Returns:
        The value of sklearn's ``log_loss`` for the softmax of the scaled scores.
    """
    # Scale first, then normalise, so the temperature acts on the scores
    # rather than on already-normalised probabilities.
    calibrated = softmax(temp_scale(y_probs, x))
    return log_loss(y_true=y_true, y_pred=calibrated)

In [127]:
# Fit the temperature on the validation split by minimising the NLL with BFGS.
opt = minimize(
    loss,
    x0=0.1,
    args=(y_logits_val, y_val),
    method="BFGS",
    options={'maxiter': 50},
)

In [128]:
# Show the optimiser's result object (final loss `fun`, fitted value `x`, convergence info).
opt

      fun: 1.0636127548289283
 hess_inv: array([[ 64.640641]])
      jac: array([  8.19563866e-07])
  message: 'Optimization terminated successfully.'
     nfev: 30
      nit: 2
     njev: 10
   status: 0
  success: True
        x: array([ 2.31021816])

In [80]:
def evaluate(y_probs, y_true, verbose = False, bins = 15):
    """Compute error rate and calibration metrics for a set of predictions.

    Args:
        y_probs: 2-D array of class probabilities, one sample per row
            (classes along axis 1).
        y_true: ground-truth labels for the rows of ``y_probs``.
        verbose: when True, also print accuracy, error, ECE and MCE.
        bins: number of confidence bins; passed to ``ECE``/``MCE`` as
            ``bin_size = 1/bins``.

    Returns:
        Tuple ``(error, ece, mce)`` where ``error`` is the error rate in
        percent and ``ece``/``mce`` are the values returned by the ``ECE`` and
        ``MCE`` helpers from ``utility.calibration``.
    """
    y_preds = np.argmax(y_probs, axis=1)  # Take maximum confidence as prediction
    y_confs = np.max(y_probs, axis=1)  # Take only maximum confidence

    accuracy = metrics.accuracy_score(y_true, y_preds) * 100
    error = 100 - accuracy

    # Both calibration metrics use the same equal-width binning.
    bin_size = 1 / bins
    # Calculate ECE
    ece = ECE(y_confs, y_preds, y_true, bin_size = bin_size)
    # Calculate MCE
    mce = MCE(y_confs, y_preds, y_true, bin_size = bin_size)

    if verbose:
        print("Accuracy:", accuracy)
        print("Error:", error)
        print("ECE:", ece)
        print("MCE:", mce)

    # Return the metrics so callers can use them without parsing printed output.
    return (error, ece, mce)

In [130]:
# Take the fitted temperature straight from the optimiser result instead of a
# hand-copied literal, so this cell stays correct if the fit is re-run.
# `opt.x` is a 1-element array; convert it to a plain float.
temp = float(opt.x[0])
# NLL on the test split after temperature scaling.
log_loss(y_true=y_test, y_pred=softmax(temp_scale(y_logits_test, temp)))

1.0916913444706431

In [171]:
# Baseline: metrics of the uncalibrated model on the test split.
evaluate(
    y_probs=softmax(y_logits_test),
    y_true=y_test,
    verbose=True,
)

Accuracy: 71.48
Error: 28.52
ECE: 0.184804543945
MCE: 0.398817125148


In [172]:
# Same metrics after dividing the test logits by the fitted temperature.
evaluate(
    y_probs=softmax(temp_scale(y_logits_test, temp)),
    y_true=y_test,
    verbose=True,
)

Accuracy: 71.48
Error: 28.52
ECE: 0.0237969729677
MCE: 0.0709915722652


In [168]:
def softmax(x):
    """Compute the softmax of each row of scores in x.

    Args:
        x: array of scores with one sample per row (normalised along axis 1).

    Returns:
        Array of the same shape whose entries along axis 1 are non-negative
        and sum to 1.
    """
    x = np.asarray(x)
    # Shift by each row's own maximum, not the global one: with a global shift,
    # a row far below the global maximum underflows to all zeros and the
    # division below yields NaN.
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)