# Inspect Model Predictions

This notebook explores the model predictions made on the new test set.  The following cell contains setup code.

In [7]:
%load_ext autoreload
%autoreload 2

import json
import math
import pickle
import random

from IPython.display import display
from ipywidgets import widgets
from ipywidgets import Layout
import numpy as np
import pandas as pd

import qgrid
import tqdm
import sys
import os

# Directory searched for the project-local modules imported right after this
# (tinyimages, utils, cifar10 presumably live there -- confirm against the repo layout).
repo_root = os.path.join(os.getcwd(), '../code')
# Guard the append so that re-running this setup cell does not keep adding
# duplicate entries to sys.path.
if repo_root not in sys.path:
    sys.path.append(repo_root)

import tinyimages
import utils
import cifar10

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Identifier handed to the utils loaders to select which prediction data to read.
version = '4'

# Fetch the prediction table for that version and show it in an interactive grid.
df = utils.get_prediction_dataframe(version)
qgrid_widget = qgrid.show_grid(
    df,
    precision=2,
    grid_options={'maxVisibleRows': 30},
)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

## Accuracy per class

In [9]:
# --- Load labels and predictions ---------------------------------------------
cifar_label_names = utils.cifar10_label_names
num_classes = len(cifar_label_names)

cifar = cifar10.CIFAR10Data('../other_data/cifar10')
original_test_labels = cifar.eval_labels
# Only the second element of the returned pair is needed here. The first one is
# bound to a throwaway name instead of `_` so that IPython's "last output"
# variable `_` is not shadowed for the rest of the session.
_unused, new_true_labels = utils.load_new_test_data(version)

model_names = utils.get_model_names()
original_predictions = utils.get_original_predictions()
new_predictions = utils.get_new_predictions(version)

# --- Group example indices by true class -------------------------------------
# Entry k of each list holds the positions of all examples whose label is k.
# The labels are used directly as list indices, so they are assumed to be
# integers in range(num_classes) -- TODO confirm against the data loaders.
orig_test_indices_by_class = [[] for _ in range(num_classes)]
new_test_indices_by_class = [[] for _ in range(num_classes)]

for example_idx, label in enumerate(original_test_labels):
    orig_test_indices_by_class[label].append(example_idx)

for example_idx, label in enumerate(new_true_labels):
    new_test_indices_by_class[label].append(example_idx)

# --- Per-class accuracy of every model ---------------------------------------
# Maps model name -> array with one accuracy value per class.
# NOTE(review): predictions and labels are indexed with lists of positions,
# which presumes NumPy-style integer-array indexing -- confirm the loaders
# return arrays.
orig_per_class_accuracies = {}
new_per_class_accuracies = {}

for m in model_names:
    # Sized by num_classes rather than a hard-coded 10 so the arrays always
    # agree with the label-name list used everywhere else in the notebook.
    cur_orig_accuracies = np.zeros(num_classes)
    cur_new_accuracies = np.zeros(num_classes)
    for ii in range(num_classes):
        orig_idx = orig_test_indices_by_class[ii]
        new_idx = new_test_indices_by_class[ii]
        cur_orig_accuracies[ii] = utils.compute_accuracy(original_predictions[m][orig_idx],
                                                         original_test_labels[orig_idx])
        cur_new_accuracies[ii] = utils.compute_accuracy(new_predictions[m][new_idx],
                                                        new_true_labels[new_idx])
    orig_per_class_accuracies[m] = cur_orig_accuracies
    new_per_class_accuracies[m] = cur_new_accuracies

# --- Table for the original test set -----------------------------------------
# Values are scaled by 100 (presumably turning fractions into percentages).
# After the transpose there is one row per model and one column per class.
pd_data_orig_class_accuracy = {
    m: {
        c: 100.0 * orig_per_class_accuracies[m][ii]
        for ii, c in enumerate(cifar_label_names)
    }
    for m in model_names
}

df_data_orig_class_accuracy = pd.DataFrame(pd_data_orig_class_accuracy).transpose()

qgrid_widget_orig = qgrid.show_grid(df_data_orig_class_accuracy, precision=2, grid_options={'maxVisibleRows': 30})
qgrid_widget_orig

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [10]:
# Nested mapping model name -> {class name -> accuracy value scaled by 100},
# built from the per-class accuracies stored in new_per_class_accuracies.
pd_data_new_class_accuracy = {
    model: {
        class_name: 100.0 * new_per_class_accuracies[model][class_idx]
        for class_idx, class_name in enumerate(cifar_label_names)
    }
    for model in model_names
}

# The outer keys become columns, so transpose to get one row per model and
# one column per class.
df_data_new_class_accuracy = pd.DataFrame(pd_data_new_class_accuracy).transpose()

qgrid_widget_new = qgrid.show_grid(
    df_data_new_class_accuracy,
    precision=2,
    grid_options={'maxVisibleRows': 30},
)
qgrid_widget_new

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [11]:
# Nested mapping model name -> {class name -> accuracy difference scaled by 100}.
# The difference is taken as original minus new, so a positive value means the
# model scored higher on the original data for that class.
pd_data_diff_class_accuracy = {
    model: {
        class_name: 100.0 * (
            orig_per_class_accuracies[model][class_idx]
            - new_per_class_accuracies[model][class_idx]
        )
        for class_idx, class_name in enumerate(cifar_label_names)
    }
    for model in model_names
}

# The outer keys become columns, so transpose to get one row per model and
# one column per class.
df_data_diff_class_accuracy = pd.DataFrame(pd_data_diff_class_accuracy).transpose()

qgrid_widget_diff = qgrid.show_grid(
    df_data_diff_class_accuracy,
    precision=2,
    grid_options={'maxVisibleRows': 30},
)
qgrid_widget_diff

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [12]:
# Classes whose examples are left out of the accuracy comparison below.
classes_to_exclude = ['airplane', 'automobile']

# Fail loudly on a misspelled class name: an unknown name would otherwise match
# nothing, and the "partial" numbers would silently cover every class.
unknown_classes = sorted(set(classes_to_exclude) - set(cifar_label_names))
assert not unknown_classes, 'Unknown class names in classes_to_exclude: {}'.format(unknown_classes)

# Collect the example indices of every class that is kept, in class order.
orig_test_indices_included = []
new_test_indices_included = []
for ii in range(num_classes):
    if cifar_label_names[ii] in classes_to_exclude:
        continue
    orig_test_indices_included.extend(orig_test_indices_by_class[ii])
    new_test_indices_included.extend(new_test_indices_by_class[ii])

print('Number of included original test images: {}'.format(len(orig_test_indices_included)))
print('Number of included new test images: {}'.format(len(new_test_indices_included)))

# One record per model. Accuracies are scaled by 100 (presumably into percent),
# and the errors are their complements to 100.
pd_data_partial = {}
for m in model_names:
    cur_dict = {}
    pd_data_partial[m] = cur_dict
    cur_dict['new_accuracy'] = 100 * utils.compute_accuracy(new_predictions[m][new_test_indices_included],
                                                            new_true_labels[new_test_indices_included])
    cur_dict['orig_accuracy'] = 100 * utils.compute_accuracy(original_predictions[m][orig_test_indices_included],
                                                             original_test_labels[orig_test_indices_included])
    cur_dict['gap'] = cur_dict['orig_accuracy'] - cur_dict['new_accuracy']
    cur_dict['orig_error'] = 100 - cur_dict['orig_accuracy']
    cur_dict['new_error'] = 100 - cur_dict['new_accuracy']
    if cur_dict['orig_error'] != 0:
        cur_dict['error_ratio'] = cur_dict['new_error'] / cur_dict['orig_error']
    else:
        # A model with zero error on the original subset has no finite ratio.
        # Follow the IEEE convention (x/0 -> inf, 0/0 -> nan) explicitly, so a
        # plain Python float does not raise ZeroDivisionError here.
        cur_dict['error_ratio'] = float('inf') if cur_dict['new_error'] > 0 else float('nan')

# Transpose to one row per model and fix the column order for display.
df_partial = pd.DataFrame(pd_data_partial).transpose()[['orig_accuracy', 'new_accuracy', 'gap', 'orig_error', 'new_error', 'error_ratio']]

qgrid_widget = qgrid.show_grid(df_partial, precision=2, grid_options={'maxVisibleRows': 30})
qgrid_widget

Number of included original test images: 8000
Number of included new test images: 1601


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…