> This notebook evaluates the performance of each model on the test data and then highlights the best one.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Model, load_model
import keras
from keras.preprocessing.image import ImageDataGenerator

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import os

Using TensorFlow backend.


In [4]:
# Dataset root, given relative to this notebook's location.
base_dir = "../skin-cancer-mnist-ham10000"

# Folder holding the balanced test images that every model below is scored on.
test_directory = os.path.join(
    base_dir,
    "image_data_test_balanced",
)

In [5]:
# Shared test-time generator: the only transform is scaling pixel values by 1/255.
test_data_gen = ImageDataGenerator(rescale=1.0 / 255)

> Model 1:
 - 56 x 75 x 3
 - mini_batch size = 256
 - 20 epochs
 - 16 layers

In [10]:
# Model 1 evaluation: stream the test images at the 56 x 75 input size.
# class_mode=None yields images only (no labels); shuffle=False keeps the
# prediction order aligned with test_generator1.classes, which is used as
# y_true by the scoring cells.
test_generator1 = test_data_gen.flow_from_directory(
    test_directory,
    target_size=(56, 75),
    class_mode=None,
    batch_size=256,
    shuffle=False,
)

model_1 = load_model("../saved_models/model_1.h5")

test_generator1.reset()
# `steps` must be a whole number of batches. len(generator) is
# ceil(n_samples / batch_size), so every test image is predicted exactly once
# without hard-coding the sample count (previously the fraction 196/256).
preds = model_1.predict_generator(test_generator1, steps=len(test_generator1))
# Predicted class = index of the highest score in each row.
preds1 = np.argmax(preds, axis=1)

Found 196 images belonging to 7 classes.


In [11]:
# Ground-truth class indices taken from the (unshuffled) Model 1 generator;
# the scoring cells for the other models reuse this same y_true.
y_true = test_generator1.classes

# Overall test accuracy for Model 1.
accuracy_score(y_true=y_true, y_pred=preds1)

0.5561224489795918

In [12]:
# Per-class precision / recall / F1 for Model 1.
print(classification_report(y_true=y_true, y_pred=preds1))

              precision    recall  f1-score   support

           0       0.46      0.43      0.44        28
           1       0.62      0.36      0.45        28
           2       0.52      0.54      0.53        28
           3       0.45      0.36      0.40        28
           4       0.45      0.96      0.61        28
           5       0.93      0.93      0.93        28
           6       0.60      0.32      0.42        28

    accuracy                           0.56       196
   macro avg       0.58      0.56      0.54       196
weighted avg       0.58      0.56      0.54       196



> Model 2:
 - 56 x 75 x 3
 - mini_batch size = 512
 - 20 epochs
 - 16 layers

In [13]:
# Model 2 evaluation (56 x 75 inputs).
batch_size = 512

# Pass the declared batch_size to the generator (it was hard-coded to 256
# before, so the variable above had no effect on batching).
# class_mode=None yields images only; shuffle=False keeps the prediction order
# aligned with the ground-truth labels used for scoring.
test_generator2 = test_data_gen.flow_from_directory(
    test_directory,
    target_size=(56, 75),
    class_mode=None,
    batch_size=batch_size,
    shuffle=False,
)

model_2 = load_model("../saved_models/model_2.h5")

test_generator2.reset()
# `steps` must be a whole number of batches. len(generator) is
# ceil(n_samples / batch_size), which covers every test image exactly once
# (previously the fraction 196/batch_size was passed).
preds = model_2.predict_generator(test_generator2, steps=len(test_generator2))
# Predicted class = index of the highest score in each row.
preds2 = np.argmax(preds, axis=1)


Found 196 images belonging to 7 classes.


In [14]:
# Overall test accuracy for Model 2.
accuracy_score(y_true=y_true, y_pred=preds2)

0.33163265306122447

In [15]:
# Per-class precision / recall / F1 for Model 2.
print(classification_report(y_true=y_true, y_pred=preds2))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.22      0.54      0.31        28
           2       0.22      0.07      0.11        28
           3       0.00      0.00      0.00        28
           4       0.54      0.93      0.68        28
           5       0.67      0.07      0.13        28
           6       0.38      0.71      0.50        28

    accuracy                           0.33       196
   macro avg       0.29      0.33      0.25       196
weighted avg       0.29      0.33      0.25       196



> Model 3:
 - 56 x 75 x 3
 - mini_batch size = 128
 - 20 epochs
 - 16 layers

In [11]:
# Model 3 evaluation (56 x 75 inputs).
batch_size = 128

# class_mode=None yields images only; shuffle=False keeps the prediction order
# aligned with the ground-truth labels used for scoring.
test_generator3 = test_data_gen.flow_from_directory(
    test_directory,
    target_size=(56, 75),
    class_mode=None,
    batch_size=batch_size,
    shuffle=False,
)

model_3 = load_model("../saved_models/model_3.h5")

# `steps` must be a whole number of batches. len(generator) is
# ceil(n_samples / batch_size), so the final partial batch is included
# explicitly rather than by passing the fraction 196/batch_size.
preds = model_3.predict_generator(test_generator3, steps=len(test_generator3))
# Predicted class = index of the highest score in each row.
preds3 = np.argmax(preds, axis=1)


Found 196 images belonging to 7 classes.


In [12]:
# Overall test accuracy for Model 3.
accuracy_score(y_true=y_true, y_pred=preds3)

0.5510204081632653

In [13]:
# Per-class precision / recall / F1 for Model 3.
print(classification_report(y_true=y_true, y_pred=preds3))

              precision    recall  f1-score   support

           0       0.52      0.39      0.45        28
           1       0.71      0.43      0.53        28
           2       0.29      0.25      0.27        28
           3       0.47      0.50      0.48        28
           4       0.48      1.00      0.65        28
           5       0.93      0.89      0.91        28
           6       0.58      0.39      0.47        28

    accuracy                           0.55       196
   macro avg       0.57      0.55      0.54       196
weighted avg       0.57      0.55      0.54       196



> Model 4:
 - 56 x 75 x 3
 - mini_batch size = 256
 - 20 epochs
 - pretrained layers with mobile net v2
 - +6 top layers

In [14]:
# Model 4 evaluation (56 x 75 inputs).
batch_size = 256

# Use the batch_size variable instead of repeating the literal, so the
# generator and the step count cannot drift apart.
# class_mode=None yields images only; shuffle=False keeps the prediction order
# aligned with the ground-truth labels used for scoring.
test_generator4 = test_data_gen.flow_from_directory(
    test_directory,
    target_size=(56, 75),
    class_mode=None,
    batch_size=batch_size,
    shuffle=False,
)

model_4 = load_model("../saved_models/model_4.h5")

# `steps` must be a whole number of batches. len(generator) is
# ceil(n_samples / batch_size), which covers every test image exactly once
# (previously the fraction 196/batch_size was passed).
preds = model_4.predict_generator(test_generator4, steps=len(test_generator4))
# Predicted class = index of the highest score in each row.
preds4 = np.argmax(preds, axis=1)


Found 196 images belonging to 7 classes.


W0825 18:18:02.892883 4608787904 deprecation_wrapper.py:119] From /Users/mdevlin/anaconda3/envs/deeplearning/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:1834: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.



In [15]:
# Overall test accuracy for Model 4.
accuracy_score(y_true=y_true, y_pred=preds4)

0.5153061224489796

In [16]:
# Per-class precision / recall / F1 for Model 4.
print(classification_report(y_true=y_true, y_pred=preds4))

              precision    recall  f1-score   support

           0       0.50      0.36      0.42        28
           1       0.57      0.43      0.49        28
           2       0.42      0.61      0.50        28
           3       0.56      0.50      0.53        28
           4       0.40      0.61      0.49        28
           5       0.85      0.61      0.71        28
           6       0.50      0.50      0.50        28

    accuracy                           0.52       196
   macro avg       0.54      0.52      0.52       196
weighted avg       0.54      0.52      0.52       196



> Model 5:
 - 150 x 150 x 3
 - mini_batch size = 512
 - 20 epochs
 - pretrained layers with mobile net v2
 - +6 top layers

In [17]:
# Model 5 evaluation (150 x 150 inputs).
batch_size = 512

# Pass the declared batch_size to the generator (it was hard-coded to 256
# before, so the variable above had no effect on batching).
# class_mode=None yields images only; shuffle=False keeps the prediction order
# aligned with the ground-truth labels used for scoring.
test_generator5 = test_data_gen.flow_from_directory(
    test_directory,
    target_size=(150, 150),
    class_mode=None,
    batch_size=batch_size,
    shuffle=False,
)

model_5 = load_model("../saved_models/model_5.h5")

# `steps` must be a whole number of batches. len(generator) is
# ceil(n_samples / batch_size), which covers every test image exactly once
# (previously the fraction 196/batch_size was passed).
preds = model_5.predict_generator(test_generator5, steps=len(test_generator5))
# Predicted class = index of the highest score in each row.
preds5 = np.argmax(preds, axis=1)


Found 196 images belonging to 7 classes.


In [18]:
# Overall test accuracy for Model 5.
accuracy_score(y_true=y_true, y_pred=preds5)

0.5153061224489796

In [19]:
# Per-class precision / recall / F1 for Model 5.
print(classification_report(y_true=y_true, y_pred=preds5))

              precision    recall  f1-score   support

           0       0.58      0.39      0.47        28
           1       0.69      0.39      0.50        28
           2       0.29      0.64      0.40        28
           3       0.73      0.29      0.41        28
           4       0.44      0.79      0.56        28
           5       0.96      0.82      0.88        28
           6       0.57      0.29      0.38        28

    accuracy                           0.52       196
   macro avg       0.61      0.52      0.52       196
weighted avg       0.61      0.52      0.52       196



> Model 6:
 - 150 x 150 x 3
 - mini_batch size = 256
 - 20 epochs
 - pretrained layers with mobile net v2
 - +6 top layers

In [20]:
# Model 6 evaluation (150 x 150 inputs).
batch_size = 256

# Use the batch_size variable instead of repeating the literal, so the
# generator and the step count cannot drift apart.
# class_mode=None yields images only; shuffle=False keeps the prediction order
# aligned with the ground-truth labels used for scoring.
test_generator6 = test_data_gen.flow_from_directory(
    test_directory,
    target_size=(150, 150),
    class_mode=None,
    batch_size=batch_size,
    shuffle=False,
)

model_6 = load_model("../saved_models/model_6.h5")

# `steps` must be a whole number of batches. len(generator) is
# ceil(n_samples / batch_size), which covers every test image exactly once
# (previously the fraction 196/batch_size was passed).
preds = model_6.predict_generator(test_generator6, steps=len(test_generator6))
# Predicted class = index of the highest score in each row.
preds6 = np.argmax(preds, axis=1)


Found 196 images belonging to 7 classes.


In [21]:
# Overall test accuracy for Model 6.
accuracy_score(y_true=y_true, y_pred=preds6)

0.5051020408163265

In [22]:
# Per-class precision / recall / F1 for Model 6.
print(classification_report(y_true=y_true, y_pred=preds6))

              precision    recall  f1-score   support

           0       0.58      0.39      0.47        28
           1       0.67      0.14      0.24        28
           2       0.29      0.43      0.35        28
           3       0.51      0.93      0.66        28
           4       0.47      0.75      0.58        28
           5       0.77      0.86      0.81        28
           6       0.33      0.04      0.06        28

    accuracy                           0.51       196
   macro avg       0.52      0.51      0.45       196
weighted avg       0.52      0.51      0.45       196



In [27]:
# Model 7 evaluation (150 x 150 inputs).
batch_size = 256

# Use the batch_size variable instead of repeating the literal, so the
# generator and the step count cannot drift apart.
# class_mode=None yields images only; shuffle=False keeps the prediction order
# aligned with the ground-truth labels used for scoring.
test_generator7 = test_data_gen.flow_from_directory(
    test_directory,
    target_size=(150, 150),
    class_mode=None,
    batch_size=batch_size,
    shuffle=False,
)

model_7 = load_model("../saved_models/model_7.h5")

# `steps` must be a whole number of batches. len(generator) is
# ceil(n_samples / batch_size), which covers every test image exactly once
# (previously the fraction 196/batch_size was passed).
preds = model_7.predict_generator(test_generator7, steps=len(test_generator7))
# Predicted class = index of the highest score in each row.
preds7 = np.argmax(preds, axis=1)

Found 196 images belonging to 7 classes.


W0826 07:31:29.108899 4653954496 deprecation_wrapper.py:119] From /Users/mdevlin/anaconda3/envs/deeplearning/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:1834: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.



In [28]:
# Overall test accuracy for Model 7.
accuracy_score(y_true=y_true, y_pred=preds7)

0.5816326530612245

In [29]:
# Per-class precision / recall / F1 for Model 7.
print(classification_report(y_true=y_true, y_pred=preds7))

              precision    recall  f1-score   support

           0       0.70      0.50      0.58        28
           1       0.68      0.61      0.64        28
           2       0.37      0.68      0.48        28
           3       0.86      0.21      0.34        28
           4       0.50      0.75      0.60        28
           5       1.00      0.79      0.88        28
           6       0.52      0.54      0.53        28

    accuracy                           0.58       196
   macro avg       0.66      0.58      0.58       196
weighted avg       0.66      0.58      0.58       196



## Best Model: Model 2?

In [22]:
# Confusion matrix for Model 2 as a labelled table:
# rows are the true classes (true_0..true_6), columns the predicted ones (pred_0..pred_6).
pd.DataFrame(
    confusion_matrix(y_true, preds2),
    index=[f"true_{k}" for k in range(7)],
    columns=[f"pred_{k}" for k in range(7)],
)

Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6
true_0,0,19,0,4,2,0,3
true_1,1,15,3,4,0,0,5
true_2,1,5,2,3,4,0,13
true_3,0,19,2,0,2,1,4
true_4,0,0,0,0,26,0,2
true_5,0,9,1,0,11,2,5
true_6,0,1,1,3,3,0,20


In [23]:
# Per-class precision / recall / F1 for Model 2, shown again next to its confusion matrix.
print(classification_report(y_true=y_true, y_pred=preds2))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.22      0.54      0.31        28
           2       0.22      0.07      0.11        28
           3       0.00      0.00      0.00        28
           4       0.54      0.93      0.68        28
           5       0.67      0.07      0.13        28
           6       0.38      0.71      0.50        28

    accuracy                           0.33       196
   macro avg       0.29      0.33      0.25       196
weighted avg       0.29      0.33      0.25       196



> Low precision means that class 6 was predicted in many cases where the true class was not class 6. High recall means that we caught 71% of the cases that were melanoma.