# HAM10000 ConvNet

Convolutional neural network for identifying skin lesions, using the [HAM10000 dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T). 

I trained the ConvNet using [Google Colaboratory](https://colab.research.google.com/). 

## Setup

Import/install required libraries.

In [0]:
%tensorflow_version 2.x
%matplotlib inline 

import io
import json
import math
import numpy as np
import os
import pandas as pd
import seaborn as sn
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.python.keras.layers import Conv2D, MaxPooling2D
from tensorflow.python.keras.layers import Dense, Dropout, Flatten
from tensorflow.python.keras.layers.advanced_activations import LeakyReLU
from tensorflow.python.keras.layers.normalization import BatchNormalization
from tensorflow.python.keras.models import Sequential, Input, Model

# Shared configuration for every cell below.

# Training hyper-parameters (used as the defaults of train_conv_net).
batch_size = 64
epochs = 30

# Diagnosis codes; a code's position in this list is its integer class id,
# and dx_ints is the matching code -> id lookup built from that order.
dx_list = ["akiec", "bcc", "bkl", "df", "nv", "vasc", "mel"]
dx_ints = {dx_code: class_id for class_id, dx_code in enumerate(dx_list)}
num_classes = len(dx_list)

# Project folder on the mounted Google Drive (input CSVs and saved models).
base_dir = "/content/drive/My Drive/Colab Notebooks/HAM_ConvNet"

Mounts Google Drive, where I stored the input data sets. 

In [22]:
# Colab-specific helper used to attach Google Drive to this runtime.
from google.colab import drive
# The mount point must stay in sync with the "/content/drive" prefix of base_dir.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Function that pre-processes the data and splits it into training, validation, and test subsets.

In [0]:
def preprocess_data(file_name):
  """Load an image CSV and split it into train, validation and test subsets.

  The CSV is read from ``base_dir``. Every column except the last one is
  treated as one pixel of a flattened square grayscale image; the ``label``
  column holds the integer class of each row.

  Args:
    file_name: name of the CSV file inside ``base_dir``.

  Returns:
    dict with the keys
      "trX", "valX", "tesX": float32 image arrays of shape
        (n, img_dim, img_dim, 1), pixel values divided by 255.
      "trLab", "valLab", "tesYhot": one-hot labels with ``num_classes``
        columns.
      "tesY": integer labels of the test subset.
      "img_dim": side length of the square images.

  Raises:
    ValueError: if the number of pixel columns is not a perfect square.
  """
  # read data
  dataset = pd.read_csv(f"{base_dir}/{file_name}", header=0)

  # infer image resolution from the pixel columns only (the last column is
  # the label) and fail loudly if the images cannot be square
  n_pixels = dataset.shape[1] - 1
  img_dim = int(round(math.sqrt(n_pixels)))
  if img_dim * img_dim != n_pixels:
    raise ValueError(
        f"{file_name}: {n_pixels} pixel columns do not form a square image")

  # split data to images and labels; one vectorised reshape replaces the
  # per-row iterrows() loop and yields the same (n, img_dim, img_dim) array
  data_x = dataset.iloc[:, :-1].to_numpy().reshape(-1, img_dim, img_dim)
  data_y = dataset.loc[:, "label"].to_numpy()

  # split to training and testing subsets
  train_X, test_X, train_Y, test_Y = train_test_split(data_x, data_y,
                                                      test_size=0.2,
                                                      random_state=624)

  # add the single grayscale channel and convert values to the 0-1 scale
  train_X = train_X.reshape(-1, img_dim, img_dim, 1).astype('float32') / 255.
  test_X = test_X.reshape(-1, img_dim, img_dim, 1).astype('float32') / 255.

  # change the labels from categorical to one-hot encoding; num_classes is
  # passed explicitly so the width always matches the network's output layer,
  # even if the highest class id happens to be absent from a subset
  train_Y_one_hot = to_categorical(train_Y, num_classes=num_classes)
  test_Y_one_hot = to_categorical(test_Y, num_classes=num_classes)

  # split train data again to create validation subset
  train_X, valid_X, train_label, valid_label = train_test_split(
      train_X, train_Y_one_hot, test_size=0.2, random_state=624)

  # package results in dictionary
  res = {"trX": train_X, "valX": valid_X,
         "trLab": train_label, "valLab": valid_label,
         "tesX": test_X, "tesYhot": test_Y_one_hot, "tesY": test_Y,
         "img_dim": img_dim}

  return res

Function to create and compile the ConvNet model.

In [0]:
def create_conv_net_model(img_dim):
  """Build and compile the ConvNet for square single-channel images.

  The layer configuration follows this tutorial:
  https://www.datacamp.com/community/tutorials/convolutional-neural-networks-python

  Args:
    img_dim: side length of the input images; the network's input shape is
      (img_dim, img_dim, 1).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric) ending in a ``num_classes``-way softmax.
  """
  # (number of filters, dropout rate) of the three convolutional stages
  conv_stages = [(32, 0.25), (64, 0.25), (128, 0.4)]

  ham_model = Sequential()

  # each stage: 3x3 convolution -> LeakyReLU -> 2x2 max-pooling -> dropout
  for stage_no, (n_filters, drop_rate) in enumerate(conv_stages):
    # only the very first layer declares the input shape
    first_layer_args = {}
    if stage_no == 0:
      first_layer_args["input_shape"] = (img_dim, img_dim, 1)
    ham_model.add(Conv2D(n_filters, kernel_size=(3, 3), activation='linear',
                         padding='same', **first_layer_args))
    ham_model.add(LeakyReLU(alpha=0.1))
    ham_model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
    ham_model.add(Dropout(drop_rate))

  # classifier head
  ham_model.add(Flatten())
  ham_model.add(Dense(128, activation='linear'))
  ham_model.add(LeakyReLU(alpha=0.1))
  ham_model.add(Dropout(0.3))
  ham_model.add(Dense(num_classes, activation='softmax'))

  # compile model
  ham_model.compile(loss=keras.losses.categorical_crossentropy,
                    optimizer=keras.optimizers.Adam(),
                    metrics=['accuracy'])

  return ham_model

Function that trains and evaluates the network. 

In [0]:
def train_conv_net(data_file_name, batch_size=batch_size, epochs=epochs, 
                   verbose=1):
  """Train, save and evaluate a ConvNet on one data set.

  Args:
    data_file_name: name of the CSV file (inside ``base_dir``) to train on.
    batch_size: mini-batch size passed to ``model.fit``; the default is the
      module-level ``batch_size`` as it was when this function was defined.
    epochs: number of training epochs; the default is the module-level
      ``epochs`` as it was when this function was defined.
    verbose: verbosity level passed to ``model.fit``.

  Returns:
    dict with the keys
      "test_eval": result of ``model.evaluate`` on the test subset
        (printed below as loss and accuracy).
      "accuracy", "val_accuracy", "loss", "val_loss": per-epoch training
        history.
      "predicted_classes": predicted class id for every test image.
      "test_Y": true class ids of the test images.
      "confusion_matrix": DataFrame indexed and labelled by ``dx_list``.
      "report": classification report as a DataFrame.
      "model": the trained model.
  """
  # get data
  data = preprocess_data(data_file_name)

  # create model
  model = create_conv_net_model(data["img_dim"])

  # train model (keyword arguments so nothing depends on fit's parameter order)
  model_train_info = model.fit(data["trX"], data["trLab"],
                               batch_size=batch_size, epochs=epochs,
                               verbose=verbose,
                               validation_data=(data["valX"],
                                                data["valLab"]))

  # save trained model; drop only a literal ".csv" suffix -- str.strip('.csv')
  # would strip any of the characters '.', 'c', 's', 'v' from BOTH ends
  if data_file_name.endswith(".csv"):
    model_stem = data_file_name[:-len(".csv")]
  else:
    model_stem = data_file_name
  model_dir_name = f"{model_stem}_model"
  model.save(f"{base_dir}/HAM_models/{model_dir_name}")

  # evaluate
  test_eval = model.evaluate(data["tesX"], data["tesYhot"])
  print('Test loss:', test_eval[0])
  print('Test accuracy:', test_eval[1])

  # get training info
  accuracy = model_train_info.history['accuracy']
  val_accuracy = model_train_info.history['val_accuracy']
  loss = model_train_info.history['loss']
  val_loss = model_train_info.history['val_loss']

  # make predictions: take the most probable class directly. Rounding the
  # probabilities first turns every row without a probability above 0.5 into
  # all zeros, which argmax would then report as class 0.
  class_probabilities = model.predict(data["tesX"])
  predicted_classes = np.argmax(class_probabilities, axis=1)

  # explicit label list keeps the report and the confusion matrix aligned with
  # dx_list even if some class is missing from the test subset / predictions
  class_ids = list(range(num_classes))

  # create classification report
  report = classification_report(data["tesY"], predicted_classes,
                                 labels=class_ids, target_names=dx_list,
                                 output_dict=True)
  report = pd.DataFrame(report)

  # create confusion matrix
  cm = confusion_matrix(data["tesY"], predicted_classes, labels=class_ids)
  df_cm = pd.DataFrame(cm, dx_list, dx_list)

  # package results 
  res = {"test_eval": test_eval, "accuracy": accuracy, 
         "val_accuracy": val_accuracy, "loss": loss, "val_loss": val_loss,
         "predicted_classes": predicted_classes, "test_Y": data["tesY"],
         "confusion_matrix": df_cm, "report": report, "model": model}
  
  return res

## Training networks

I trained five ConvNets, one on each of the following five data sets:

1.   Data set with 8x8 pixel images (as provided in the HAM data set).
2.   Data set with 28x28 images (as provided in the HAM data set).
3.   Data set with 28x28 images, same as above, but with each lesion type augmented to 2000 images per category. 
4.   Data set with 42x42 images, that I created, with each lesion type augmented to 2000 images per category. 
5.   Data set with 64x64 images, that I created, with each lesion type augmented to 2000 images per category.

The networks were all trained for 30 epochs. The cells below show information about the training of these networks, as well as their evaluation.

In [19]:
# Network 1: 8x8 images as provided in the HAM data set.
net8 = train_conv_net("hmnist_8_8_L.csv")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: /content/drive/My Drive/Colab Notebooks/HAM_ConvNet/HAM_models/hmnist_8_8_L_model/assets
Test loss: 0.8783152103424072
Test accuracy: 0.6959560513496399


  _warn_prf(average, modifier, msg_start, len(result))


In [0]:
# Network 2: 28x28 images as provided in the HAM data set.
net28 = train_conv_net("hmnist_28_28_L.csv")

In [0]:
# Network 3: 28x28 images, augmented to 2000 images per category.
net28_aug = train_conv_net("hmnist_28_28_2000.csv")

In [0]:
# Network 4: 42x42 images, augmented to 2000 images per category.
net42_aug = train_conv_net("hmnist_42_42_2000.csv")

In [0]:
# Network 5: 64x64 images, augmented to 2000 images per category.
net64_aug = train_conv_net("hmnist_64_64_2000.csv")

## Results

The results from the training of networks are presented in this section. 

Below I present the results in a series of graphs.

In [0]:
# Results of the five training runs, in the order they are plotted
# (one figure column per network).
all_data = [net8, net28, net28_aug, net42_aug, net64_aug]

titles = ["8x8 (as provided)", "28x28 (as provided)", "28x28 (augmented)", 
          "42x42 (augmented)", "64x64 (augmented)"]

# Find colorbar max for confusion matrices, so all heatmaps share one scale
cbar_max = max(d["confusion_matrix"].to_numpy().max() for d in all_data)

# create figure: rows = accuracy, loss, precision, confusion matrix.
# squeeze=False keeps `axes` two-dimensional for any number of networks.
fig, axes = plt.subplots(4, len(all_data), sharey="row", figsize=(18,14),
                         num="default", squeeze=False)

# add graphs to figure
for i, (r, title) in enumerate(zip(all_data, titles)):
  acc_ax, loss_ax, prec_ax, cm_ax = axes[:, i]
  first_column = (i == 0)

  # x positions come from the recorded history, not the global `epochs`,
  # so the plot stays correct if a run used a different epoch count
  epch = range(len(r["accuracy"]))

  acc_ax.set_title(title, fontsize=14)

  # Training and validation accuracy graphs
  if first_column:
    acc_ax.set_ylabel("Training and validation accuracy", fontsize=13)
  acc_ax.set_xlabel("Epochs", fontsize=10)
  acc_ax.plot(epch, r["accuracy"], "bo", label='Training accuracy')
  acc_ax.plot(epch, r["val_accuracy"], "b", label='Validation accuracy')
  acc_ax.legend(fontsize=11)
  acc_ax.tick_params(axis='both', which='major', labelsize=11)

  # Training and validation loss graphs. The colour is given once, in the
  # format string; combining a "b..." format with color="red" is redundant
  # and makes matplotlib emit a warning.
  if first_column:
    loss_ax.set_ylabel("Training and validation loss", fontsize=13)
  loss_ax.set_xlabel("Epochs", fontsize=10)
  loss_ax.plot(epch, r["loss"], "ro", label='Training loss')
  loss_ax.plot(epch, r["val_loss"], "r", label='Validation loss')
  loss_ax.legend(fontsize=11)
  loss_ax.tick_params(axis='both', which='major', labelsize=11)

  # Prediction precision graphs (one bar per diagnosis code)
  df = r["report"].T
  bar_data = df.loc["akiec":"mel","precision"]
  if first_column:
    prec_ax.set_ylabel("Prediction precision", fontsize=13)
  # explicit list of colours: passing the bare string "rgbcmyk" relies on the
  # deprecated "string as a sequence of single-character colours" behaviour
  prec_ax.bar(bar_data.index, bar_data, color=list("rgbcmyk"))
  prec_ax.tick_params(axis='both', which='major', labelsize=11)

  # Confusion matrices: reuse the matrix stored with the results instead of
  # recomputing it; only the last column draws the shared colorbar
  sn.set(font_scale=.7)
  sn.heatmap(r["confusion_matrix"], ax=cm_ax, vmin=0, vmax=cbar_max,
             annot=True, cmap="YlGnBu", cbar=(i == len(all_data) - 1),
             fmt="d", linewidths=0.1, linecolor="black")
  if first_column:
    cm_ax.set_ylabel("Confusion matrices", fontsize=13)

fig.tight_layout()
fig.savefig("figure_convNetRes.svg", bbox_inches="tight")
fig.savefig("figure_convNetRes.pdf", bbox_inches="tight")
# plt.show() rather than fig.show(): the latter warns under the non-GUI
# inline backend selected by %matplotlib inline
plt.show()

In [0]:
# create results table
import datetime
evaluation_data = pd.DataFrame([d["test_eval"] for d in all_data], index=titles,
                               columns=["Test loss", "Test accuracy"])
times = [67, 508, 1313, 3163, 5906]
evaluation_data["Training time"] = [str(datetime.timedelta(seconds=i)) for i 
                                    in times]
evaluation_data["Epochs"] = [epochs]*5

# re-arrange columns 
evaluation_data = evaluation_data.iloc[:,[3,2,0,1]]
# save to file
evaluation_data.to_csv("table_results.csv")

evaluation_data

In [0]:
# Convert the network trained on the augmented 28x28 data set to TensorFlow Lite.
converter = tf.lite.TFLiteConverter.from_keras_model(net28_aug["model"])
tflite_model = converter.convert()
# convert() returns the serialized model as a plain bytes object (see the
# printed type), so there is no .save() method; it is written to disk by hand.
print(type(tflite_model))

<class 'bytes'>


In [0]:
# Report the size of the converted model in bytes, then store it on Drive.
print(len(tflite_model))
# "wb": the model is raw bytes, so the file must be opened in binary mode.
with open(f"{base_dir}/HAM_models/tflite_28", "wb") as f:
  f.write(tflite_model)


1426968
