# Skin Cancer Classification
The HAM10000 ("Human Against Machine with 10000 training images") dataset is a large collection of multi-source dermatoscopic images of pigmented lesions.

The dermatoscopic images are collected from different populations, acquired and stored by different modalities. The final dataset consists of 10015 dermatoscopic images.

It contains 7 diagnostic classes of pigmented skin lesions (not all of them cancerous), which are listed below:

* Melanocytic nevi
* Melanoma
* Benign keratosis-like lesions
* Basal cell carcinoma
* Actinic keratoses
* Vascular lesions
* Dermatofibroma

## Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os
from PIL import Image
import itertools

# Machine learning specific
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Input, concatenate, BatchNormalization
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.utils import plot_model
# from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Make the user's Google Drive visible inside the Colab filesystem; the
# dataset paths used below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## EDA and Data Preprocessing

Before we can begin training/testing the model, EDA is required to assess what preprocessing will be needed.

In [None]:
# Read the HAM10000 metadata table from Drive and preview its first rows.
metadata_csv_path = "/content/drive/My Drive/CancerImages/HAM10000_metadata.csv"
meta_data = pd.read_csv(metadata_csv_path)
meta_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [None]:
# Human-readable lesion name for each `dx` diagnosis code in the metadata.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    # No trailing space: keeps the label consistent with the others when it is
    # printed, plotted, or compared for equality.
    'bkl': 'Benign keratosis-like lesions',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma',
}

# Binary flag per `dx` code: 1 = treated as cancerous, 0 = treated as benign.
# NOTE(review): 'akiec' is flagged 0 here -- confirm this grouping against a
# clinical reference before using the flag as a label.
is_cancerous = {
    'nv': 0,
    'mel': 1,
    'bkl': 0,
    'bcc': 1,
    'akiec': 0,
    'vasc': 0,
    'df': 0,
}

# Root folder on Drive that contains the image sub-folders.
base_skin_dir = '/content/drive/My Drive/CancerImages'

# Map image id (file name without extension) -> full path for every .jpg that
# sits exactly one directory below base_skin_dir.
imageid_path_dict = {
    os.path.splitext(os.path.basename(jpg_path))[0]: jpg_path
    for jpg_path in glob(os.path.join(base_skin_dir, '*', '*.jpg'))
}

In [None]:
# Add the on-disk path of each image to meta_data (None when no file matched).
meta_data['path'] = meta_data['image_id'].map(imageid_path_dict.get)

# Fail early with a clear message if any image could not be located; otherwise
# the image-loading step would crash later on a missing path with a far less
# helpful error.
unresolved = meta_data['path'].isna()
if unresolved.any():
    raise FileNotFoundError(
        "%d image(s) listed in the metadata were not found under %r, e.g. %s"
        % (unresolved.sum(), base_skin_dir,
           meta_data.loc[unresolved, 'image_id'].head().tolist())
    )

# Add the human-readable lesion type for each diagnosis code.
meta_data['cell_type'] = meta_data['dx'].map(lesion_type_dict.get)
# Encode the lesion type as an integer class index (categories are sorted
# alphabetically by pd.Categorical).
meta_data['cell_type_idx'] = pd.Categorical(meta_data['cell_type']).codes
# Add the binary cancerous / non-cancerous flag for each diagnosis code.
meta_data['is_cancerous'] = meta_data['dx'].map(is_cancerous.get)

meta_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx,is_cancerous
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,2,0
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,2,0
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,2,0
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,2,0
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,2,0


In [None]:
# Fill missing ages with the mean age. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the column selection:
# that chained in-place form acts on an intermediate object, emits a
# FutureWarning in recent pandas, and does not update meta_data under
# Copy-on-Write.
meta_data['age'] = meta_data['age'].fillna(meta_data['age'].mean())

## Setup for ML
First, the images need to be added to the dataset.

Then need to split into train/validation/test sets.


In [None]:
def _load_image(path, size=(100, 75)):
    """Return the image at *path* as an RGB array of shape (height, width, 3).

    *size* is (width, height), the order PIL expects. The default yields
    (75, 100, 3) arrays, which is the shape the later reshape step and the
    model's input layer require, regardless of the size stored on disk.
    """
    # The context manager guarantees the file handle is released even if
    # decoding fails part-way through the dataset.
    with Image.open(path) as img:
        return np.asarray(img.convert('RGB').resize(size))


# Add the decoded (and size-normalised) image for every row of meta_data.
meta_data['image'] = meta_data['path'].map(_load_image)

# Work on a copy so meta_data itself stays available for further EDA.
df = meta_data.copy(deep=True)

In [None]:
# Preview the model inputs: every column except the integer class label.
features = df.drop(columns='cell_type_idx')

features.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,is_cancerous,image
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,0,"[[[190, 151, 198], [190, 151, 196], [191, 152,..."
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,0,"[[[24, 18, 32], [19, 13, 27], [25, 13, 33], [3..."
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,0,"[[[177, 130, 138], [185, 138, 146], [191, 141,..."
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,0,"[[[29, 9, 11], [33, 13, 15], [43, 18, 24], [53..."
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/content/drive/My Drive/CancerImages/HAM10000_...,Benign keratosis-like lesions,0,"[[[139, 89, 118], [150, 100, 129], [166, 116, ..."


In [None]:
# Separate the inputs (all columns except the label) from the target class index.
features = df.drop(columns=['cell_type_idx'])
target = df['cell_type_idx']

# Split into train/test sets (80% / 20%, fixed seed for reproducibility).
# NOTE(review): rows are split per image, but the metadata contains several
# images per lesion_id, so the same lesion can land in both sets. Consider a
# group-aware split on lesion_id to avoid leakage.
x_train_o, x_test_o, y_train_o, y_test_o = train_test_split(
    features, target, test_size=0.20, random_state=1234)

# Keep only the image arrays as model inputs.
x_train = np.asarray(x_train_o['image'].tolist())
x_test = np.asarray(x_test_o['image'].tolist())

# Standardise pixel values using statistics from the training set only, so no
# information from the test set leaks into preprocessing.
x_train_mean = np.mean(x_train)
x_train_std = np.std(x_train)

x_train = (x_train - x_train_mean) / x_train_std
x_test = (x_test - x_train_mean) / x_train_std

# One-hot encode the classification labels. num_classes is passed explicitly
# so both splits get the same width even if the highest class index happens to
# be absent from one of them.
n_classes = int(target.max()) + 1
y_train = to_categorical(y_train_o, num_classes=n_classes)
y_test = to_categorical(y_test_o, num_classes=n_classes)

# Reshape images to 3 dimensions (height = 75px, width = 100px, channels = 3).
x_train = x_train.reshape(x_train.shape[0], 75, 100, 3)
x_test = x_test.reshape(x_test.shape[0], 75, 100, 3)

## Model Setup

Now that the data has been appropriately preprocessed, the model must be set up so that it can be trained on the data.

In [None]:
# Input image shape (height, width, channels) and number of lesion classes.
input_shape = (75, 100, 3)
num_classes = 7

# Settings shared by every convolution: 3x3 kernels, ReLU, same-size output.
conv_options = dict(kernel_size=(3, 3), activation='relu', padding='Same')

# Two conv/conv/pool/dropout stages followed by a small dense classifier head.
model = Sequential([
    # Stage 1: 32 filters.
    Conv2D(32, input_shape=input_shape, **conv_options),
    Conv2D(32, **conv_options),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 2: 64 filters.
    Conv2D(64, **conv_options),
    Conv2D(64, **conv_options),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.40),
    # Classifier head: softmax over the lesion classes.
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, 75, 100, 32)       896       
                                                                 
 conv2d_5 (Conv2D)           (None, 75, 100, 32)       9248      
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 37, 50, 32)        0         
 g2D)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 37, 50, 32)        0         
                                                                 
 conv2d_6 (Conv2D)           (None, 37, 50, 64)        18496     
                                                                 
 conv2d_7 (Conv2D)           (None, 37, 50, 64)        36928     
                                                      

In [None]:
# Define the optimizer. epsilon is given as a number: passing None is not
# reliably accepted across Keras versions, because the optimizer uses the
# value directly in its update arithmetic.
optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-7, amsgrad=False)

# Compile the model for one-hot encoded multi-class targets.
model.compile(optimizer=optimizer, loss="categorical_crossentropy",
              metrics=["accuracy"])

# Learning-rate annealer: halve the learning rate when validation accuracy has
# not improved for 3 epochs, but never go below min_lr. (The lower bound is
# `min_lr`; ReduceLROnPlateau has no `learning_rate` argument.)
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                            patience=3,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00001)

epochs = 50
# Batch size actually used for training (it was previously hard-coded in the
# flow() calls while this variable went unused).
batch_size = 32

# Augmentation only. x_train is already standardised with the training-set
# mean/std, so the generator must not normalise a second time -- doing so would
# make the training inputs differ from the x_test inputs used at evaluation.
# The first 10% of x_train is reserved as the validation subset.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.1)

train_flow = datagen.flow(x_train, y_train, batch_size=batch_size,
                          subset='training')
# NOTE: the generator's random transforms are applied to this subset as well.
validation_flow = datagen.flow(x_train, y_train, batch_size=batch_size,
                               subset='validation')

# steps_per_epoch is left to Keras, which derives it from the length of the
# training iterator (one pass over the training subset per epoch). The
# learning-rate annealer is passed in so that it actually takes effect.
history = model.fit(train_flow,
                    validation_data=validation_flow,
                    epochs=epochs,
                    callbacks=[learning_rate_reduction])

Epoch 1/50


ValueError: ignored

In [None]:
# model = load_model(base_skin_dir + '/model.h5')
# plot_model(model)

## Model Performance

In [None]:
# Score the trained model on the held-out test set.
loss, accuracy = model.evaluate(x_test, y_test, verbose=1)

# There is no separate x_validate / y_validate array in this notebook: the
# validation data is the subset that `datagen` reserves from x_train via its
# validation_split. Evaluate on that same subset, in a fixed order.
# NOTE: the generator's transforms are applied here, as they are during training.
validation_eval_flow = datagen.flow(x_train, y_train, batch_size=32,
                                    subset='validation', shuffle=False)
loss_v, accuracy_v = model.evaluate(validation_eval_flow, verbose=1)

print("Validation: accuracy = %f  ;  loss_v = %f" % (accuracy_v, loss_v))
print("Test: accuracy = %f  ;  loss = %f" % (accuracy, loss))
# model.save("model.h5")

RuntimeError: ignored

In [None]:
# Class probabilities predicted by the model for every test image.
Y_pred = model.predict(x_test)

# Collapse each row of probabilities to the index of the most likely class.
Y_pred_classes = Y_pred.argmax(axis=1)

# Recover the integer class index from the one-hot encoded test labels.
Y_true = y_test.argmax(axis=1)

# Confusion matrix normalised over the true labels, so each row sums to 1.
ConfusionMatrixDisplay.from_predictions(
    Y_true,
    Y_pred_classes,
    normalize='true',
    cmap='inferno',
)
plt.grid(False)
plt.show()

## Intermediate steps


In [None]:
# Visualise what the convolutional part of the network produces for a single
# training image.

# # Original image
# plt.imshow(meta_data['image'][0])
# plt.title('Original Image')
# plt.grid(False)
# plt.axis('off')
# plt.show()


x_example = x_train[0]


# plt.imshow(x_example)
# plt.title('After preprocessing')
# plt.grid(False)
# plt.axis('off')
# plt.show()

# Add a leading batch axis: predict() expects (batch, height, width, channels).
current_image = x_example[np.newaxis, ...]

# Stack the first 7 layers of the model into a separate feature extractor. It
# gets its own name so the full classifier stays available as `model`
# (rebinding `model` here would discard the classifier for any cell run
# afterwards).
intermediate_layers = model.layers[:7]
feature_extractor = Sequential(intermediate_layers)
intermediate_image = feature_extractor.predict(current_image)

# print(current_image.shape)
# plt.imshow(current_image[0,:,:,0])
# plt.title(f'Intermediate image')
# plt.grid(False)
# plt.axis('off')
# plt.show()

# Viz Grid: show one feature-map channel per cell, filled row by row.
num_rows = 4
num_cols = 5
num_channels = intermediate_image.shape[-1]

fig, ax = plt.subplots(ncols=num_cols, nrows=num_rows, figsize=(20,10))
for idx, cell in enumerate(ax.flat):
    cell.grid(False)
    cell.axis('off')
    # Leave the cell blank if the extractor has fewer channels than grid cells.
    if idx < num_channels:
        cell.imshow(intermediate_image[0, :, :, idx])
plt.show()