# Imports

In [None]:
# os, filepath, randomizers
import os
import random

# key libraries required
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle as pk

# specific imports from tensorflow
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dropout, Flatten, Dense
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import FalseNegatives, FalsePositives, TrueNegatives, TruePositives
from tensorflow.keras.applications import ResNet50, VGG19

# specific imports from sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

# keras backend and image handling
from keras import backend as K
from PIL import Image

%config InlineBackend.figure_format='retina'

# Global matplotlib styling: heavy-weight titles, LaTeX text rendering, 14pt serif font.
parameters = {
    'axes.titleweight': 'heavy',
    'figure.titleweight': 'heavy',
}
plt.rcParams.update(parameters)

# With usetex enabled all figure text is typeset by LaTeX, so labels must use
# LaTeX escapes (e.g. \% and \&).
plt.rc('text', usetex=True)
plt.rc('font', family='serif', size=14)

# Data

In [None]:
# Directory layout used by the whole notebook.

# abspath() anchors the file name at the current working directory, so its
# dirname is the directory the notebook is executed from.
ROOT_DIR = os.path.dirname(os.path.abspath("top_level_file.txt"))
ARCH_DIR = os.path.join(ROOT_DIR, 'archive')
DATA_DIR = os.path.join(ARCH_DIR, 'data')
IMG_DIR = os.path.join(ROOT_DIR, 'latex/img')

# Train / test image folders, each with one sub-folder per class.
TRAIN_DIR, TEST_DIR = (os.path.join(DATA_DIR, split) for split in ('train', 'test'))
M_TRAIN, B_TRAIN = (os.path.join(TRAIN_DIR, label) for label in ('malignant', 'benign'))
M_TEST, B_TEST = (os.path.join(TEST_DIR, label) for label in ('malignant', 'benign'))

# Checkpoint location for the first model.
CP_DIR = os.path.join(ROOT_DIR, 'checkpoint')
M1 = os.path.join(CP_DIR, 'mone')

# Pickled training histories, one file per model.
HIST_DIR = os.path.join(ROOT_DIR, 'history')
M1HIST, M2HIST, M3HIST = (os.path.join(HIST_DIR, fname) for fname in ('m1.p', 'm2.p', 'm3.p'))

In [None]:
def read(imname):
    """Open the image file `imname` and return its pixels as an RGB numpy array."""
    # The context manager closes the file handle as soon as the pixels have
    # been converted, rather than leaving that to the garbage collector.
    with Image.open(imname) as img:
        return np.asarray(img.convert("RGB"))


def _load_image_dir(directory):
    """Read every image in `directory` whose name is all digits once '.jpg' is removed.

    Returns the images stacked into a single uint8 array.
    NOTE(review): assumes all images share one size, otherwise np.array cannot
    stack them into a regular array -- confirm against the dataset.
    """
    images = [read(os.path.join(directory, name))
              for name in os.listdir(directory)
              if name.replace(".jpg", "").isdigit()]
    return np.array(images, dtype='uint8')


# load in training pictures

X_train_b = _load_image_dir(B_TRAIN)
X_train_m = _load_image_dir(M_TRAIN)

# load in test pictures

X_test_b = _load_image_dir(B_TEST)
X_test_m = _load_image_dir(M_TEST)

print([arr.shape for arr in [X_train_b, X_train_m, X_test_b, X_test_m]])

# create labels: 0 for every benign image, 1 for every malignant image

Y_train_b, Y_train_m = np.zeros(len(X_train_b)), np.ones(len(X_train_m))
Y_test_b, Y_test_m = np.zeros(len(X_test_b)), np.ones(len(X_test_m))

print([arr.shape for arr in [Y_train_b, Y_train_m, Y_test_b, Y_test_m]])

# Merge data: benign rows first, malignant rows after, with images and labels
# stacked in the same order so that row k of X still matches row k of Y.

X_train = np.concatenate([X_train_b, X_train_m])
Y_train = np.concatenate([Y_train_b, Y_train_m])
X_test = np.concatenate([X_test_b, X_test_m])
Y_test = np.concatenate([Y_test_b, Y_test_m])

# Shuffle data
#
# The merged arrays hold all benign rows followed by all malignant rows.
# Keras' `validation_split` holds out the last fraction of the arrays, so the
# training data must be shuffled for the validation set to contain both classes.
# A fixed seed keeps that split (and the test order) identical between runs.
SHUFFLE_SEED = 42
rng = np.random.default_rng(SHUFFLE_SEED)

# The same permutation is applied to images and labels to keep them aligned.
train_order = rng.permutation(X_train.shape[0])
X_train = X_train[train_order]
Y_train = Y_train[train_order]

test_order = rng.permutation(X_test.shape[0])
X_test = X_test[test_order]
Y_test = Y_test[test_order]


In [None]:
# Draw a 2 x 5 grid of training images, each titled with its class, and save it.

rows, columns = 2, 5
fig = plt.figure(figsize=(15, 6))

for position in range(1, rows * columns + 1):
    ax = fig.add_subplot(rows, columns, position)
    # Subplot positions are 1-based and the same number indexes the sample,
    # so samples 1..10 of the shuffled training set are shown.
    ax.title.set_text('Benign' if Y_train[position] == 0 else 'Malignant')
    ax.imshow(X_train[position], interpolation='nearest')
    ax.axis('off')

fig.tight_layout()
plt.savefig(os.path.join(IMG_DIR, 'imgs.png'))
plt.show()

In [None]:
# One-hot encode the labels and scale the image pixels from [0, 255] to [0, 1].
#
# Both steps rebind the X_* / Y_* names, so each one is guarded: re-running
# this cell is then a no-op instead of encoding or scaling a second time.

# Labels are still a flat vector of 0/1 before encoding and 2-D afterwards.
if Y_train.ndim == 1:
    Y_train = to_categorical(Y_train, num_classes=2)
if Y_test.ndim == 1:
    Y_test = to_categorical(Y_test, num_classes=2)

# Images are uint8 until they have been scaled; the division yields floats.
if X_train.dtype == np.uint8:
    X_train = X_train / float(255)
if X_test.dtype == np.uint8:
    X_test = X_test / float(255)

In [None]:
# Model building function for the self-made CNN. Contains 9 layers (2 sets of conv, maxpooling and dropout)

def build_model(input_shape: tuple=(224, 224, 3),
                lr: float=1e-3,
                num_classes: int=2,
                init: str='normal',
                activation: str='relu',
                optimizer: str='adam') -> tf.keras.Sequential:
    """Build, summarise and compile the small two-stage CNN classifier.

    Architecture: two Conv2D(64, 3x3, 'same') -> MaxPool2D(2x2) -> Dropout(0.25)
    stages, followed by Flatten -> Dense(128, relu) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of a single input image passed to the first layer.
        lr: Learning rate used when `optimizer` is 'adam' or 'rmsprop'.
        num_classes: Number of units in the softmax output layer.
        init: Kernel initializer of the hidden Dense layer only; both
            convolutions always use 'glorot_uniform'.
        activation: Activation of the two convolution layers; the hidden
            Dense layer always uses 'relu'.
        optimizer: 'adam' or 'rmsprop' to get that optimizer configured with
            `lr`. Any other value is handed to `compile` unchanged, in which
            case `lr` is not applied.

    Returns:
        The compiled model. Its layer summary is printed as a side effect.
    """
    model: tf.keras.Sequential = tf.keras.Sequential()

    # Stage 1: only the first layer declares the input shape.
    model.add(Conv2D(64, kernel_size=(3, 3), padding='same', input_shape=input_shape,
                     activation=activation, kernel_initializer='glorot_uniform'))
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    # Stage 2: same configuration as stage 1.
    model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation=activation, kernel_initializer='glorot_uniform'))
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    # Classifier head.
    model.add(Flatten())
    model.add(Dense(128, activation='relu', kernel_initializer=init))
    model.add(Dense(num_classes, activation='softmax'))

    model.summary()

    # Instantiate the optimizer explicitly so that `lr` is honoured. Passing
    # the bare name 'adam' to compile() would silently use the optimizer's
    # default learning rate instead of the requested one.
    optimizer_classes = {'adam': Adam, 'rmsprop': RMSprop}
    if optimizer in optimizer_classes:
        optimizer = optimizer_classes[optimizer](learning_rate=lr)

    # NOTE(review): the loss is binary cross-entropy regardless of
    # `num_classes`; confirm this is intended if more than two classes are used.
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy', FalseNegatives(), FalsePositives(), TrueNegatives(), TruePositives()])

    return model

# Learning-rate plateau callback from the Keras library: watches val_accuracy
# with a patience of 5 epochs, scales the learning rate by 0.5 and never goes
# below 1e-7.

learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.5,
    patience=5,
    min_lr=1e-7,
    verbose=1,
)

def save_checkpoint(model_path):
    """Return a ModelCheckpoint callback that writes to `model_path`.

    Only the weights are saved, and only when the callback's monitored
    quantity (left at its default here) improves.
    """
    checkpoint_options = dict(
        filepath=model_path,
        verbose=0,
        save_best_only=True,
        save_weights_only=True,
    )
    return ModelCheckpoint(**checkpoint_options)

# Model #1

In [None]:
# Hyper-parameters and training run for the self-made CNN.
# Several of these module-level names (input_shape, lr, epochs, batch_size,
# model_path) are read again by the later model cells.

input_shape, lr = (224, 224, 3), 1e-5
init, activation, optimizer = 'normal', 'relu', 'adam'
epochs, batch_size = 30, 64
model_path = M1

model = build_model(input_shape=input_shape,
                    lr=lr,
                    init=init,
                    activation=activation,
                    optimizer=optimizer)

# 20% of the training arrays are held out for validation by Keras itself.
training_callbacks = [learning_rate_reduction, save_checkpoint(model_path)]
history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2,
                    callbacks=training_callbacks,
                    verbose=1)


In [None]:
# storing history

# Create the target folder first: a missing directory would otherwise raise
# here and lose the metrics of a training run that has already finished.
os.makedirs(os.path.dirname(M1HIST), exist_ok=True)

with open(M1HIST, 'wb') as f:
    pk.dump(history.history, f)

In [None]:
# Graph training vs. validation accuracy per epoch for Model 1.

# NOTE: unpickling can execute arbitrary code -- only load history files that
# were written by this notebook.
with open(M1HIST, 'rb') as f:
    history = pk.load(f)

fig, ax = plt.subplots(figsize=(12, 8))
# One x position per recorded epoch, so the plot stays valid when the number
# of epochs changes.
x_arr = range(len(history['accuracy']))

ax.plot(x_arr, history['val_accuracy'], color='green', marker='.', ls=':', label='Validation')
ax.plot(x_arr, history['accuracy'], color='blue', marker='.', ls=':', label='Training')

# The curves are accuracies, so label the axis accordingly.
ax.set(xlabel='Epochs', ylabel='Accuracy', title=r'\textbf{Training \& Validation Metrics (Model 1)}')
ax.grid(ls=':', alpha=0.5)
ax.legend()
plt.savefig(IMG_DIR + '/1.png')

In [None]:
# Confusion counts per epoch for Model 1: training on the left, validation on the right.

fn, fp = history['false_negatives'], history['false_positives']
tn, tp = history['true_negatives'], history['true_positives']

vfn, vfp = history['val_false_negatives'], history['val_false_positives']
vtn, vtp = history['val_true_negatives'], history['val_true_positives']

# (colour, marker, legend label) for the four series, in drawing order.
line_styles = [
    ('red', 'x', 'False negative'),
    ('red', '.', 'False positive'),
    ('green', 'x', 'True negative'),
    ('green', '.', 'True positive'),
]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
panels = [
    (ax1, 'Training', (fn, fp, tn, tp)),
    (ax2, 'Validation', (vfn, vfp, vtn, vtp)),
]

for ax, panel_title, panel_series in panels:
    for values, (color, marker, label) in zip(panel_series, line_styles):
        ax.plot(x_arr, values, color=color, marker=marker, ls=':', label=label)
    ax.set_title(panel_title)
    ax.set(xlabel='Epochs', ylabel='Count')
    ax.grid(ls=':', alpha=0.5)
    ax.legend()

fig.suptitle(r'\textbf{Confusion Counts}')
fig.tight_layout()
plt.savefig(IMG_DIR + '/2.png')

In [None]:
# Prediction accuracy on test dataset
# Run the current `model` on the test images; the predictions are scored
# against Y_test in the next cell.

Y_pred = model.predict(X_test)

In [None]:
# argmax over the class axis turns one-hot labels / class scores into class indices.
accuracy_score(Y_test.argmax(axis=1), Y_pred.argmax(axis=1))

# Model #2 - ResNet50

In [None]:
# building resnet50 (no pre-trained weights, two output classes)

model = ResNet50(include_top=True,
                 weights=None,
                 input_tensor=None,
                 input_shape=input_shape,
                 pooling='avg',
                 classes=2)

model.compile(optimizer=Adam(learning_rate=lr),
              loss='binary_crossentropy',
              metrics=['accuracy', FalseNegatives(), FalsePositives(), TrueNegatives(), TruePositives()])

# Give this model its own checkpoint file. Reusing `model_path` (Model 1's
# path) would overwrite the best weights saved for the first model.
M2 = os.path.join(CP_DIR, 'mtwo')

history = model.fit(X_train, Y_train,
                    validation_split=0.2,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=1,
                    callbacks=[learning_rate_reduction, save_checkpoint(M2)])

# Make sure the history folder exists so a finished run is never lost on save.
os.makedirs(os.path.dirname(M2HIST), exist_ok=True)

with open(M2HIST, 'wb') as f:
    pk.dump(history.history, f)

In [None]:
# Graph training vs. validation accuracy per epoch for the ResNet50 model.

# NOTE: unpickling can execute arbitrary code -- only load history files that
# were written by this notebook.
with open(M2HIST, 'rb') as f:
    history = pk.load(f)

fig, ax = plt.subplots(figsize=(12, 8))
# One x position per recorded epoch, so the plot stays valid when the number
# of epochs changes.
x_arr = range(len(history['accuracy']))

ax.plot(x_arr, history['val_accuracy'], color='green', marker='.', ls=':', label='Validation')
ax.plot(x_arr, history['accuracy'], color='blue', marker='.', ls=':', label='Training')

# The curves are accuracies, so label the axis accordingly.
ax.set(xlabel='Epochs', ylabel='Accuracy', title=r'\textbf{Training \& Validation Metrics (ResNet)}')
ax.grid(ls=':', alpha=0.5)
ax.legend()
plt.savefig(IMG_DIR + '/3.png')

In [None]:
# Confusion plots

# Keras appends a counter to repeated metric names (e.g. 'false_negatives_5'),
# and that counter depends on how many metric objects the session has already
# created. Resolve every history key by its base name instead of hard-coding
# one particular suffix.
confusion = {}
for wanted in ('false_negatives', 'false_positives', 'true_negatives', 'true_positives',
               'val_false_negatives', 'val_false_positives', 'val_true_negatives', 'val_true_positives'):
    hits = [key for key in history if key == wanted or key.startswith(wanted + '_')]
    if len(hits) != 1:
        raise KeyError(f'expected exactly one history entry for {wanted!r}, found {hits}')
    confusion[wanted] = history[hits[0]]

fn, fp = confusion['false_negatives'], confusion['false_positives']
tn, tp = confusion['true_negatives'], confusion['true_positives']

vfn, vfp = confusion['val_false_negatives'], confusion['val_false_positives']
vtn, vtp = confusion['val_true_negatives'], confusion['val_true_positives']

# One x position per recorded epoch.
x_arr = range(len(fn))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))

# False / true positives are drawn as empty, legend-only series.
# NOTE(review): presumably because, with two-column one-hot labels, they
# coincide with the false / true negative curves -- confirm.
ax1.plot(x_arr, fn, color='red', marker='x', ls=':', label='False negative')
ax1.plot([], [], color='red', marker='.', ls=':', label='False positive')
ax1.plot(x_arr, tn, color='green', marker='x', ls=':', label='True negative')
ax1.plot([], [], color='green', marker='.', ls=':', label='True positive')
ax1.set_title('Training')

ax2.plot(x_arr, vfn, color='red', marker='x', ls=':', label='False negative')
ax2.plot([], [], color='red', marker='.', ls=':', label='False positive')
ax2.plot(x_arr, vtn, color='green', marker='x', ls=':', label='True negative')
ax2.plot([], [], color='green', marker='.', ls=':', label='True positive')
ax2.set_title('Validation')

for ax in [ax1, ax2]:
    ax.set(xlabel='Epochs', ylabel='Count')
    ax.grid(ls=':', alpha=0.5)
    ax.legend()

fig.suptitle(r'\textbf{Confusion Counts}')
fig.tight_layout()

plt.savefig(IMG_DIR + '/4.png')

In [None]:
# Prediction accuracy on test dataset
# Run the current `model` on the test images; the predictions are scored
# against Y_test in the next cell.

Y_pred = model.predict(X_test)

In [None]:
# argmax over the class axis turns one-hot labels / class scores into class indices.
accuracy_score(Y_test.argmax(axis=1), Y_pred.argmax(axis=1))

# Model #3 - VGG19

In [None]:
# building VGG19 (no pre-trained weights, two output classes)

model = VGG19(include_top=True,
              weights=None,
              input_tensor=None,
              input_shape=input_shape,
              pooling='avg',
              classes=2)

model.compile(optimizer=Adam(learning_rate=lr),
              loss='binary_crossentropy',
              metrics=['accuracy', FalseNegatives(), FalsePositives(), TrueNegatives(), TruePositives()])

# Give this model its own checkpoint file. Reusing `model_path` (Model 1's
# path) would overwrite the best weights saved for the first model.
M3 = os.path.join(CP_DIR, 'mthree')

history = model.fit(X_train, Y_train,
                    validation_split=0.2,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=1,
                    callbacks=[learning_rate_reduction, save_checkpoint(M3)])

# Make sure the history folder exists so a finished run is never lost on save.
os.makedirs(os.path.dirname(M3HIST), exist_ok=True)

with open(M3HIST, 'wb') as f:
    pk.dump(history.history, f)

In [None]:
# plotting accuracy for the VGG19 model

# Load the VGG19 history (M3HIST); loading M2HIST here would plot the ResNet
# run under the VGG title.
# NOTE: unpickling can execute arbitrary code -- only load history files that
# were written by this notebook.
with open(M3HIST, 'rb') as f:
    history = pk.load(f)

fig, ax = plt.subplots(figsize=(12, 8))
# One x position per recorded epoch, so the plot stays valid when the number
# of epochs changes.
x_arr = range(len(history['accuracy']))

ax.plot(x_arr, history['val_accuracy'], color='green', marker='.', ls=':', label='Validation')
ax.plot(x_arr, history['accuracy'], color='blue', marker='.', ls=':', label='Training')

# The curves are accuracies, so label the axis accordingly.
ax.set(xlabel='Epochs', ylabel='Accuracy', title=r'\textbf{Training \& Validation Metrics (VGG)}')
ax.grid(ls=':', alpha=0.5)
ax.legend()
plt.savefig(IMG_DIR + '/5.png')

In [None]:
# Confusion plots

# Keras appends a counter to repeated metric names (e.g. 'false_negatives_5'),
# and that counter depends on how many metric objects the session has already
# created. Resolve every history key by its base name instead of hard-coding
# one particular suffix.
confusion = {}
for wanted in ('false_negatives', 'false_positives', 'true_negatives', 'true_positives',
               'val_false_negatives', 'val_false_positives', 'val_true_negatives', 'val_true_positives'):
    hits = [key for key in history if key == wanted or key.startswith(wanted + '_')]
    if len(hits) != 1:
        raise KeyError(f'expected exactly one history entry for {wanted!r}, found {hits}')
    confusion[wanted] = history[hits[0]]

fn, fp = confusion['false_negatives'], confusion['false_positives']
tn, tp = confusion['true_negatives'], confusion['true_positives']

vfn, vfp = confusion['val_false_negatives'], confusion['val_false_positives']
vtn, vtp = confusion['val_true_negatives'], confusion['val_true_positives']

# One x position per recorded epoch.
x_arr = range(len(fn))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))

# False / true positives are drawn as empty, legend-only series.
# NOTE(review): presumably because, with two-column one-hot labels, they
# coincide with the false / true negative curves -- confirm.
ax1.plot(x_arr, fn, color='red', marker='x', ls=':', label='False negative')
ax1.plot([], [], color='red', marker='.', ls=':', label='False positive')
ax1.plot(x_arr, tn, color='green', marker='x', ls=':', label='True negative')
ax1.plot([], [], color='green', marker='.', ls=':', label='True positive')
ax1.set_title('Training')

ax2.plot(x_arr, vfn, color='red', marker='x', ls=':', label='False negative')
ax2.plot([], [], color='red', marker='.', ls=':', label='False positive')
ax2.plot(x_arr, vtn, color='green', marker='x', ls=':', label='True negative')
ax2.plot([], [], color='green', marker='.', ls=':', label='True positive')
ax2.set_title('Validation')

for ax in [ax1, ax2]:
    ax.set(xlabel='Epochs', ylabel='Count')
    ax.grid(ls=':', alpha=0.5)
    ax.legend()

fig.suptitle(r'\textbf{Confusion Counts}')
fig.tight_layout()

plt.savefig(IMG_DIR + '/6.png')

In [None]:
# Prediction accuracy on test dataset
# Run the current `model` on the test images; the predictions are scored
# against Y_test in the next cell.

Y_pred = model.predict(X_test)

In [None]:
# argmax over the class axis turns one-hot labels / class scores into class indices.
accuracy_score(Y_test.argmax(axis=1), Y_pred.argmax(axis=1))