# Breast Cancer Classification

---

Last Updated: 20/12/2022

## Retrieving dataset

The dataset used is Breast Histopathology Images data from Kaggle (https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images). It contains 277,524 patches of size 50 x 50 extracted from 162 whole mount slide images of Breast Cancer (BCa) specimens scanned at 40x. There are 198,738 IDC negative and 78,786 IDC positive patches.

In [None]:
# Install the kaggle command-line client and upload the kaggle API token.
# NOTE: `files` is imported from google.colab, so this cell can only run inside Google Colab.
!pip install kaggle
from google.colab import files
uploaded = files.upload() # upload the kaggle json API token (the next cell expects it to be named kaggle.json)

In [None]:
# download the dataset from kaggle
# step 1: move the uploaded kaggle.json into ~/.kaggle/ and restrict it to owner read/write (chmod 600)
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
# step 2: fetch the dataset archive with the kaggle command-line client
!kaggle datasets download -d paultimothymooney/breast-histopathology-images

In [None]:
# unzip the downloaded archive into the "original" folder
# (the same folder name that ORIGINAL_DATASET_PATH is set to in the configuration cell)
!unzip breast-histopathology-images.zip -d original

## Imports

In [None]:
# Project structure organization
import os
import random 
import shutil 

# Preprocessing
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator 
from tensorflow.keras.utils import to_categorical 

# Training
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Activation, MaxPooling2D, BatchNormalization, Dropout, Flatten, Dense # model layers
from tensorflow.keras.callbacks import EarlyStopping # callbacks
from tensorflow.keras.optimizers import Adagrad # optimizer

# Results
import matplotlib.pyplot as plt
import numpy as np 
from sklearn.metrics import classification_report, confusion_matrix

---
## Configuration

In [None]:
# Folder layout: the extracted archive, and the re-organised copy with one
# sub-folder per split
ORIGINAL_DATASET_PATH = "original"
BASE_PATH = "Dataset"
TRAIN_PATH, VALIDATION_PATH, TEST_PATH = (
    os.path.join(BASE_PATH, split_dir)
    for split_dir in ("train", "validation", "test")
)

# Split fractions: TRAIN_SPLIT is the share of all images kept for training,
# VALIDATION_SPLIT is the share of that training portion held out for validation
TRAIN_SPLIT = 0.8
VALIDATION_SPLIT = 0.1

# Training hyper-parameters
EPOCHS, BATCH_SIZE, VERBOSE = 20, 64, 1
LEARNING_RATE = 0.01

# Early stopping: number of epochs without improvement that are tolerated
PATIENCE = 10

---
## Setting up the project directory

As the dataset is too large to load into memory, the dataset will be batched and loaded from disk. As such, there is a need to organise the project directory.

In [None]:
def list_images(file_path):
    '''
    Returns the paths of all PNG images found beneath a folder.

    file_path: path to a folder; the folder itself and all of its subfolders
               are searched
    returns:   sorted list of paths (each one starts with file_path) to the
               files whose name ends in ".png", compared case-insensitively;
               an empty list when nothing matches or the folder does not exist
    valid image types: png
    '''

    # Match on the file extension rather than on a substring, so that names
    # such as "png_notes.txt" are not mistaken for images.
    VALID_IMAGE_EXTENSIONS = (".png",)

    images = []

    # walk through the folder and every subfolder
    for root, _dirs, files in os.walk(file_path, topdown=True):
        images.extend(
            os.path.join(root, file_)
            for file_ in files
            if file_.lower().endswith(VALID_IMAGE_EXTENSIONS)
        )

    # os.walk yields entries in an order that depends on the file system; sorting
    # gives callers that shuffle with a fixed seed the same result on every machine.
    return sorted(images)

In [None]:
# collect the path of every image beneath the extracted dataset folder
image_paths = list_images(ORIGINAL_DATASET_PATH)

# shuffle in place with a fixed seed so the split below can be repeated
random.seed(42)
random.shuffle(image_paths)

# first cut: the leading TRAIN_SPLIT share is used for training, the rest for testing
train_cutoff = int(len(image_paths) * TRAIN_SPLIT)
train_image_paths, test_image_paths = image_paths[:train_cutoff], image_paths[train_cutoff:]

# second cut: the leading VALIDATION_SPLIT share of the training portion becomes
# the validation set and is removed from the training set
validation_cutoff = int(len(train_image_paths) * VALIDATION_SPLIT)
validation_image_paths = train_image_paths[:validation_cutoff]
train_image_paths = train_image_paths[validation_cutoff:]

In [None]:
# Copy every image to <split folder>/<label>/<file name>, giving each split
# one sub-folder per class.
os.makedirs(BASE_PATH, exist_ok=True)

dataset = {TRAIN_PATH: train_image_paths, VALIDATION_PATH: validation_image_paths, TEST_PATH: test_image_paths}

# The loop variables are deliberately NOT called `image_paths`, so the list of
# all image paths built earlier is not overwritten.
for split_path, split_image_paths in dataset.items():
    # create the split folder once, even when the split holds no images
    os.makedirs(split_path, exist_ok=True)

    # label folders already created for this split (avoids a file-system call per image)
    created_label_dirs = set()

    for image_path in split_image_paths:
        # extract the name and the label; the label is the last character of the
        # file name before its extension
        name = os.path.basename(image_path)
        label = os.path.splitext(name)[0][-1]

        # create the folder that stores the images of this label
        label_dir = os.path.join(split_path, label)
        if label_dir not in created_label_dirs:
            os.makedirs(label_dir, exist_ok=True)
            created_label_dirs.add(label_dir)

        # copy the image
        shutil.copy(image_path, os.path.join(label_dir, name))

In [None]:
# report how many image paths were assigned to each split
train_count = len(train_image_paths)
validation_count = len(validation_image_paths)
test_count = len(test_image_paths)

print(f"[INFO] Number of train images \t: {train_count}")
print(f"[INFO] Number of validation images \t: {validation_count}")
print(f"[INFO] Number of test images \t: {test_count}")

---

## Preprocessing

Create the image datasets for the model.

In [None]:
# every generator rescales the pixel values by the same factor
pixel_scale = 1 / 255.0

# random transformations that are applied to the training images only
train_augmentations = {
    "rotation_range": 20,
    "zoom_range": 0.05,
    "width_shift_range": 0.1,
    "height_shift_range": 0.1,
    "shear_range": 0.05,
    "horizontal_flip": True,
    "vertical_flip": True,
    "fill_mode": "nearest",
}

# training data augmentation object: rescaling plus the random transformations
trainAug = ImageDataGenerator(rescale=pixel_scale, **train_augmentations)

# validation (and testing) data augmentation object: rescaling only
valAug = ImageDataGenerator(rescale=pixel_scale)

In [None]:
# options shared by the three generators: one-hot labels, 48 x 48 RGB images,
# BATCH_SIZE images per batch
flow_options = dict(
    class_mode="categorical",
    target_size=(48, 48),
    color_mode="rgb",
    batch_size=BATCH_SIZE,
)

# training generator: augmented images, reshuffled
trainGen = trainAug.flow_from_directory(TRAIN_PATH, shuffle=True, **flow_options)

# validation generator: rescaled images only, kept in directory order
valGen = valAug.flow_from_directory(VALIDATION_PATH, shuffle=False, **flow_options)

# testing generator: rescaled images only, kept in directory order
testGen = valAug.flow_from_directory(TEST_PATH, shuffle=False, **flow_options)

Compute the class weights to address the class imbalance.

In [None]:
# the label of a training image is the name of the folder that directly
# contains it in the original dataset; one-hot encode the labels
train_labels = to_categorical(
    [int(path.split(os.path.sep)[-2]) for path in train_image_paths]
)

# number of training images per class
class_totals = train_labels.sum(axis=0)

# weight of a class = size of the largest class / size of this class
largest_class_total = class_totals.max()
class_weight = {
    class_index: largest_class_total / class_total
    for class_index, class_total in enumerate(class_totals)
}

---

## Model Creation and Testing

Train and Test the models.

In [None]:
def build(width, height, depth, classes):
    '''
    Builds and compiles the convolutional classifier.

    width, height, depth: size of the input images; the model input shape is
                          (height, width, depth)
    classes: number of units in the final softmax layer
    returns: a compiled Sequential model (Adagrad optimizer, categorical
             cross-entropy loss, accuracy metric); the optimizer settings are
             read from the module-level LEARNING_RATE and EPOCHS
    '''
    model = Sequential()

    # Each stage is (CONV => RELU => BN) * repeats -> MAXPOOLING2D -> DROPOUT,
    # described as (number of filters, repeats).
    stages = ((32, 1), (64, 2), (128, 3))

    is_first_conv = True
    for filters, repeats in stages:
        for _ in range(repeats):
            conv_options = {"padding": "same", "activation": "relu"}
            if is_first_conv:
                # only the very first layer declares the input shape
                conv_options["input_shape"] = (height, width, depth)
                is_first_conv = False
            model.add(Conv2D(filters, (3, 3), **conv_options))
            model.add(BatchNormalization(axis=-1))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))

    # FC
    model.add(Flatten())

    # DENSE -> RELU -> BN -> DROPOUT
    model.add(Dense(256, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # DENSE -> SOFTMAX
    model.add(Dense(classes))
    model.add(Activation("softmax"))

    # compile model
    # NOTE(review): the `decay` keyword may be rejected by newer TensorFlow
    # optimizer implementations - confirm against the installed version.
    model.compile(
        optimizer=Adagrad(learning_rate=LEARNING_RATE, decay=LEARNING_RATE / EPOCHS),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [None]:
# early stopping: watch the validation accuracy, tolerate PATIENCE epochs and
# restore the best weights when training stops
es = EarlyStopping(
    monitor="val_accuracy",
    patience=PATIENCE,
    restore_best_weights=True,
)

In [None]:
# training the model
model = build(48, 48, 3, 2)

# `batch_size` is intentionally not passed to fit(): trainGen and valGen already
# yield batches of BATCH_SIZE images, and Keras documents that batch_size must
# not be specified when the data comes from a generator.
H = model.fit(
    trainGen,
    steps_per_epoch=len(train_image_paths) // BATCH_SIZE,
    validation_data=valGen,
    validation_steps=len(validation_image_paths) // BATCH_SIZE,
    epochs=EPOCHS,
    verbose=VERBOSE,
    callbacks=[es],
    class_weight=class_weight,
)

In [None]:
# test the model
testGen.reset()

# len(testGen) is the number of batches needed to cover the test images exactly
# once. Computing it as (n // BATCH_SIZE) + 1 asks for one batch too many
# whenever n is an exact multiple of BATCH_SIZE, which would make `predictions`
# longer than testGen.classes.
predictions = model.predict(testGen, steps=len(testGen)).argmax(axis=1)

# class names ordered by their class index, passed as a plain list
target_names = sorted(testGen.class_indices, key=testGen.class_indices.get)
print(classification_report(testGen.classes, predictions, target_names=target_names))

In [None]:
# compute the confusion matrix and calculate accuracy, sensitivity, and specificity
# labels=[0, 1] fixes the matrix to 2 x 2 with row/column 0 = class 0 and
# row/column 1 = class 1, even if one class is absent from the predictions
cm = confusion_matrix(testGen.classes, predictions, labels=[0, 1])

# assumes class index 1 is the positive (IDC positive) class and class index 0
# the negative class - TODO confirm against testGen.class_indices
tn, fp, fn, tp = cm.ravel()

total = cm.sum()
acc = (tp + tn) / total
sensitivity = tp / (tp + fn)  # share of positive images that were found
specificity = tn / (tn + fp)  # share of negative images that were recognised as negative

print(cm)
print("acc: {:.4f}".format(acc))
print("sensitivity: {:.4f}".format(sensitivity))
print("specificity: {:.4f}".format(specificity))

In [None]:
# plot the learning curve
# fit() returns a History object; the per-epoch values live in its `history`
# dictionary, where the validation metrics carry the prefix "val_"
history = H.history

# number of epochs that actually ran (can be fewer than EPOCHS with early stopping)
N = len(history['val_accuracy'])
epochs_axis = np.arange(N)

plt.figure()
plt.plot(epochs_axis, history['loss'], label='loss')
plt.plot(epochs_axis, history['accuracy'], label='accuracy')
plt.plot(epochs_axis, history['val_loss'], label='validation loss')
plt.plot(epochs_axis, history['val_accuracy'], label='validation accuracy')
plt.title("Learning Curve")
plt.xlabel("Epoch Number")
plt.ylabel("Loss/Accuracy")
plt.legend(loc='lower left')
plt.show()