## COMS4030A/COMS7047A- ADVANCED COMPUTATION AND MACHINE LEARNING
## Brain Tumor Detection using a CNN
Tumi Jourdan ~ 2180153, 
Luca von Mayer ~ 2427051, 
Mohammad Zaid Moonsamy ~ 2433079, 
Shakeel Malagas ~ 2424161


### Importing all necessary libraries

In [None]:
import math
import os
import umap
import cv2
import imutils
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, cohen_kappa_score
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.optimizers import Adam, SGD, Adagrad, RMSprop


### Raw data Insights

In [None]:
# Setting the class directories (one folder per tumor type)
meningioma_dir = 'dataset/1'
glioma_dir = 'dataset/2'
pituitary_dir = 'dataset/3'

# File extensions that count as images. str.endswith accepts a tuple, so one
# call replaces the six chained `or` checks that were repeated per class.
IMAGE_EXTENSIONS = ('.jpg', '.JPG', '.png', '.PNG', '.JPEG', '.jpeg')


def list_image_paths(directory):
    """Return the paths of the image files found directly inside `directory`.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A list of `os.path.join(directory, name)` paths, in `os.listdir`
        order, for every entry whose name ends with one of IMAGE_EXTENSIONS.
    """
    return [
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(IMAGE_EXTENSIONS)
    ]


meningioma_images = list_image_paths(meningioma_dir)
glioma_images = list_image_paths(glioma_dir)
pituitary_images = list_image_paths(pituitary_dir)

# combining the image paths of the three tumor classes into one array
data = np.concatenate([meningioma_images, glioma_images, pituitary_images])

# printing some informative dataset information
print("The dataset contains ", len(meningioma_images), "meningioma tumor images")
print("The dataset contains ", len(glioma_images), "glioma tumor images")
print("The dataset contains ", len(pituitary_images), "pituitary tumor images")
print("The merged dataset contains ", len(data), "image samples")
print("The data list contains the image samples as files: ", data[0:5], "&", data[-5:])


### Data Visualisation

In [None]:
# Showing the first 10 image samples of every tumor class, one 2x5 figure per class
sample_groups = [
    ("The meningioma tumor image samples:", meningioma_images),
    ("The glioma tumor image samples:", glioma_images),
    ("The pituitary tumor image samples:", pituitary_images),
]
for heading, image_paths in sample_groups:
    print(heading)
    plt.figure(figsize=(6, 3))
    for position, path in enumerate(image_paths[:10], start=1):
        picture = Image.open(path)
        plt.subplot(2, 5, position)
        plt.imshow(picture)
        # The file name doubles as the subplot title
        plt.title(os.path.basename(path))
        plt.axis('off')

    plt.tight_layout()
    plt.show()

# Bar chart of the number of images per class, to check how balanced the dataset is
class_sizes = (len(meningioma_images), len(glioma_images), len(pituitary_images))
bar_colours = ('green', 'red', 'blue')
bar_labels = ('Value 1', 'Value 2', 'Value 3')

plt.figure(figsize=(6, 4))
for x_position, (size, colour, bar_label) in enumerate(zip(class_sizes, bar_colours, bar_labels)):
    plt.bar(x_position, size, color=colour, label=bar_label)
plt.xlabel('Image dataset')
plt.ylabel('Size')
plt.title('Data Balance / Imbalance Check')
plt.xticks([0, 1, 2], ['Meningioma Tumor', 'Glioma Tumor', 'Pituitary Tumor'])
plt.tight_layout()
plt.show()

### Data Preprocessing

##### Image Cropping Function and Example

In [None]:
# The cropping function
def crop_brain_image(image):
    """Crop an image to the extent of its largest bright region.

    The image is converted to grayscale, blurred and binarised (pixels above
    45 become 255). The largest external contour of the binary mask is then
    used to derive the crop window from its left-, right-, top- and
    bottom-most points.

    Args:
        image: Image array that `cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)` can
            convert, e.g. the result of `cv2.imread`.

    Returns:
        The cropped region, obtained by slicing `image`. If no contour is
        found, or the contour has zero width or height so the crop would be
        empty, `image` is returned unchanged.
    """
    # Converting the image to grayscale and blurring it
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5, 5), 0)

    # Binarising, then eroding and dilating the mask
    # (presumably to remove small specks before contour detection)
    threshold = cv2.threshold(gray, 45, 255, cv2.THRESH_BINARY)[1]
    threshold = cv2.erode(threshold, None, iterations=2)
    threshold = cv2.dilate(threshold, None, iterations=2)

    # Finding contours in the thresholded image and then choosing the largest
    contours = cv2.findContours(threshold.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = imutils.grab_contours(contours)
    if len(contours) == 0:
        # Nothing brighter than the threshold survived: max() would raise
        # ValueError on an empty sequence, so keep the image as it is.
        return image
    c = max(contours, key=cv2.contourArea)

    # Each contour point is stored as [[x, y]]; pick the four extreme points
    extreme_left = tuple(c[c[:, :, 0].argmin()][0])
    extreme_right = tuple(c[c[:, :, 0].argmax()][0])
    extreme_top = tuple(c[c[:, :, 1].argmin()][0])
    extreme_bottom = tuple(c[c[:, :, 1].argmax()][0])

    # Rows are indexed by y and columns by x; the slice ends are exclusive
    new_image = image[extreme_top[1]:extreme_bottom[1], extreme_left[0]:extreme_right[0]]

    if new_image.size == 0:
        # A contour one pixel wide or tall gives an empty slice, which cannot
        # be resized later; fall back to the uncropped image.
        return image

    return new_image

# Running the cropping function on one example image from the meningioma folder
prior_image = cv2.imread('./dataset/1/2307.png')
cropped_image = crop_brain_image(prior_image)

# Showing the original and the cropped image side by side, with every tick
# mark and tick label switched off on both subplots
hidden_ticks = dict(
    axis='both', which='both',
    top=False, bottom=False, left=False, right=False,
    labelbottom=False, labeltop=False, labelleft=False, labelright=False,
)
comparison = [(prior_image, 'Original Image'), (cropped_image, 'Cropped Image')]

plt.figure()
for slot, (picture, caption) in enumerate(comparison, start=1):
    plt.subplot(1, 2, slot)
    plt.imshow(picture)
    plt.tick_params(**hidden_ticks)
    plt.title(caption)
plt.show()


##### Image Cropping, Resizing and Normalisation

In [None]:
# The preprocessing function that performs cropping, resizing and normalisation on all images
def preprocess_images(dir):
    """Load, crop, resize and normalise every readable image file in a folder.

    Args:
        dir: Folder whose regular files are processed (sub-folders are
            ignored). NOTE: the name shadows the builtin `dir`; it is kept so
            existing callers are unaffected.

    Returns:
        A list of 224x224 image arrays with pixel values divided by 255.0,
        one per file that OpenCV could decode, in `os.listdir` order.
    """
    final_images = []
    for file in os.listdir(dir):
        img_path = os.path.join(dir, file)

        if not os.path.isfile(img_path):
            continue

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an undecodable file (hidden OS files, notes,
            # corrupt images) by returning None instead of raising; passing
            # None on would crash inside crop_brain_image.
            print("Skipping file that could not be read as an image:", img_path)
            continue

        # Cropping the images using the cropping function above
        cropped_img = crop_brain_image(img)

        # Resizing the image using cv2
        resized_img = cv2.resize(cropped_img, (224, 224))

        # Normalising the resized image to the [0, 1] range
        normalized_img = resized_img / 255.0

        final_images.append(normalized_img)
    return final_images

# Function to display the images
def show_images(images, title):
    """Plot at most the first five entries of `images` in one row.

    Args:
        images: Sequence of image arrays accepted by `plt.imshow`.
        title: Text shown as the figure's super-title.
    """
    shown_count = min(5, len(images))

    plt.figure(figsize=(10, 5))
    for column, picture in enumerate(images[:shown_count], start=1):
        # The grid always has five columns, even when fewer images are shown
        plt.subplot(1, 5, column)
        plt.imshow(picture)
        plt.axis('off')

    plt.suptitle(title, y=0.7)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()


# Preprocessing the three class folders
meningioma = preprocess_images('./dataset/1')
glioma = preprocess_images('./dataset/2')
pituitary = preprocess_images('./dataset/3')

# visualising the images after the preprocessing
figure_titles = (
    (meningioma, 'Meningioma Tumor Images'),
    (glioma, 'Glioma Tumor Images'),
    (pituitary, 'Pituitary Tumor Images'),
)
for class_images, figure_title in figure_titles:
    show_images(class_images, figure_title)

# Printing the shape of images after the resizing and normalisation
shape_reports = (
    ("Shape of first 5 meningioma images:", meningioma),
    ("Shape of first 5 glioma images:", glioma),
    ("Shape of first 5 pituitary images:", pituitary),
)
for message, class_images in shape_reports:
    print(message, [picture.shape for picture in class_images[:5]])



##### Target Appending, Data Splitting and Input Batches for the Model

In [None]:
# Filling the numpy targets: 0 = meningioma, 1 = glioma, 2 = pituitary
meningioma_targets = np.zeros(len(meningioma))
glioma_targets = np.ones(len(glioma))
pituitary_targets = np.full(len(pituitary), 2)

# the data and targets finalisation - converting to numpy arrays
data = meningioma + glioma + pituitary
targets = np.concatenate((meningioma_targets, glioma_targets, pituitary_targets), axis=0)
X = np.array(data)
Y = np.array(targets)

# Splitting the training and testing data FIRST (shuffled, and stratified so both
# parts keep the class proportions). Oversampling before this split would put
# copies of the same image in both the training and the test set, which makes the
# test scores look better than they are.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, shuffle=True, stratify=Y
)

# Holding out the first 10% of the (already shuffled) training portion for
# validation, again before oversampling. The two slices are disjoint, so no
# validation sample is also used for training.
validation_size = int(0.1 * len(X_train))
X_val, Y_val = X_train[:validation_size], Y_train[:validation_size]
X_train, Y_train = X_train[validation_size:], Y_train[validation_size:]

# Random oversampling (duplicating minority-class samples) of the training data only.
# The oversampler works on 2-D input, so images are flattened and reshaped back.
oversampler = RandomOverSampler(random_state=42)
X_resampled, Y_resampled = oversampler.fit_resample(X_train.reshape(X_train.shape[0], -1), Y_train)
X_resampled = X_resampled.reshape(-1, 224, 224, 3)

# Shuffling once more: the duplicated samples are not guaranteed to be interleaved
# with the originals, and the tf.data pipeline below feeds them in array order.
shuffled_order = np.random.default_rng(42).permutation(len(X_resampled))
X_train, Y_train = X_resampled[shuffled_order], Y_resampled[shuffled_order]

# Plotting the graph displaying the balance of different types of tumors in the resampled training set
unique, counts = np.unique(Y_train, return_counts=True)
plt.figure(figsize=(6, 4))
plt.bar(range(len(unique)), counts, color=['green', 'red', 'blue'], label=['Meningioma', 'Glioma', 'Pituitary'])
plt.xlabel('Tumor Type')
plt.ylabel('Size')
plt.title('Resampled Data Balance')
plt.xticks(range(len(unique)), ['Meningioma', 'Glioma', 'Pituitary'])
plt.legend()
plt.tight_layout()
plt.show()

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_train:", Y_train.shape)
print("Shape of Y_test:", Y_test.shape)

# Wrapping the three disjoint splits as tf.data datasets
train = tf.data.Dataset.from_tensor_slices((X_train, Y_train))
val = tf.data.Dataset.from_tensor_slices((X_val, Y_val))
test = tf.data.Dataset.from_tensor_slices((X_test, Y_test))

print("Train dataset size:", len(X_train))
print("Validation dataset size:", len(X_val))
print("Test dataset size:", len(X_test))

# Input will be fed in batches of 25
BATCH_SIZE = 25
train = train.batch(BATCH_SIZE)
test = test.batch(BATCH_SIZE)
val = val.batch(BATCH_SIZE)

# Pulling one batch to inspect it (shown as the cell output in a notebook)
train.as_numpy_iterator().next()

### CNN Model Training

In [None]:
print(X_train.shape)

# Sequential CNN: three convolution + max-pooling stages (32, 64 and 128 filters),
# then flattening, a 256-unit dense layer with 50% dropout, and a 3-way softmax output
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3), padding='valid'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu', padding='valid'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu', padding='valid'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])

# printing the summary model architecture
model.summary()

# Adam optimizer with explicit learning rate and moment-decay settings
optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.99)

# Sparse categorical cross entropy is used because the targets are integer class
# ids (0, 1, 2) rather than one-hot vectors
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# training for 30 epochs on the batched datasets
# NOTE(review): Keras documents `shuffle` as ignored for tf.data.Dataset input - confirm
training_results = model.fit(train, validation_data=val, epochs=30, verbose=1, shuffle=True)

### Model Testing and Evaluation of Results

In [None]:
class_names = ['Meningioma', 'Glioma', 'Pituitary']

# Evaluating the model on the test data
test_loss, test_accuracy = model.evaluate(test)
print("Test accuracy:", test_accuracy)

# Making predictions on the test set; argmax turns the per-class softmax
# outputs into predicted class ids
y_prediction = model.predict(test)
y_prediction_classes = np.argmax(y_prediction, axis=1)

# Calculating and printing the classification report which includes precision, f1-score and more
print("Classification Report:")
print(classification_report(Y_test, y_prediction_classes, target_names=class_names))

# Confusion matrix. The result is stored under its own name: assigning it to
# `confusion_matrix` would overwrite the function imported from sklearn.metrics
# and break any later call to it.
confusion_mtx = tf.math.confusion_matrix(Y_test, y_prediction_classes).numpy()
print("Confusion Matrix:")
print(confusion_mtx)
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Model accuracy graph (per-epoch values recorded by model.fit)
plt.figure()
plt.plot(training_results.history['accuracy'])
plt.plot(training_results.history['val_accuracy'])
plt.legend(['Train Accuracy', 'Validation Accuracy'], loc='upper right')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.show()

# Model loss graph (per-epoch values recorded by model.fit)
plt.figure()
plt.plot(training_results.history['loss'])
plt.plot(training_results.history['val_loss'])
plt.legend(['Train Loss', 'Validation Loss'], loc='upper right')
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()


### Further Evaluation with k-fold Cross Validation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

def build_cnn():
    """Build and compile a fresh, untrained copy of the CNN used for every fold.

    Returns:
        A compiled Sequential model: three convolution + max-pooling stages
        (32, 64, 128 filters), a 256-unit dense layer with 50% dropout and a
        3-way softmax output, using the 'adam' optimizer and sparse
        categorical cross entropy.
    """
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3), padding='valid'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='valid'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='valid'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


class_names = ['Meningioma', 'Glioma', 'Pituitary']

data = meningioma + glioma + pituitary
targets = np.concatenate((meningioma_targets, glioma_targets, pituitary_targets), axis=0)
X = np.array(data)
Y = np.array(targets)

# Implementing the k-fold Cross Validation on the ORIGINAL samples. Oversampling
# is done inside each fold, on the training part only; oversampling the whole
# dataset first would place copies of the same image in both the training and
# the test fold and inflate every score.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
oversampler = RandomOverSampler(random_state=42)

accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
loss_scores = []
confusion_matrices = []

for fold_no, (train_index, test_index) in enumerate(kfold.split(X, Y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Carving out a stratified 20% validation set before oversampling.
    # `validation_split` is not used because Keras takes the last samples of the
    # arrays as given, which would not be a representative or duplicate-free set.
    X_fit, X_val, Y_fit, Y_val = train_test_split(
        X_train, Y_train, test_size=0.2, random_state=42, stratify=Y_train
    )

    # Oversampling the minority classes of this fold's training data only
    X_resampled, Y_resampled = oversampler.fit_resample(X_fit.reshape(X_fit.shape[0], -1), Y_fit)
    X_resampled = X_resampled.reshape(-1, 224, 224, 3)

    # Dropping the previous fold's model state so memory does not grow with every fold
    tf.keras.backend.clear_session()
    model = build_cnn()

    print(f'Training for fold {fold_no} ...')
    training_results = model.fit(X_resampled, Y_resampled, epochs=10, batch_size=25,
                                 validation_data=(X_val, Y_val), verbose=1)

    test_loss, test_accuracy = model.evaluate(X_test, Y_test)
    print(f'Score for fold {fold_no}: loss of {test_loss:.4f}; accuracy of {test_accuracy:.4f}')
    y_prediction = model.predict(X_test)
    y_prediction_classes = np.argmax(y_prediction, axis=1)

    # Calculating and printing the classification report
    print(f"Classification Report for fold {fold_no}:")
    print(classification_report(Y_test, y_prediction_classes, target_names=class_names))

    # Confusion matrix
    confusion_mtx = confusion_matrix(Y_test, y_prediction_classes)
    print(f"Confusion Matrix for fold {fold_no}:")
    print(confusion_mtx)

    # Calculate the metrics (class-frequency weighted averages)
    accuracy = accuracy_score(Y_test, y_prediction_classes)
    precision = precision_score(Y_test, y_prediction_classes, average='weighted')
    recall = recall_score(Y_test, y_prediction_classes, average='weighted')
    f1 = f1_score(Y_test, y_prediction_classes, average='weighted')

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    loss_scores.append(test_loss)
    confusion_matrices.append(confusion_mtx)

# Displaying the average metrics
print('\nAverage scores for all folds:')
print(f'> Accuracy: {np.mean(accuracy_scores):.4f} (+- {np.std(accuracy_scores):.4f})')
print(f'> Precision: {np.mean(precision_scores):.4f} (+- {np.std(precision_scores):.4f})')
print(f'> Recall: {np.mean(recall_scores):.4f} (+- {np.std(recall_scores):.4f})')
print(f'> F1 Score: {np.mean(f1_scores):.4f} (+- {np.std(f1_scores):.4f})')
print(f'> Loss: {np.mean(loss_scores):.4f} (+- {np.std(loss_scores):.4f})')

# Calculating the average confusion matrix; rounding to the nearest integer
# instead of truncating, so e.g. a mean of 4.8 is shown as 5 and not 4
avg_cm = np.rint(np.mean(confusion_matrices, axis=0)).astype(int)

print('\nAverage Confusion Matrix:')
print(avg_cm)

plt.figure(figsize=(8, 6))
sns.heatmap(avg_cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Average Confusion Matrix')
plt.show()