# Exploratory Data Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2  # for image processing
from PIL import Image
import os
import random
import warnings

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models, applications
from keras_tuner import RandomSearch
from tensorflow.keras.applications import VGG16


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, roc_auc_score, roc_curve, precision_recall_curve, PrecisionRecallDisplay, average_precision_score
import matplotlib.pyplot as plt

In [2]:
# Seed every random number generator the notebook relies on, not just Python's
# built-in one: Keras' ImageDataGenerator draws from NumPy's global RNG and
# model weight initialisation/dropout draw from TensorFlow's.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# NOTE: this silences *all* warnings (including deprecation notices) for the
# rest of the notebook.
warnings.filterwarnings('ignore')

## Data Exploration

In [3]:
# Locations of the two class folders inside the extracted archive.
healthy_filepath = "archive/Brain Tumor Data Set/Brain Tumor Data Set/Healthy"
unhealthy_filepath = "archive/Brain Tumor Data Set/Brain Tumor Data Set/Brain Tumor"

# Count the directory entries of each class folder to gauge the class balance.
healthy_count, unhealthy_count = (
    len(os.listdir(class_dir)) for class_dir in (healthy_filepath, unhealthy_filepath)
)

print(f"Healthy Images: {healthy_count}, Unhealthy Images: {unhealthy_count}")

Healthy Images: 2087, Unhealthy Images: 2513


## Load Images

In [4]:
# standardize all of the images: same size, same (RGB) channel order
def load_images_from_folder(folder, label, image_size=(128, 128)):
    """Load every readable image in ``folder`` as a fixed-size RGB array.

    Each file is decoded with OpenCV first; if OpenCV cannot decode it
    (``cv2.imread`` returns ``None``), PIL is used as a fallback. Files that
    neither library can read are reported with ``print`` and skipped.

    Args:
        folder: Path of the directory whose entries are loaded.
        label: Class label recorded once for every image that loads.
        image_size: Target ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        A tuple ``(images, labels)`` of two equally long lists: the resized
        RGB image arrays and the matching labels.
    """
    images = []
    labels = []
    # os.listdir returns entries in arbitrary order; sort so that the load
    # order (and any position-based split built on it) is reproducible.
    for filename in sorted(os.listdir(folder)):
        filepath = os.path.join(folder, filename)
        try:
            img = cv2.imread(filepath)
            if img is None:
                # OpenCV could not read the image, so try with PIL instead.
                with Image.open(filepath) as pil_img:
                    # convert to RGB in case of grayscale or RGBA
                    img = np.array(pil_img.convert("RGB"))
            else:
                # cv2.imread yields BGR; convert so both decode paths
                # produce the same RGB channel order.
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img = cv2.resize(img, image_size)
            images.append(img)
            labels.append(label)

        except Exception as e:
            print(f"Error loading {filename}: {e}")
            continue  # Skip unreadable images

    return images, labels

In [5]:
# Read both class folders into lists of images and labels
# (label 0 for the healthy folder, label 1 for the brain-tumor folder).
healthy_images, healthy_labels = load_images_from_folder(folder=healthy_filepath, label=0)
unhealthy_images, unhealthy_labels = load_images_from_folder(folder=unhealthy_filepath, label=1)

## Image Preprocessing

In [6]:
# Stack the healthy images followed by the unhealthy ones and rescale the
# pixel values by 1/255; the labels are stacked in the same order.
X = np.array([*healthy_images, *unhealthy_images]) / 255.0
y = np.array([*healthy_labels, *unhealthy_labels])

**X explained:**
Dividing by 255 rescales the pixel values from their original range of [0, 255] to [0, 1]. Normalizing helps the model learn more efficiently and can lead to better training results, since most neural networks work better with normalized data.
    


In [7]:
# Data Augmentation
# Every range is kept deliberately small so that image details are not lost
# and important features stay within the frame.
data_gen = ImageDataGenerator(
    fill_mode="nearest",
    height_shift_range=0.05,
    width_shift_range=0.05,  # small shifts, both vertically and horizontally
    zoom_range=0.1,          # small zoom
    rotation_range=5,        # small rotation range
)


**Data Augmentation:** is meant to help prevent overfitting by showing the model slightly altered copies of the training images. The ranges are kept small so that image details are not lost and important features stay within the frame.
* Rotation Range: randomly rotates images by up to 5 degrees, which helps the model generalize to slightly rotated images.
* Zoom Range: randomly zooms in or out on images by up to 10%, which helps the model learn to recognize images at different scales.
* Width Shift Range and Height Shift Range: randomly shift images horizontally and vertically by up to 5% of the image width/height. This helps the model generalize to images that may not be perfectly centered.
* Shear Range: a shear transformation essentially "slants" the image, which helps robustness to slight distortions. No `shear_range` is set in the generator above, so shearing is not applied here.
* Fill Mode: specifies how to fill in new pixel values when shifting, zooming, or rotating an image; `"nearest"` repeats the nearest existing pixel value.

## Split Data

In [8]:
# Separate healthy and unhealthy data based on labels
X_healthy = X[y == 0]
X_unhealthy = X[y == 1]

# Split sizes. Only 1,800 images are used in total:
# 1,080 train (60%) / 540 validation (30%) / 180 test (10%).
train_size_per_class = 540  # 540 healthy, 540 unhealthy (balanced 50/50)
val_size_healthy = 432      # 80% of the validation set
val_size_unhealthy = 108    # 20% of the validation set
test_size_healthy = 144     # 80% of the test set
test_size_unhealthy = 36    # 20% of the test set

# NumPy slices silently truncate when they run past the end of an array, which
# would leave the X_* arrays shorter than the y_* label arrays built from the
# counts above. Fail loudly instead.
healthy_needed = train_size_per_class + val_size_healthy + test_size_healthy
unhealthy_needed = train_size_per_class + val_size_unhealthy + test_size_unhealthy
if len(X_healthy) < healthy_needed or len(X_unhealthy) < unhealthy_needed:
    raise ValueError(
        f"Not enough images for the requested split: need {healthy_needed} healthy "
        f"and {unhealthy_needed} unhealthy, but loaded {len(X_healthy)} and {len(X_unhealthy)}."
    )

# Slice boundaries per class. Images are taken in load order (no shuffling),
# and the three slices of each class are consecutive, so they never overlap.
healthy_val_end = train_size_per_class + val_size_healthy
unhealthy_val_end = train_size_per_class + val_size_unhealthy

# Step 1: Training Set - balanced 50/50 between healthy and unhealthy
X_train = np.concatenate((X_healthy[:train_size_per_class],
                          X_unhealthy[:train_size_per_class]))
y_train = np.array([0] * train_size_per_class + [1] * train_size_per_class)

# Step 2: Validation Set - 80/20 healthy/unhealthy split
X_val = np.concatenate((X_healthy[train_size_per_class:healthy_val_end],
                        X_unhealthy[train_size_per_class:unhealthy_val_end]))
y_val = np.array([0] * val_size_healthy + [1] * val_size_unhealthy)

# Step 3: Testing Set - 80/20 healthy/unhealthy split
X_test = np.concatenate((X_healthy[healthy_val_end:healthy_val_end + test_size_healthy],
                         X_unhealthy[unhealthy_val_end:unhealthy_val_end + test_size_unhealthy]))
y_test = np.array([0] * test_size_healthy + [1] * test_size_unhealthy)

# Verification
print("Training set:", len(X_train), len(y_train))  # Expected: 1080
print("Validation set:", len(X_val), len(y_val))    # Expected: 540
print("Testing set:", len(X_test), len(y_test))      # Expected: 180


Training set: 1080 1080
Validation set: 540 540
Testing set: 180 180


Only 1,800 of the loaded images are used, split 60/30/10 into training, validation, and testing sets.

**Training Set (1080 Images):**
* We select the first 540 healthy and the first 540 unhealthy images to ensure a 50/50 split.

**Validation Set (540 Images):**
* We select the next 432 healthy images and 108 unhealthy images to achieve an 80/20 split.

**Testing Set (180 Images):**
* Finally, we select the next 144 healthy images and 36 unhealthy images to meet the 80/20 split.

## Export the Training, Validation, and Testing Data

### Testing Data

In [9]:
import imageio

In [10]:
# Scale the normalised pixels back up by 255 before writing them to disk.
X_test2 = X_test * 255

# Root folder that receives the exported testing images.
fileDirectory = "BrainImages/Testing"
os.makedirs(fileDirectory, exist_ok=True)

# One sub-folder per class.
# NOTE: the healthy folder is spelled "Healhty" on disk; anything that reads
# these exports must use the same spelling.
healthyBrainFolder = os.path.join(fileDirectory, "Healhty")
BrainTumorFolder = os.path.join(fileDirectory, "Brain_Tumor")

os.makedirs(healthyBrainFolder, exist_ok=True)
os.makedirs(BrainTumorFolder, exist_ok=True)

# Write each image as a PNG file, numbered from 1 by its position in the split.
for sample, (images, label) in enumerate(zip(X_test2, y_test)):
    # Limit to the valid 0-255 range and cast to 8-bit for the image writer.
    images = np.clip(images, 0, 255).astype(np.uint8)

    # Label 0 goes to the healthy folder, everything else to the tumor folder.
    destination = healthyBrainFolder if label == 0 else BrainTumorFolder

    # Build the output path and export the image.
    image_fileName = os.path.join(destination, f"image_{sample+1}.png")
    imageio.imwrite(image_fileName, images)

### Training Data

In [11]:
# Scale the normalised pixels back up by 255 before writing them to disk.
X_train2 = X_train * 255

# Root folder that receives the exported training images.
fileDirectory = "BrainImages/Training"
os.makedirs(fileDirectory, exist_ok=True)

# One sub-folder per class.
# NOTE: the healthy folder is spelled "Healhty" on disk; anything that reads
# these exports must use the same spelling.
healthyBrainFolder = os.path.join(fileDirectory, "Healhty")
BrainTumorFolder = os.path.join(fileDirectory, "Brain_Tumor")

os.makedirs(healthyBrainFolder, exist_ok=True)
os.makedirs(BrainTumorFolder, exist_ok=True)

# Write each image as a PNG file, numbered from 1 by its position in the split.
for sample, (images, label) in enumerate(zip(X_train2, y_train)):
    # Limit to the valid 0-255 range and cast to 8-bit for the image writer.
    images = np.clip(images, 0, 255).astype(np.uint8)

    # Label 0 goes to the healthy folder, everything else to the tumor folder.
    destination = healthyBrainFolder if label == 0 else BrainTumorFolder

    # Build the output path and export the image.
    image_fileName = os.path.join(destination, f"image_{sample+1}.png")
    imageio.imwrite(image_fileName, images)

### Validation Data

In [12]:
# Scale the normalised pixels back up by 255 before writing them to disk.
X_val2 = X_val * 255

# Root folder that receives the exported validation images.
fileDirectory = "BrainImages/Validation"
os.makedirs(fileDirectory, exist_ok=True)

# One sub-folder per class.
# NOTE: the healthy folder is spelled "Healhty" on disk; anything that reads
# these exports must use the same spelling.
healthyBrainFolder = os.path.join(fileDirectory, "Healhty")
BrainTumorFolder = os.path.join(fileDirectory, "Brain_Tumor")

os.makedirs(healthyBrainFolder, exist_ok=True)
os.makedirs(BrainTumorFolder, exist_ok=True)

# Write each image as a PNG file, numbered from 1 by its position in the split.
for sample, (images, label) in enumerate(zip(X_val2, y_val)):
    # Limit to the valid 0-255 range and cast to 8-bit for the image writer.
    images = np.clip(images, 0, 255).astype(np.uint8)

    # Label 0 goes to the healthy folder, everything else to the tumor folder.
    destination = healthyBrainFolder if label == 0 else BrainTumorFolder

    # Build the output path and export the image.
    image_fileName = os.path.join(destination, f"image_{sample+1}.png")
    imageio.imwrite(image_fileName, images)