# Exploratory Data Analysis

In [1]:
# python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2  # for image processing
from PIL import Image
import os

# Tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split



# Data Exploration

In [2]:
# Class folders of the Kaggle brain-tumor dataset (one directory per class).
healthy_filepath = "/kaggle/input/brian-tumor-dataset/Brain Tumor Data Set/Brain Tumor Data Set/Healthy"
unhealthy_filepath = "/kaggle/input/brian-tumor-dataset/Brain Tumor Data Set/Brain Tumor Data Set/Brain Tumor"

# Count the directory entries of each class folder to gauge class balance.
healthy_count, unhealthy_count = (
    len(os.listdir(class_dir)) for class_dir in (healthy_filepath, unhealthy_filepath)
)

print(f"Healthy Images: {healthy_count}, Unhealthy Images: {unhealthy_count}")

Healthy Images: 2087, Unhealthy Images: 2513


# Load Images

In [3]:
# standardize all of the images
def load_images_from_folder(folder, label, image_size=(128, 128)):
    """Load every readable image in ``folder`` as a resized RGB array.

    Each file is decoded with OpenCV first. When OpenCV cannot decode it
    (``cv2.imread`` returns ``None``), the file is opened with PIL instead.
    Both paths yield a 3-channel RGB array, which is then resized to
    ``image_size``.

    Args:
        folder: Directory whose entries are treated as image files.
        label: Label appended once for every image that loads successfully.
        image_size: Target size passed to ``cv2.resize``.

    Returns:
        A tuple ``(images, labels)`` of equal-length lists. Files that cannot
        be loaded are reported on stdout and skipped.
    """
    images = []
    labels = []
    for filename in os.listdir(folder):
        filepath = os.path.join(folder, filename)
        try:
            img = cv2.imread(filepath)
            if img is None:
                # OpenCV could not decode the file: fall back to PIL.
                # convert("RGB") normalizes grayscale / RGBA / palette images.
                with Image.open(filepath) as pil_img:
                    img = np.array(pil_img.convert("RGB"))
            else:
                # OpenCV returns channels in BGR order; convert so that both
                # loading paths produce RGB arrays.
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img = cv2.resize(img, image_size)
            images.append(img)
            labels.append(label)

        except Exception as e:
            print(f"Error loading {filename}: {e}")
            continue  # Skip unreadable images

    return images, labels

In [4]:
# Load each class folder into parallel lists of images and labels
# (label 0 = healthy, label 1 = brain tumor).
healthy_images, healthy_labels = load_images_from_folder(
    folder=healthy_filepath,
    label=0,
)
unhealthy_images, unhealthy_labels = load_images_from_folder(
    folder=unhealthy_filepath,
    label=1,
)

# Image Preprocessing

In [5]:
# Stack both classes into one array and rescale pixel values from [0, 255] to [0, 1].
# The array is built as float32 explicitly: dividing an integer image array by
# 255.0 would otherwise promote it to float64 and double the memory footprint.
X = np.array(healthy_images + unhealthy_images, dtype=np.float32) / 255.0
# Labels in the same order as X: healthy entries first, then unhealthy entries.
y = np.array(healthy_labels + unhealthy_labels)

**X explained:**
    Dividing by 255.0 normalizes the pixel values from their original range of [0, 255] to [0, 1]. Normalized inputs help the model learn more efficiently and can lead to better training results, since most neural networks work better with small, consistently scaled input values.
    


In [6]:
# Data Augmentation
# The transforms are kept deliberately mild so that details stay inside the frame.
augmentation_options = dict(
    rotation_range=5,         # small random rotations
    zoom_range=0.1,           # slight zoom in / out
    width_shift_range=0.05,   # small horizontal shift
    height_shift_range=0.05,  # small vertical shift
    fill_mode="nearest",      # how pixels exposed by a transform are filled
)
data_gen = ImageDataGenerator(**augmentation_options)


**Data Augmentation:** Augmentation is meant to help prevent overfitting. The settings used above are deliberately mild:
* Rotation Range: randomly rotates images by up to 5 degrees, which helps the model generalize to slightly rotated images.
* Zoom Range: randomly zooms in or out on images by up to 10%, which helps the model learn to recognize images at slightly different scales.
* Width Shift Range and Height Shift Range: randomly shift images horizontally and vertically by up to 5%. This helps the model generalize to images that may not be perfectly centered.
* Shear Range: a shear transformation essentially "slants" the image, which can help robustness to slight distortions. It is not applied in the configuration above.
* Fill Mode: specifies how to fill in new pixel values when shifting, zooming, or rotating an image ("nearest" repeats the closest existing pixel).

# Split Data

In [7]:
import numpy as np

# Separate healthy and unhealthy data based on labels
X_healthy = X[y == 0]
X_unhealthy = X[y == 1]

# Per-class sample counts for each split. The three splits together use
# 1800 images: 1080 for training (60%), 540 for validation (30%) and
# 180 for testing (10%).
train_size_per_class = 540  # training: 540 healthy + 540 unhealthy (50/50)
val_size_healthy = 432      # validation: 80% healthy
val_size_unhealthy = 108    # validation: 20% unhealthy
test_size_healthy = 144     # testing: 80% healthy
test_size_unhealthy = 36    # testing: 20% unhealthy

# Fail fast when a class has too few loaded images. NumPy slicing silently
# truncates past the end of an array, which would leave the feature arrays
# shorter than the label arrays built from the fixed counts above.
healthy_needed = train_size_per_class + val_size_healthy + test_size_healthy
unhealthy_needed = train_size_per_class + val_size_unhealthy + test_size_unhealthy
assert len(X_healthy) >= healthy_needed, (
    f"Need {healthy_needed} healthy images, found {len(X_healthy)}"
)
assert len(X_unhealthy) >= unhealthy_needed, (
    f"Need {unhealthy_needed} unhealthy images, found {len(X_unhealthy)}"
)

# Slice boundaries: train, validation and test occupy consecutive,
# non-overlapping index ranges within each class.
healthy_val_end = train_size_per_class + val_size_healthy
unhealthy_val_end = train_size_per_class + val_size_unhealthy

# Step 1: Training Set - balanced 50/50 between healthy and unhealthy
X_train = np.concatenate((X_healthy[:train_size_per_class],
                          X_unhealthy[:train_size_per_class]))
y_train = np.array([0] * train_size_per_class + [1] * train_size_per_class)

# Step 2: Validation Set - 80/20 healthy/unhealthy split
X_val = np.concatenate((X_healthy[train_size_per_class:healthy_val_end],
                        X_unhealthy[train_size_per_class:unhealthy_val_end]))
y_val = np.array([0] * val_size_healthy + [1] * val_size_unhealthy)

# Step 3: Testing Set - 80/20 healthy/unhealthy split
X_test = np.concatenate((X_healthy[healthy_val_end:healthy_needed],
                         X_unhealthy[unhealthy_val_end:unhealthy_needed]))
y_test = np.array([0] * test_size_healthy + [1] * test_size_unhealthy)

# Features and labels of every split must line up one-to-one.
assert len(X_train) == len(y_train)
assert len(X_val) == len(y_val)
assert len(X_test) == len(y_test)

# Verification
print("Training set:", len(X_train), len(y_train))  # Expected: 1080
print("Validation set:", len(X_val), len(y_val))    # Expected: 540
print("Testing set:", len(X_test), len(y_test))      # Expected: 180


Training set: 1080 1080
Validation set: 540 540
Testing set: 180 180


**Training Set (1080 Images):**
* We select the first 540 healthy and the first 540 unhealthy images to ensure a 50/50 split.

**Validation Set (540 Images):**
* We select the next 432 healthy images and the next 108 unhealthy images to achieve an 80/20 split.

**Testing Set (180 Images):**
* Finally, we select the next 144 healthy images and the next 36 unhealthy images to meet the 80/20 split.

# Model Building

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Small CNN: two convolution / max-pooling stages followed by a dense classifier head.
model = Sequential(
    [
        # 128x128 images with 3 channels
        Input(shape=(128, 128, 3)),
        # Convolutional feature extractor
        Conv2D(filters=32, kernel_size=(3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(filters=64, kernel_size=(3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        # Classifier head with dropout before the output layer
        Flatten(),
        Dense(units=128, activation="relu"),
        Dropout(rate=0.5),
        # Single sigmoid unit for the binary output
        Dense(units=1, activation="sigmoid"),
    ]
)


In [9]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [10]:
# Sanity check: within each split the number of samples and labels must be equal.
for split_features, split_labels in ((X_train, y_train), (X_val, y_val)):
    print(len(split_features), len(split_labels))


1080 1080
540 540


In [11]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training batches come from the augmenting generator; the validation
# arrays are passed to fit() directly, without going through the generator.
train_batches = data_gen.flow(X_train, y_train, batch_size=32)

history = model.fit(train_batches, validation_data=(X_val, y_val), epochs=20)


Epoch 1/20


  self._warn_if_super_not_called()


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 534ms/step - accuracy: 0.5325 - loss: 1.0087 - val_accuracy: 0.7556 - val_loss: 0.4845
Epoch 2/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 484ms/step - accuracy: 0.6122 - loss: 0.6211 - val_accuracy: 0.7241 - val_loss: 0.5191
Epoch 3/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 501ms/step - accuracy: 0.6934 - loss: 0.5930 - val_accuracy: 0.8056 - val_loss: 0.4299
Epoch 4/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 483ms/step - accuracy: 0.7594 - loss: 0.5403 - val_accuracy: 0.8167 - val_loss: 0.4025
Epoch 5/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 488ms/step - accuracy: 0.7626 - loss: 0.5203 - val_accuracy: 0.8296 - val_loss: 0.3732
Epoch 6/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 500ms/step - accuracy: 0.7688 - loss: 0.4756 - val_accuracy: 0.7667 - val_loss: 0.4326
Epoch 7/20
[1m34/34[0m [32m━━━

In [12]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Model outputs for the held-out test images
y_prob = model.predict(X_test)
# Round the outputs to integer class labels (0 or 1)
y_pred = np.round(y_prob).astype(int)

# Build both summaries first, then print the report followed by the confusion matrix
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report)
print(matrix)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 115ms/step
              precision    recall  f1-score   support

           0       0.97      0.83      0.89       144
           1       0.56      0.89      0.69        36

    accuracy                           0.84       180
   macro avg       0.76      0.86      0.79       180
weighted avg       0.89      0.84      0.85       180

[[119  25]
 [  4  32]]
