# Brain Tumor MRI Research Notebook
*Generated 2025-08-11 09:59*

This notebook provides a complete, modular pipeline for brain tumor detection and classification using MRI images. It includes data intake, preprocessing, exploratory analysis, training (transfer learning baseline), evaluation metrics (classification report, confusion matrix, ROC‑AUC), and Grad‑CAM explainability. Slots are provided to plug in advanced models (e.g., DBN + BiLSTM) if desired.

## 0. Environment & Dependencies

In [None]:
# If running on a clean environment, uncomment the next line to install packages.
# %pip install -q numpy pandas matplotlib scikit-learn scikit-image opencv-python pillow tqdm seaborn tensorflow==2.15.0
# If you plan to use PyTorch instead of TensorFlow, you can install it and swap the model section accordingly.

In [None]:
import os, math, json, itertools, random, shutil, zipfile, glob
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (confusion_matrix, classification_report, roc_auc_score,
                             roc_curve, auc)
from sklearn.model_selection import train_test_split

# For image utilities
from PIL import Image

# Optional: uncomment the OpenCV import below if you add OpenCV-based code
# import cv2
from skimage import exposure, filters, morphology, measure

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Reproducibility: the Python, NumPy and TensorFlow random generators all
# start from the same fixed seed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Record the TensorFlow version in the notebook output.
print(tf.__version__)

## 1. Dataset Setup
Organize your dataset in the following directory structure (typical for Brain Tumor MRI datasets):
```
dataset_root/
  train/
    glioma/
    meningioma/
    pituitary/
    notumor/
  val/
    glioma/
    meningioma/
    pituitary/
    notumor/
  test/
    glioma/
    meningioma/
    pituitary/
    notumor/
```
Update `DATASET_ROOT` below to point to your local path.

In [None]:
# === Paths and global settings ===
# Root folder that holds the train/, val/ and test/ split directories.
DATASET_ROOT = Path("/path/to/brain-tumor-dataset")  # <-- CHANGE THIS

# Names of the class sub-folders expected inside every split directory.
CLASSES = [
    "glioma",
    "meningioma",
    "pituitary",
    "notumor",
]
assert len(CLASSES) >= 2, "Need at least two classes"

# Image (height, width) fed to the network -- MobileNetV2's default -- and
# the number of images per batch.
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# Only echoes the configured root; the path is not checked for existence here.
print("Dataset root:", DATASET_ROOT)

### 1.1 Quick File Count

In [None]:
def count_images(root: Path, classes):
    """Count image files per split and class under ``root``.

    Looks in ``root/<split>/<class>`` (recursively) for every split in
    ``train``, ``val`` and ``test`` and every name in ``classes``.

    Args:
        root: Dataset root directory (a ``Path`` or a path string).
        classes: Iterable of class sub-folder names.

    Returns:
        A ``pandas.DataFrame`` with the columns ``split``, ``class`` and
        ``count``, one row per (split, class) pair. A missing directory is
        reported with a count of 0 rather than raising.
    """
    # Same extension filter as show_samples(): matched case-insensitively so
    # files such as "scan.JPG" are counted too.
    image_suffixes = {".png", ".jpg", ".jpeg"}
    root = Path(root)

    rows = []
    for split in ("train", "val", "test"):
        for c in classes:
            d = root / split / c
            n = 0
            if d.is_dir():
                # One recursive walk instead of one per extension; is_file()
                # keeps directories that merely look like images out.
                n = sum(
                    1
                    for p in d.rglob("*")
                    if p.suffix.lower() in image_suffixes and p.is_file()
                )
            rows.append({"split": split, "class": c, "count": n})
    # Explicit columns keep the frame's shape stable even when `classes` is empty.
    return pd.DataFrame(rows, columns=["split", "class", "count"])

# Best-effort overview of the image counts: any error is printed and the
# notebook carries on instead of stopping at this cell.
try:
    counts_df = count_images(DATASET_ROOT, CLASSES)
    # display() is not imported in this file -- presumably the IPython/Jupyter built-in.
    display(counts_df)
except Exception as e:
    print("Skipping counts (path may be invalid):", e)

## 2. Exploratory Data Analysis (EDA)

In [None]:
def show_samples(root, split="train", classes=CLASSES, n_per_class=3, img_size=IMG_SIZE):
    """Plot a grid of randomly chosen images, one row per class.

    Args:
        root: Dataset root directory (``Path`` or path string).
        split: Split sub-folder to sample from, e.g. ``"train"``.
        classes: Class sub-folder names; each one becomes a grid row.
        n_per_class: Number of images (grid columns) per class.
        img_size: ``(width, height)`` every image is resized to for display.

    Cells for which a class has too few images stay empty.
    """
    # squeeze=False keeps `axes` two-dimensional for every grid shape, so
    # axes[i, j] also works with a single class and/or a single column.
    fig, axes = plt.subplots(
        len(classes), n_per_class,
        figsize=(n_per_class * 3, len(classes) * 3),
        squeeze=False,
    )
    image_suffixes = {".png", ".jpg", ".jpeg"}
    root = Path(root)

    for i, c in enumerate(classes):
        # Sort first: glob order depends on the file system, and a stable
        # starting order makes the seeded shuffle reproducible.
        imgs = sorted(
            p for p in (root / split / c).glob("**/*")
            if p.suffix.lower() in image_suffixes
        )
        random.shuffle(imgs)
        for j in range(n_per_class):
            ax = axes[i, j]
            if j < len(imgs):
                # The context manager closes the file handle once the pixels
                # have been converted into an independent image.
                with Image.open(imgs[j]) as src:
                    im = src.convert("RGB").resize(img_size)
                ax.imshow(im)
                ax.set_title(f"{c}")
            ax.axis("off")
    plt.tight_layout()
    plt.show()

# show_samples(DATASET_ROOT, split="train")

## 3. Preprocessing
Here we implement simple intensity normalization, optional CLAHE (contrast enhancement), and Otsu thresholding examples. You can adapt this section for your preferred pipeline.

In [None]:
def preprocess_pil(img: Image.Image, img_size=IMG_SIZE, do_clahe=False):
    """Min-max normalize an image and return it as a resized RGB image.

    Args:
        img: Input PIL image in any mode; it is converted to grayscale first.
        img_size: ``(width, height)`` of the returned image.
        do_clahe: If true, apply adaptive histogram equalization
            (``skimage.exposure.equalize_adapthist``) after normalization.

    Returns:
        An RGB ``PIL.Image`` of size ``img_size`` whose three channels carry
        the same processed grayscale values.
    """
    # Grayscale for the intensity operations; converted back to RGB at the end.
    arr = np.asarray(img.convert("L"), dtype=np.float64)

    # Min-max normalize to [0, 1]; the epsilon avoids 0/0 on constant images.
    lo, hi = arr.min(), arr.max()
    arr = (arr - lo) / (hi - lo + 1e-8)

    if do_clahe:
        arr = exposure.equalize_adapthist(arr, clip_limit=0.02)

    # Round instead of truncating: because of the epsilon the maximum is just
    # below 1.0, and a plain uint8 cast would map the brightest pixel to 254.
    arr = np.clip(np.rint(arr * 255), 0, 255).astype(np.uint8)
    return Image.fromarray(arr).convert("RGB").resize(img_size)

# Example (commented until a real file path is provided):
# sample_path = next((DATASET_ROOT/'train'/'glioma').glob("*.jpg"))
# preprocess_pil(Image.open(sample_path))

### 3.1 Simple Segmentation Demo (Otsu)

In [None]:
def simple_otsu_segmentation(img: Image.Image):
    """Return a binary foreground mask for ``img`` from Otsu's threshold.

    The image is converted to grayscale and pixels brighter than the Otsu
    threshold are marked as foreground. The mask is then cleaned with
    ``remove_small_objects(min_size=64)`` and
    ``remove_small_holes(area_threshold=64)``.

    Returns:
        A 2-D ``uint8`` array with 1 for foreground and 0 for background.
    """
    gray = np.array(img.convert("L"))
    foreground = gray > filters.threshold_otsu(gray)

    # Morphological clean-up works on the boolean mask directly.
    foreground = morphology.remove_small_objects(foreground, min_size=64)
    foreground = morphology.remove_small_holes(foreground, area_threshold=64)
    return foreground.astype(np.uint8)

# Example (requires a sample image path):
# sample = Image.open(sample_path).resize(IMG_SIZE)
# mask = simple_otsu_segmentation(sample)
# plt.figure(figsize=(6,3))
# plt.subplot(1,2,1); plt.imshow(sample); plt.title("Original"); plt.axis("off")
# plt.subplot(1,2,2); plt.imshow(mask, cmap="gray"); plt.title("Otsu Mask"); plt.axis("off")
# plt.show()

## 4. Dataloaders (TensorFlow)

In [None]:
def make_datasets(root, img_size=IMG_SIZE, batch_size=BATCH_SIZE, seed=SEED, class_names=None):
    """Build the train/val/test ``tf.data`` datasets from ``root``.

    Args:
        root: Dataset root containing ``train/``, ``val/`` and ``test/``.
        img_size: ``(height, width)`` the images are resized to.
        batch_size: Number of images per batch.
        seed: Seed forwarded to the loader (used for shuffling).
        class_names: Class sub-folder names in the order that defines the
            label indices. Defaults to ``CLASSES``.

    Returns:
        ``(train_ds, val_ds, test_ds)`` with one-hot (``categorical``)
        labels. Only the training set is shuffled.

    Raises:
        ValueError: Raised by Keras when ``class_names`` does not match the
            sub-directories of a split.
    """
    root = Path(root)
    # Without explicit class_names Keras orders labels alphanumerically
    # ("notumor" before "pituitary"), which disagrees with CLASSES and would
    # mislabel every report that uses CLASSES as its class names.
    class_names = list(CLASSES if class_names is None else class_names)

    def _load(split, shuffle):
        # One loader call per split; only `shuffle` differs between them.
        return keras.utils.image_dataset_from_directory(
            str(root / split),
            class_names=class_names,
            label_mode="categorical",
            image_size=img_size,
            batch_size=batch_size,
            shuffle=shuffle,
            seed=seed,
        )

    return _load("train", True), _load("val", False), _load("test", False)

# train_ds, val_ds, test_ds = make_datasets(DATASET_ROOT)

### 4.1 Augmentation Layer

In [None]:
# Light augmentation stack applied to the model input (see build_model):
# horizontal flips, small random rotations and small random zooms.
data_augmentation = keras.Sequential()
data_augmentation.add(layers.RandomFlip("horizontal"))
data_augmentation.add(layers.RandomRotation(0.05))
data_augmentation.add(layers.RandomZoom(0.1))

## 5. Model: Transfer Learning Baseline (MobileNetV2)

In [None]:
def build_model(num_classes=len(CLASSES), img_size=IMG_SIZE, *, dropout_rate=0.2,
                learning_rate=1e-3, weights="imagenet"):
    """Build and compile the MobileNetV2 transfer-learning baseline.

    The model applies ``data_augmentation`` and MobileNetV2's own
    ``preprocess_input`` internally, followed by the frozen backbone, global
    average pooling, dropout and a softmax classification head.

    Args:
        num_classes: Number of output classes.
        img_size: ``(height, width)`` of the input images (tuple or list).
        dropout_rate: Dropout rate applied before the classification head.
        learning_rate: Learning rate of the Adam optimizer.
        weights: Passed to ``MobileNetV2`` (``"imagenet"``, ``None`` or a
            path to a weights file).

    Returns:
        A compiled ``keras.Model`` using categorical cross-entropy loss and
        the accuracy metric.
    """
    # tuple() lets callers pass img_size as a list as well.
    input_shape = tuple(img_size) + (3,)

    base = keras.applications.MobileNetV2(
        input_shape=input_shape, include_top=False, weights=weights)
    base.trainable = False  # fine-tune later

    inputs = keras.Input(shape=input_shape)
    x = data_augmentation(inputs)
    # Pixel rescaling happens inside the model, so callers feed raw images.
    x = keras.applications.mobilenet_v2.preprocess_input(x)
    # training=False keeps the backbone in inference mode.
    x = base(x, training=False)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(dropout_rate)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)

    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate),
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

# model = build_model()
# model.summary()

## 6. Training

In [None]:
# Number of training epochs passed to the (commented-out) model.fit call below.
EPOCHS = 10
# train_ds, val_ds, test_ds = make_datasets(DATASET_ROOT)

# history = model.fit(
#     train_ds,
#     validation_data=val_ds,
#     epochs=EPOCHS
# )
# pd.DataFrame(history.history).to_csv("training_log.csv", index=False)

### 6.1 Training Curves

In [None]:
def plot_curves(history_dict):
    """Plot accuracy and loss curves from a Keras history dictionary.

    Args:
        history_dict: Mapping such as ``history.history``. The keys
            ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss`` are
            read; missing keys are skipped.

    Accuracy is drawn on the left y-axis (solid lines) and loss on a twin
    right y-axis (dashed lines); one legend covers both axes.
    """
    def _series(key):
        # list() makes the truthiness test below safe for arrays as well.
        return list(history_dict.get(key, []))

    def _draw(ax, values, label, **style):
        # Every series gets its own x-range, so a missing accuracy history
        # can no longer break the loss plot.
        if values:
            ax.plot(range(1, len(values) + 1), values, label=label, **style)

    fig, ax_acc = plt.subplots(figsize=(6, 4))
    # Loss on a twin y-axis for clarity.
    ax_loss = ax_acc.twinx()

    # Explicit colours: both axes would otherwise restart the same colour
    # cycle and draw accuracy and loss in identical colours.
    _draw(ax_acc, _series("accuracy"), "train_acc", color="C0")
    _draw(ax_acc, _series("val_accuracy"), "val_acc", color="C1")
    _draw(ax_loss, _series("loss"), "train_loss", color="C2", linestyle="--")
    _draw(ax_loss, _series("val_loss"), "val_loss", color="C3", linestyle="--")

    ax_acc.set_title("Training Progress")
    ax_acc.set_xlabel("Epoch")
    ax_acc.set_ylabel("Accuracy")
    ax_loss.set_ylabel("Loss")

    # A legend only lists the lines of its own axes, so gather the handles
    # from both and draw the legend on the twin axes, which sits on top.
    acc_handles, acc_labels = ax_acc.get_legend_handles_labels()
    loss_handles, loss_labels = ax_loss.get_legend_handles_labels()
    if acc_handles or loss_handles:
        ax_loss.legend(acc_handles + loss_handles, acc_labels + loss_labels,
                       loc="upper left")

    fig.tight_layout()
    plt.show()

# Example after training:
# plot_curves(history.history)

## 7. Evaluation

In [None]:
def evaluate_model(model, test_ds, class_names=CLASSES):
    """Evaluate ``model`` on ``test_ds`` and print/plot the main metrics.

    Shows a confusion matrix, prints a classification report and, when it
    can be computed, the macro one-vs-rest ROC-AUC.

    Args:
        model: Model whose ``predict`` returns per-class probabilities.
        test_ds: Iterable of ``(images, one_hot_labels)`` batches.
        class_names: Class names indexed by label id, i.e. ``class_names[k]``
            must be the class encoded by one-hot column ``k``.

    Raises:
        ValueError: If ``test_ds`` yields no batches.
    """
    y_true, y_prob = [], []
    for x, y in test_ds:
        y_prob.append(np.asarray(model.predict(x, verbose=0)))
        y_true.append(np.asarray(y))
    if not y_prob:
        raise ValueError("test_ds yielded no batches; nothing to evaluate")

    y_prob = np.concatenate(y_prob, axis=0)
    y_true = np.concatenate(y_true, axis=0)
    y_pred = np.argmax(y_prob, axis=1)
    y_true_labels = np.argmax(y_true, axis=1)

    # The same explicit label ids go to both the confusion matrix and the
    # report, so each has one row per class even when a class is absent.
    label_ids = list(range(len(class_names)))

    # Confusion Matrix
    cm = confusion_matrix(y_true_labels, y_pred, labels=label_ids)
    fig = plt.figure(figsize=(5, 4))
    plt.imshow(cm, interpolation="nearest")
    plt.title("Confusion Matrix")
    plt.xticks(label_ids, class_names, rotation=45, ha="right")
    plt.yticks(label_ids, class_names)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, cm[i, j], ha="center", va="center")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    fig.tight_layout()
    plt.show()

    print("\nClassification Report\n")
    # Without `labels`, a class missing from both y_true and y_pred makes
    # target_names mismatch and raises. zero_division=0 reports 0 for such
    # classes instead of emitting warnings.
    print(classification_report(y_true_labels, y_pred, labels=label_ids,
                                target_names=class_names, zero_division=0))

    # One-vs-Rest ROC-AUC (if at least 2 classes)
    if y_prob.shape[1] >= 2:
        try:
            auc_macro = roc_auc_score(y_true, y_prob, multi_class="ovr")
            print(f"Macro ROC-AUC: {auc_macro:.4f}")
        except ValueError as e:
            # Best effort: e.g. undefined when a class has no test samples.
            print("ROC-AUC could not be computed:", e)

# evaluate_model(model, test_ds, CLASSES)

## 8. Explainability: Grad‑CAM

In [None]:
def make_gradcam_heatmap(img_array, model, last_conv_layer_name, class_index=None):
    """Compute a Grad-CAM heatmap for the first image in ``img_array``.

    Args:
        img_array: Batch of input images; only the first one is explained.
        model: Keras model to explain.
        last_conv_layer_name: Name of the layer whose output is weighted.
            It is looked up with ``model.get_layer``, so it must be a layer
            of ``model`` itself rather than of a nested sub-model.
        class_index: Class whose score is explained. Defaults to the class
            predicted for the first image.

    Returns:
        A 2-D ``numpy`` array with values in ``[0, 1]`` and the spatial size
        of the chosen layer's output.

    Raises:
        ValueError: If the class score has no gradient with respect to the
            chosen layer's output.
    """
    grad_model = tf.keras.models.Model(
        model.inputs, [model.get_layer(last_conv_layer_name).output, model.output]
    )
    with tf.GradientTape() as tape:
        conv_outputs, predictions = grad_model(img_array, training=False)
        if class_index is None:
            class_index = tf.argmax(predictions[0])
        class_score = predictions[:, class_index]

    grads = tape.gradient(class_score, conv_outputs)
    if grads is None:
        raise ValueError(
            f"No gradient between the class score and layer {last_conv_layer_name!r}"
        )

    # Channel importance: gradients averaged over batch and spatial axes.
    pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))
    heatmap = tf.reduce_sum(tf.multiply(pooled_grads, conv_outputs[0]), axis=-1)

    # Stay in TensorFlow ops so the final .numpy() is valid (np.maximum would
    # already return an ndarray). ReLU comes first so the peak used for
    # normalization is taken from the rectified map.
    heatmap = tf.maximum(heatmap, 0)
    heatmap = heatmap / (tf.math.reduce_max(heatmap) + 1e-8)
    return heatmap.numpy()

# Example usage after training:
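# NOTE: this lookup only sees top-level layers. The model from build_model() wraps
# MobileNetV2 as one nested layer, so the list is empty for it; use a model whose
# conv layers are reachable through model.get_layer().
# target_layer = [l.name for l in model.layers if isinstance(l, layers.Conv2D)][-1]
# img_path = next((DATASET_ROOT/'test'/'glioma').glob("*.jpg"))
# img = Image.open(img_path).convert("RGB").resize(IMG_SIZE)
# Keep pixels in 0-255: build_model() applies mobilenet_v2.preprocess_input inside the model.
# x = np.expand_dims(np.array(img), 0).astype(np.float32)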
# heatmap = make_gradcam_heatmap(x, model, last_conv_layer_name=target_layer)
# plt.figure(figsize=(6,3))
# plt.subplot(1,2,1); plt.imshow(img); plt.title("Image"); plt.axis("off")
# plt.subplot(1,2,2); plt.imshow(heatmap); plt.title("Grad-CAM"); plt.axis("off")
# plt.show()

## 9. (Optional) Advanced Slot: DBN + BiLSTM
This section is a placeholder where you can prototype a DBN feature extractor (via stacked RBMs) and feed the extracted features into a BiLSTM classifier over slice sequences. Implementations vary; consider using PyTorch or TensorFlow Probability for RBM-like layers, or substitute with an autoencoder as a practical surrogate.

In [None]:
# Pseudocode / sketch (fill as needed):
# 1) Build/Train stacked RBMs (or autoencoder) to extract features from each image/slice.
# 2) Aggregate per-patient sequences: shape [time/slices, feature_dim].
# 3) Feed sequences to a BiLSTM for classification.
# 4) Train end-to-end (optionally fine-tune feature extractor).

# from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Input, Masking
# seq_inputs = Input(shape=(None, feature_dim))
# x = Bidirectional(layers.LSTM(128, return_sequences=False))(seq_inputs)
# outputs = Dense(num_classes, activation="softmax")(x)
# seq_model = keras.Model(seq_inputs, outputs)
# seq_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

## 10. Export & Reproducibility

In [None]:
# Persist the class list and run settings next to the notebook.
config = dict(
    classes=CLASSES,
    img_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    seed=SEED,
)
# Serialize first, then write the JSON text in a single call.
Path("run_config.json").write_text(json.dumps(config, indent=2))
print("Saved run_config.json")

## Notes
- Ensure dataset paths are correct before running the dataloader cell.
- Start with the baseline model; once stable, enable fine‑tuning (`base.trainable=True`) and re‑train with a lower LR.
- Use the Grad‑CAM cell to sanity‑check that the network is focusing on plausible tumor regions.
- Replace MobileNetV2 with any backbone (e.g., ResNet50, EfficientNet) if you prefer.
- For publications, record seeds, exact package versions, and hardware.
