In [None]:
!scp /content/drive/MyDrive/Colab\ Notebooks/DS\ 340W\ Project.ipynb /content/DS340W-

In [5]:

import os
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from tensorflow.keras.preprocessing import image
from tensorflow.keras import layers, models, regularizers, callbacks, optimizers
import tensorflow as tf

# ============================================================
# 1. Mount Drive & Define Dataset Paths
# ============================================================
from google.colab import drive
drive.mount('/content/drive')

healthy_save_path = '/content/drive/MyDrive/DS340W/Data/dataset2/healthy'
schizophrenia_save_path = '/content/drive/MyDrive/DS340W/Data/dataset2/schizophrenic'

healthy_folder = healthy_save_path
schizophrenia_folder = schizophrenia_save_path



Mounted at /content/drive


In [None]:

# ============================================================
# 2. Load & Preprocess Images
# ============================================================
def load_images(folder):
    images, labels = [], []
    for filename in os.listdir(folder):
        if filename.endswith(".png"):
            img = image.load_img(os.path.join(folder, filename), target_size=(224, 224))
            img_array = image.img_to_array(img) / 255.0
            images.append(img_array)
            labels.append(0 if folder == healthy_folder else 1)
    return np.array(images), np.array(labels)

X_healthy, y_healthy = load_images(healthy_folder)
X_schizophrenia, y_schizophrenia = load_images(schizophrenia_folder)

# Merge datasets
X = np.concatenate((X_healthy, X_schizophrenia), axis=0)
y = np.concatenate((y_healthy, y_schizophrenia), axis=0)

# Shuffle dataset
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
X, y = X[indices], y[indices]

print(f"Dataset loaded. X shape: {X.shape}, y shape: {y.shape}, #healthy={sum(y==0)}, #schizophrenia={sum(y==1)}")


Dataset loaded. X shape: (1828, 224, 224, 3), y shape: (1828,), #healthy=948, #schizophrenia=880


In [None]:

# ============================================================
# 3. Define CNN Model (from the paper)
# ============================================================
def build_small_cnn(input_shape=(224,224,3), l2_lambda=0.01, dropout_p=0.1):
    """Two-layer CNN as in the parent paper."""
    inp = layers.Input(shape=input_shape)

    # Conv layer 1
    x = layers.Conv2D(4, (3,3), strides=(2,2), padding='same', activation='relu')(inp)
    x = layers.MaxPool2D(pool_size=(2,2))(x)
    x = layers.Dropout(dropout_p)(x)

    # Conv layer 2 with L2 regularization
    x = layers.Conv2D(8, (3,3), strides=(2,2), padding='same',
                      activation='relu',
                      kernel_regularizer=regularizers.l2(l2_lambda))(x)
    x = layers.GlobalMaxPool2D()(x)
    x = layers.Dropout(dropout_p)(x)

    x = layers.Flatten()(x)
    x = layers.Dense(50, activation='relu')(x)
    out = layers.Dense(2, activation='softmax')(x)

    model = models.Model(inputs=inp, outputs=out)
    return model

def compile_and_prepare_training(model, lr=1e-3):
    """Compile model and return early stopping & LR schedule."""
    opt = optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    es = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=0)
    rlr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=0)
    return [es, rlr]


In [None]:

# ============================================================
# 4. Nested Cross-Validation (outer 10-fold, inner 3-fold simplified)
# ============================================================
# To simplify, we'll perform 10-fold stratified CV (outer loop only)
# Each fold reports balanced accuracy; final result is mean ± SD.

outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
balanced_accuracies = []

fold = 1
for train_idx, test_idx in outer_cv.split(X, y):
    print(f"\n===== Fold {fold} / 10 =====")

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Build & compile a fresh model for each fold
    model = build_small_cnn(input_shape=(224,224,3))
    cbs = compile_and_prepare_training(model)

    # Train
    history = model.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=50,
        batch_size=32,
        callbacks=cbs,
        verbose=0
    )

    # Evaluate
    y_pred = np.argmax(model.predict(X_test, verbose=0), axis=1)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    balanced_accuracies.append(bal_acc)
    print(f"Balanced Accuracy (Fold {fold}): {bal_acc:.4f}")
    fold += 1



===== Fold 1 / 10 =====




Balanced Accuracy (Fold 1): 0.7388

===== Fold 2 / 10 =====




Balanced Accuracy (Fold 2): 0.7044

===== Fold 3 / 10 =====
Balanced Accuracy (Fold 3): 0.6901

===== Fold 4 / 10 =====
Balanced Accuracy (Fold 4): 0.7676

===== Fold 5 / 10 =====
Balanced Accuracy (Fold 5): 0.6443

===== Fold 6 / 10 =====
Balanced Accuracy (Fold 6): 0.7007

===== Fold 7 / 10 =====
Balanced Accuracy (Fold 7): 0.7789

===== Fold 8 / 10 =====
Balanced Accuracy (Fold 8): 0.7813

===== Fold 9 / 10 =====
Balanced Accuracy (Fold 9): 0.7117

===== Fold 10 / 10 =====
Balanced Accuracy (Fold 10): 0.7722


In [None]:

# ============================================================
# 5. Report Final Results
# ============================================================
mean_acc = np.mean(balanced_accuracies)
std_acc = np.std(balanced_accuracies)
print("\n===========================================")
print("Parent Paper Small CNN Replication Results")
print(f"Mean Balanced Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")
print("===========================================")

In [2]:
!git config --global user.email "nam5820@psu.edu"


In [3]:
!git config --global user.name "Nathan Mannings"

In [4]:
!git clone https://github.com/nlmannings/DS340W-Project.git

Cloning into 'DS340W-Project'...
