# THIS NOTEBOOK MUST BE RUN LOCALLY ON A MAC FOR ALL DEPENDENCIES TO INSTALL CORRECTLY

### Run this to install the pyenv version manager

In [None]:
# NOTE(review): pyenv's own documentation installs it on macOS through Homebrew
# (`brew install pyenv`), and the `pyenv virtualenv` command used further down comes from
# the separate pyenv-virtualenv plugin. Confirm that this pip package really provides a
# working `pyenv` CLI on your machine before relying on it.
!pip3 install pyenv


### Once pyenv has installed successfully, install the required Python version with

In [None]:
# Install CPython 3.11.9 through pyenv; a later cell calls this interpreter by its
# explicit path (~/.pyenv/versions/3.11.9/bin/python3).
!pyenv install 3.11.9

### Then create a virtual environment and activate it

In [None]:
# Create a virtualenv named "group48" on top of the pyenv-managed 3.11.9 interpreter.
!pyenv virtualenv 3.11.9 group48
# NOTE(review): each `!` line is handed to its own shell process, so this activation
# presumably does not carry over to later cells or to the notebook kernel — verify, and
# select the group48 interpreter as the notebook kernel if that is the intent.
!pyenv activate group48



Collecting virtualenv
  Downloading virtualenv-20.35.4-py3-none-any.whl.metadata (4.6 kB)
Collecting distlib<1,>=0.3.7 (from virtualenv)
  Downloading distlib-0.4.0-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting filelock<4,>=3.12.2 (from virtualenv)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Downloading virtualenv-20.35.4-py3-none-any.whl (6.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m6.0 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0m
[?25hDownloading distlib-0.4.0-py2.py3-none-any.whl (469 kB)
Downloading filelock-3.20.0-py3-none-any.whl (16 kB)
Installing collected packages: distlib, filelock, virtualenv
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [virtualenv]3[0m [virtualenv]
[1A[2KSuccessfully installed distlib-0.4.0 filelock-3.20.0 virtualenv-20.35.4
created virtual environment CPython3.11.9.final.0-64 in 873ms
  creator CPython3Posix(dest=/Users/nate/.pyenv/versions/3.11.9/envs/aaa

### Now you should be able to install the necessary packages under the Python binary that was just created, using

In [None]:
!pip3 install numpy pandas scikit-learn tensorflow pillow

In [None]:
# Alternative to the cell above: run pip through the pyenv-built interpreter by explicit path.
# NOTE(review): this path is the base 3.11.9 install; the virtualenv output earlier in the
# notebook shows environments being created under ~/.pyenv/versions/3.11.9/envs/<name>, so
# this presumably does not install into the group48 virtualenv — confirm which interpreter
# the notebook kernel actually uses.
!~/.pyenv/versions/3.11.9/bin/python3 -m pip install numpy pandas scikit-learn tensorflow pillow

In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing import image
from tensorflow.keras import layers, models, regularizers, callbacks, optimizers
import tensorflow as tf

In [None]:




# ---- Search space --------------------------------------------------------
# Square kernel sizes; the grid search tries every pairing for the two conv layers.
KERNEL_SIZES = [(side, side) for side in (3, 5, 7)]

# ---- Cross-validation and training settings -------------------------------
RANDOM_SEED = 42   # seeds the dataset shuffle and the stratified fold assignment
N_SPLITS = 5       # number of stratified K-fold splits
EPOCHS = 10        # upper bound on epochs per fold (early stopping may end sooner)
BATCH_SIZE = 16
DROPOUT_P = 0.1    # rate handed to both Dropout layers of the model
VERBOSE = 1        # verbosity level passed to model.fit

# ---- Data, labels and output ----------------------------------------------
TARGET_SIZE = (224, 224)  # target_size handed to image.load_img for every picture
POS_LABEL = 1             # schizophrenia is treated as the positive class in the metrics
RESULTS_CSV = "kernel_search_results.csv"

# Image directories; both names of each pair refer to the same folder.
healthy_folder = healthy_save_path = './Data/healthy'
schizophrenia_folder = schizophrenia_save_path = './Data/schizophrenic'


In [None]:

# Load & preprocess images (your loader adapted)
def load_images(folder, label_for_folder):
    """Read every PNG in ``folder`` and pair each one with the same class label.

    Files are visited in sorted filename order; the ".png" extension check is
    case-insensitive. Each picture is loaded with ``target_size=TARGET_SIZE``,
    converted to an array and divided by 255.0.

    Args:
        folder: path of the directory holding the images of one class.
        label_for_folder: label assigned to every image found in ``folder``.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays with one entry per PNG file.

    Raises:
        FileNotFoundError: if ``folder`` does not exist.
        ValueError: if ``folder`` contains no PNG files.
    """
    if not os.path.exists(folder):
        raise FileNotFoundError(f"Folder not found: {folder}")

    # Decide up front which files qualify, so the "nothing to load" case is explicit.
    png_names = [name for name in sorted(os.listdir(folder))
                 if name.lower().endswith(".png")]
    if not png_names:
        raise ValueError(f"No PNG images found in folder: {folder}")

    pixel_arrays = []
    for name in png_names:
        picture = image.load_img(os.path.join(folder, name), target_size=TARGET_SIZE)
        pixel_arrays.append(image.img_to_array(picture) / 255.0)

    # One identical label per loaded image.
    label_list = [label_for_folder] * len(pixel_arrays)
    return np.array(pixel_arrays), np.array(label_list)

# Read each class from its own folder: label 0 = healthy controls, label 1 = schizophrenia.
print("Loading healthy images...")
X_healthy, y_healthy = load_images(healthy_folder, 0)
print("Loading schizophrenia images...")
X_schizophrenia, y_schizophrenia = load_images(schizophrenia_folder, 1)

# Stack both classes, then apply one seeded permutation to images and labels alike
# so that every image stays paired with its label.
X = np.concatenate([X_healthy, X_schizophrenia])
y = np.concatenate([y_healthy, y_schizophrenia])
rng = np.random.RandomState(RANDOM_SEED)
perm = rng.permutation(y.shape[0])
X, y = X[perm], y[perm]

print(f"Loaded dataset: X.shape={X.shape}, y.shape={y.shape}, #HC={np.sum(y==0)}, #SCZ={np.sum(y==1)}")


Loading healthy images...
Loading schizophrenia images...
Loaded dataset: X.shape=(1828, 224, 224, 3), y.shape=(1828,), #HC=948, #SCZ=880


In [None]:


from tensorflow.keras import regularizers

def build_model(kernel_size_1=(3,3), kernel_size_2=(3,3), input_shape=None, dropout_p=0.1):
    """Assemble and compile the two-convolution classifier used by the kernel search.

    Layer order: Conv2D(4 filters, stride 2) -> MaxPool2D(2x2) -> Dropout ->
    Conv2D(8 filters, stride 2, L2 kernel penalty 0.01) -> GlobalMaxPool2D ->
    Dropout -> Dense(50, relu) -> Dense(2, softmax).

    Args:
        kernel_size_1: kernel size of the first convolution.
        kernel_size_2: kernel size of the second convolution.
        input_shape: shape of a single input sample; when None the module-level
            ``X.shape[1:]`` is used.
        dropout_p: rate given to both Dropout layers.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    sample_shape = X.shape[1:] if input_shape is None else input_shape
    inputs = layers.Input(shape=sample_shape)

    # Layers are created in the order they are applied.
    layer_stack = [
        # Block 1: strided convolution, spatial pooling, dropout.
        layers.Conv2D(filters=4, kernel_size=kernel_size_1, strides=(2,2),
                      padding='same', activation='relu'),
        layers.MaxPool2D(pool_size=(2,2)),
        layers.Dropout(dropout_p),
        # Block 2: L2-regularised strided convolution reduced by a global max.
        layers.Conv2D(filters=8, kernel_size=kernel_size_2, strides=(2,2),
                      padding='same', activation='relu',
                      kernel_regularizer=regularizers.l2(0.01)),
        layers.GlobalMaxPool2D(),
        layers.Dropout(dropout_p),
        # Classifier head: two-way softmax to match the one-hot encoded labels.
        layers.Dense(50, activation='relu'),
        layers.Dense(2, activation='softmax'),
    ]

    tensor = inputs
    for layer in layer_stack:
        tensor = layer(tensor)

    network = models.Model(inputs=inputs, outputs=tensor)
    network.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return network

# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
# This stays best-effort: if the setting cannot be applied, the reason is printed and the
# notebook carries on with TensorFlow's default allocation instead of failing silently.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for g in gpus:
            tf.config.experimental.set_memory_growth(g, True)
    except Exception as exc:
        print(f"Warning: could not enable GPU memory growth ({exc}); using default allocation.")



In [None]:

# Grid search
from sklearn.model_selection import StratifiedKFold
# Stratified K-fold with a fixed random_state, so the fold assignment is repeatable
# and every kernel combination is evaluated on the same splits.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

all_results = []

# Every (conv1, conv2) pairing of the candidate kernel sizes.
kernel_grid = [(a, b) for a in KERNEL_SIZES for b in KERNEL_SIZES]
total_runs = len(kernel_grid)

for run_counter, (ks1, ks2) in enumerate(kernel_grid, start=1):
    print(f"\n=== Kernel run {run_counter}/{total_runs}: conv1={ks1}, conv2={ks2} ===")

    # Per-fold metrics for this kernel combination
    fold_bal_acc = []
    fold_prec = []
    fold_rec = []
    fold_f1 = []
    fold_epochs = []

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        print(f"  Fold {fold_idx}/{N_SPLITS}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # The model ends in a 2-unit softmax trained with categorical cross-entropy,
        # so the integer labels are one-hot encoded.
        y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=2)
        y_val_cat = tf.keras.utils.to_categorical(y_val, num_classes=2)

        # Re-seed Python, NumPy and TensorFlow before each model is built. Without this,
        # RANDOM_SEED only controls the data shuffle and the folds, while weight
        # initialisation, dropout and batch order change on every run, so the kernel
        # comparison would not be reproducible.
        tf.keras.utils.set_random_seed(RANDOM_SEED)

        # Fresh model for this fold
        model = build_model(kernel_size_1=ks1, kernel_size_2=ks2, input_shape=X_train.shape[1:], dropout_p=DROPOUT_P)

        # NOTE: the validation fold drives early stopping (and the restored weights) and is
        # also the data the metrics below are computed on, so the scores are optimistic.
        es = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=0)

        history = model.fit(
            X_train, y_train_cat,
            validation_data=(X_val, y_val_cat),
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            callbacks=[es],
            verbose=VERBOSE
        )

        # Number of epochs actually run (can be fewer than EPOCHS due to early stopping)
        fold_epochs.append(len(history.history['loss']))

        # Predicted class = index of the larger softmax output
        y_pred_prob = model.predict(X_val, verbose=0)
        y_pred = np.argmax(y_pred_prob, axis=1)

        # Compute metrics (binary averaging; positive label = POS_LABEL)
        bal_acc = balanced_accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, pos_label=POS_LABEL, zero_division=0)
        rec = recall_score(y_val, y_pred, pos_label=POS_LABEL, zero_division=0)
        f1 = f1_score(y_val, y_pred, pos_label=POS_LABEL, zero_division=0)

        print(f"    fold bal_acc={bal_acc:.4f}, prec={prec:.4f}, rec={rec:.4f}, f1={f1:.4f}")

        fold_bal_acc.append(bal_acc)
        fold_prec.append(prec)
        fold_rec.append(rec)
        fold_f1.append(f1)

        # Reset Keras' global state between folds to limit memory growth
        tf.keras.backend.clear_session()

    # Aggregate per kernel combo (mean + std across folds)
    result = {
        "kernel_size_conv1": f"{ks1[0]}x{ks1[1]}",
        "kernel_size_conv2": f"{ks2[0]}x{ks2[1]}",
        "bal_acc_mean": float(np.mean(fold_bal_acc)),
        "bal_acc_std": float(np.std(fold_bal_acc)),
        "precision_mean": float(np.mean(fold_prec)),
        "precision_std": float(np.std(fold_prec)),
        "recall_mean": float(np.mean(fold_rec)),
        "recall_std": float(np.std(fold_rec)),
        "f1_mean": float(np.mean(fold_f1)),
        "f1_std": float(np.std(fold_f1)),
        "mean_train_epochs": float(np.mean(fold_epochs))
    }
    all_results.append(result)

# Rank the combinations by mean F1 (best first) and save to disk
df = pd.DataFrame(all_results)
df = df.sort_values(by="f1_mean", ascending=False).reset_index(drop=True)
df.to_csv(RESULTS_CSV, index=False)
print(f"\nGrid search complete. Aggregated results saved to: {RESULTS_CSV}")
print(df)



=== Kernel run 1/9: conv1=(3, 3), conv2=(3, 3) ===
  Fold 1/5
Epoch 1/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 64ms/step - accuracy: 0.5410 - loss: 0.7276 - val_accuracy: 0.6776 - val_loss: 0.7038
Epoch 2/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 41ms/step - accuracy: 0.6347 - loss: 0.6526 - val_accuracy: 0.6721 - val_loss: 0.6784
Epoch 3/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - accuracy: 0.6648 - loss: 0.6112 - val_accuracy: 0.6967 - val_loss: 0.5960
Epoch 4/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 39ms/step - accuracy: 0.6908 - loss: 0.5732 - val_accuracy: 0.6694 - val_loss: 0.6507
Epoch 5/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.6970 - loss: 0.5626 - val_accuracy: 0.6694 - val_loss: 0.6445
Epoch 6/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - accuracy: 0.7031 - loss: 0.5570 - val_accura