# 08_Deep_Learning

- 2025-09-08
- Aim to establish optimal model scores for comparison to SVM


In [None]:
import os
import pandas as pd
import numpy as np
from pca import pca
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.base import clone

from random import randint
from pqdm.threads import pqdm

In [None]:
# Load the pickled dataset. A missing file is reported immediately with the
# path that was looked up, instead of leaving data_df as None and failing
# later on an unrelated AttributeError.
# NOTE(review): pd.read_pickle executes arbitrary pickle payloads; only load
# files from a trusted source.
data_path = "data/data.pkl"
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Expected dataset at {data_path!r} "
        f"(resolved to {os.path.abspath(data_path)!r})"
    )
data_df = pd.read_pickle(data_path)

# Preview the first rows as a markdown table.
print(data_df.head().to_markdown())

# Shared seed for estimators that accept a random_state.
RANDOM_STATE = 42

In [None]:
# Scale and load data
# AD vs. HC cohort: keep rows with one of the two diagnoses and a non-null
# "Harmonized" entry.
ad_df = data_df[
    data_df["Diagnosis"].isin(["AD", "HC"]) & (data_df["Harmonized"].notna())
].copy()

# Stack the per-row "EVC" entries into one array and encode AD=1, HC=0.
x_ad = np.vstack(ad_df["EVC"].values)
y_ad = ad_df["Diagnosis"].map({"AD": 1, "HC": 0}).values

# POS vs. NEG cohort, selected the same way and encoded POS=1, NEG=0.
# test_alpha reads x_tbi / y_tbi from module scope, so they must exist here;
# leaving them undefined makes every test_alpha call raise NameError.
tbi_df = data_df[
    data_df["Diagnosis"].isin(["NEG", "POS"]) & (data_df["Harmonized"].notna())
].copy()
x_tbi = np.vstack(tbi_df["EVC"].values)
y_tbi = tbi_df["Diagnosis"].map({"POS": 1, "NEG": 0}).values

# Scale X
# The scaler is fitted on the AD/HC features only; the TBI features are
# transformed with those same statistics.
scaler = StandardScaler()
scaler.fit(x_ad)
x_ad = scaler.transform(x_ad)
x_tbi = scaler.transform(x_tbi)

### PCA


In [None]:
# Obtain PCA
# Fit the pca model on the scaled AD/HC features and keep the returned
# results object for later inspection.
ad_pca = pca()
x_ad_pca = ad_pca.fit_transform(x_ad)

# Draw the model's default plot, then list the "topfeat" table as markdown.
ad_pca.plot()
top_features = x_ad_pca["topfeat"]
print(top_features.to_markdown())

In [None]:
# 3-D biplot of the fitted PCA model with the legend switched off.
# NOTE(review): n_feat=10 presumably limits the plot to 10 feature loadings;
# confirm against the pca package documentation.
ad_pca.biplot3d(n_feat=10, legend=False)

### MLP Classification


In [None]:
# Extremely basic MLP for overfit
# The network is fitted and scored on the very same samples, so the value
# shown is a training score, not a generalisation estimate.
clf = MLPClassifier(
    hidden_layer_sizes=(64, 32), solver="lbfgs", random_state=RANDOM_STATE
).fit(x_ad, y_ad)
clf.score(x_ad, y_ad)

In [None]:
# Holdout Test with basic MLP
# Test-set fractions 0.05, 0.10, ..., 0.95. linspace fixes the number of
# points and the endpoint explicitly; arange with a float step can gain or
# lose the last element through rounding.
ratios = np.linspace(0.05, 0.95, 19)


def test_ratio(ratio):
    """Score the baseline MLP on ten random holdout splits of the AD/HC data.

    ``ratio`` is forwarded to ``train_test_split`` as ``test_size``. Every
    repetition draws a new split seed with ``randint``, standardises the
    features with statistics fitted on the training part only, fits a
    (64, 32) lbfgs ``MLPClassifier`` and records its ``score`` on the
    held-out part.

    Returns the list of ten scores in the order the splits were drawn.
    """

    def run_once():
        split_seed = randint(1, 100000)
        train_x, test_x, train_y, test_y = train_test_split(
            x_ad, y_ad, random_state=split_seed, test_size=ratio
        )

        # Scaling statistics come from the training part only.
        standardiser = StandardScaler().fit(train_x)

        model = MLPClassifier(
            hidden_layer_sizes=(64, 32), solver="lbfgs", random_state=RANDOM_STATE
        )
        model.fit(standardiser.transform(train_x), train_y)
        return model.score(standardiser.transform(test_x), test_y)

    return [run_once() for _ in range(10)]


# Evaluate every test-size fraction on a small thread pool.
# pqdm's default exception_behaviour ("ignore") returns exceptions raised by
# workers as ordinary results, which would only surface later as an obscure
# failure inside np.mean. "immediate" re-raises the real error instead.
scores = pqdm(ratios, test_ratio, n_jobs=2, exception_behaviour="immediate")

# Mean score over the repetitions for each fraction.
y = np.mean(scores, axis=1)

In [None]:
# Mean holdout score per test-size fraction, with a band of one standard
# deviation (over the repetitions) on either side.
score_std = np.std(scores, axis=1)

plt.plot(ratios, y, "k-")
plt.fill_between(ratios, y - score_std, y + score_std, alpha=0.5)

plt.title("Effect of Train/Test Size on Model Performance")
plt.xlabel("Train/Test Size Ratio")
plt.ylabel("Model Score")

# Both axes are fractions, so pin them to the unit interval.
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.grid(True)
plt.show()

In [None]:
# Test of alpha, reinvent cv wheel
# Result holders for the AD/HC holdout scores and the TBI scores.
scores, scores_tbi = [], []

# 50 candidate alpha values, geometrically spaced from 1e-8 to 10.
alpha = np.geomspace(1e-8, 10, 50)


def test_alpha(a):
    """Score an MLP with ``alpha=a`` on ten random 80/20 AD/HC splits.

    Every repetition draws a new split seed with ``randint``, standardises
    with statistics fitted on the training part only, fits a (64, 32) lbfgs
    ``MLPClassifier`` (``max_iter=1000``, no fixed ``random_state``) and
    scores it twice: on the held-out AD/HC part and on the TBI data.

    ``x_ad``, ``y_ad``, ``x_tbi`` and ``y_tbi`` are read from module scope
    and must be defined before this function runs.

    Returns a ``(holdout_scores, tbi_scores)`` tuple of two ten-element
    lists, in the order the splits were drawn.
    """

    def run_once():
        split_seed = randint(1, 100000)
        train_x, test_x, train_y, test_y = train_test_split(
            x_ad, y_ad, random_state=split_seed, test_size=0.2
        )

        # The same training-set statistics scale the holdout and TBI data.
        standardiser = StandardScaler().fit(train_x)
        tbi_x = standardiser.transform(x_tbi)

        model = MLPClassifier(
            hidden_layer_sizes=(64, 32), solver="lbfgs", max_iter=1000, alpha=a
        )
        model.fit(standardiser.transform(train_x), train_y)

        holdout_score = model.score(standardiser.transform(test_x), test_y)
        tbi_score = model.score(tbi_x, y_tbi)
        return holdout_score, tbi_score

    trials = [run_once() for _ in range(10)]
    holdout_scores = [holdout for holdout, _ in trials]
    tbi_scores = [tbi for _, tbi in trials]
    return (holdout_scores, tbi_scores)


# Evaluate every alpha value on a thread pool.
# pqdm's default exception_behaviour ("ignore") returns exceptions raised by
# workers as ordinary results, so a failing test_alpha would only show up as
# a confusing unpacking error in zip(*results). "immediate" re-raises the
# real error instead.
results = pqdm(alpha, test_alpha, n_jobs=12, exception_behaviour="immediate")

# Each result is a (holdout_scores, tbi_scores) pair; regroup by cohort.
scores, scores_tbi = zip(*results)

# Mean score over the repetitions for each alpha.
y = np.mean(scores, axis=1)
alpha_y_tbi = np.mean(scores_tbi, axis=1)

In [None]:
# Mean score against alpha (log x-axis) for both cohorts, each with a band
# of one standard deviation (over the repetitions) on either side.
ad_std = np.std(scores, axis=1)
tbi_std = np.std(scores_tbi, axis=1)

plt.semilogx(alpha, y, "k-", label="AD/HC")
plt.fill_between(alpha, y - ad_std, y + ad_std, alpha=0.5, color="black")

plt.semilogx(alpha, alpha_y_tbi, "r-", label="TBI+/TBI-")
plt.fill_between(
    alpha, alpha_y_tbi - tbi_std, alpha_y_tbi + tbi_std, alpha=0.5, color="red"
)

plt.xlabel("MLP Alpha")
plt.ylabel("Model Score")
plt.title("Effect of MLP Alpha Value on Model Performance")
plt.ylim(0, 1)
# plt.xlim(0, 10)
# The first positional argument of plt.grid is `visible`, so "both" has to be
# passed as `which` to get major and minor grid lines on the log axis.
plt.grid(True, which="both")
plt.legend()
plt.show()