In [2]:
import numpy as np
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    cross_val_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    classification_report
)
import os

In [4]:
processed_dir = "../data/processed"
X_path = os.path.join(processed_dir, "X.npy")
y_path = os.path.join(processed_dir, "y.npy")

# 1) Load data
X = np.load(X_path)   # shape (N,64,64,1)
y = np.load(y_path)   # shape (N,)

# 2) Flatten & scale
N = X.shape[0]
X_flat = X.reshape(N, -1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_flat)

# 3) CV helper
def run_cv(model):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accs = cross_val_score(model, X_scaled, y, cv=cv, scoring="accuracy")
    aucs = cross_val_score(model, X_scaled, y, cv=cv, scoring="roc_auc")
    return accs, aucs

# 4) Define baselines
classifiers = {
    "lr": LogisticRegression(
        penalty="l2", solver="liblinear", C=1.0,
        max_iter=1000, class_weight="balanced"
    ),
    "rbf_svm": SVC(
        kernel="rbf", probability=True, C=1.0,
        gamma="scale", class_weight="balanced"
    ),
    "knn": KNeighborsClassifier(n_neighbors=5),
    "rf": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "nb": GaussianNB(),
}

# Ensure output dir
os.makedirs("../results", exist_ok=True)

# 5) Run CV, save per-fold acc & AUC
for name, model in classifiers.items():
    accs, aucs = run_cv(model)
    np.save(f"../results/{name}_cv_accs.npy", accs)
    np.save(f"../results/{name}_cv_aucs.npy", aucs)

# 6) Hold-out split
X_tr, X_te, y_tr, y_te = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)
np.save("../results/y_test.npy", y_te)

# 7) Fit on train, save preds & probs on test
for name, model in classifiers.items():
    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)
    np.save(f"../results/{name}_preds.npy", preds)
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_te)[:, 1]
    else:
        probs = model.decision_function(X_te)
    np.save(f"../results/{name}_probs.npy", probs)

# 8) Summary
for name in classifiers:
    preds = np.load(f"../results/{name}_preds.npy")
    probs = np.load(f"../results/{name}_probs.npy")
    acc = (preds == y_te).mean()
    auc = roc_auc_score(y_te, probs)
    print(f"{name}: Hold-out Acc={acc:.3f}, AUC={auc:.3f}")


lr: Hold-out Acc=0.714, AUC=0.678
rbf_svm: Hold-out Acc=0.810, AUC=0.785
knn: Hold-out Acc=0.825, AUC=0.862
rf: Hold-out Acc=0.825, AUC=0.767
nb: Hold-out Acc=0.794, AUC=0.727
