In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

In [2]:
relevant_columns = [
    "main.disorder",  # Target variable

    # Schizophrenia (alpha PSD)
    "AB.C.alpha.a.FP1", "AB.C.alpha.b.FP2", "AB.C.alpha.c.F7", "AB.C.alpha.d.F3",
    "AB.C.alpha.e.Fz", "AB.C.alpha.f.F4", "AB.C.alpha.g.F8", "AB.C.alpha.h.T3",
    "AB.C.alpha.i.C3", "AB.C.alpha.j.Cz", "AB.C.alpha.k.C4", "AB.C.alpha.l.T4",
    "AB.C.alpha.m.T5", "AB.C.alpha.n.P3", "AB.C.alpha.o.Pz", "AB.C.alpha.p.P4",
    "AB.C.alpha.q.T6", "AB.C.alpha.r.O1", "AB.C.alpha.s.O2",

    # Trauma and stress-related disorders (beta FC)
    "COH.D.beta.a.FP1.b.FP2", "COH.D.beta.c.F7.d.F3", "COH.D.beta.e.Fz.f.F4",
    "COH.D.beta.g.F8.h.T3", "COH.D.beta.i.C3.j.Cz", "COH.D.beta.k.C4.l.T4",
    "COH.D.beta.m.T5.n.P3", "COH.D.beta.o.Pz.p.P4", "COH.D.beta.q.T6.r.O1",

    # Anxiety disorders (whole band PSD: all frequency bands)
    "AB.A.delta.a.FP1", "AB.A.delta.b.FP2", "AB.B.theta.a.FP1", "AB.B.theta.b.FP2",
    "AB.C.alpha.a.FP1", "AB.C.alpha.b.FP2", "AB.D.beta.a.FP1", "AB.D.beta.b.FP2",
    "AB.E.highbeta.a.FP1", "AB.E.highbeta.b.FP2", "AB.F.gamma.a.FP1", "AB.F.gamma.b.FP2",

    # Mood disorders (theta FC)
    "COH.B.theta.a.FP1.b.FP2", "COH.B.theta.c.F7.d.F3", "COH.B.theta.e.Fz.f.F4",
    "COH.B.theta.g.F8.h.T3", "COH.B.theta.i.C3.j.Cz", "COH.B.theta.k.C4.l.T4",
    "COH.B.theta.m.T5.n.P3", "COH.B.theta.o.Pz.p.P4", "COH.B.theta.q.T6.r.O1",

    # Addictive disorders (theta PSD)
    "AB.B.theta.a.FP1", "AB.B.theta.b.FP2", "AB.B.theta.c.F7", "AB.B.theta.d.F3",
    "AB.B.theta.e.Fz", "AB.B.theta.f.F4", "AB.B.theta.g.F8", "AB.B.theta.h.T3",
    "AB.B.theta.i.C3", "AB.B.theta.j.Cz", "AB.B.theta.k.C4", "AB.B.theta.l.T4",
    "AB.B.theta.m.T5", "AB.B.theta.n.P3", "AB.B.theta.o.Pz", "AB.B.theta.p.P4",
    "AB.B.theta.q.T6", "AB.B.theta.r.O1", "AB.B.theta.s.O2",

    # Obsessive-compulsive disorder (gamma FC)
    "COH.F.gamma.a.FP1.b.FP2", "COH.F.gamma.c.F7.d.F3", "COH.F.gamma.e.Fz.f.F4",
    "COH.F.gamma.g.F8.h.T3", "COH.F.gamma.i.C3.j.Cz", "COH.F.gamma.k.C4.l.T4",
    "COH.F.gamma.m.T5.n.P3", "COH.F.gamma.o.Pz.p.P4", "COH.F.gamma.q.T6.r.O1"
]


In [3]:
# Load dataset with only relevant columns
data = pd.read_csv("data/train.csv", usecols=relevant_columns)

# Filter only Mood Disorder and Healthy Control
data = data[data["main.disorder"].isin(["Healthy control", "Mood disorder"])]

# Encode labels manually: Healthy Control -> 0, Mood Disorder -> 1
data["main.disorder"] = data["main.disorder"].map({
    "Healthy control": 0,
    "Mood disorder": 1
})

# Features and labels
X = data.drop(columns=["main.disorder"]).values
y = data["main.disorder"].values.astype(int)  # Ensure integer labels

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler
joblib.dump(scaler, "svm_mood_scaler.pkl")

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [4]:
# ---------------- Hyperparameter Tuning (Grid Search) ----------------
param_grid = {
    "C": [0.1, 0.5, 1, 5, 10],  # Regularization parameter
    "kernel": ["linear", "rbf"],  # Linear and RBF kernel
}

svm = SVC(probability=True, class_weight="balanced", random_state=42)

grid_search = GridSearchCV(svm, param_grid, cv=5, scoring="roc_auc", n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model
best_svm = grid_search.best_estimator_

# Save the trained SVM model
joblib.dump(best_svm, "svm_mood_disorder_tuned.pkl")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


['svm_mood_disorder_tuned.pkl']

In [5]:
# ---------------- Evaluate Best Model ----------------
y_pred = best_svm.predict(X_test)
y_pred_prob = best_svm.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_prob)

print(f"\n Tuned SVM Model for Mood Disorder vs. Healthy Control")
print(f" Best Parameters: {grid_search.best_params_}")
print(f" Accuracy: {accuracy:.4f}")
print(f" AUC Score: {auc:.4f}")


 Tuned SVM Model for Mood Disorder vs. Healthy Control
 Best Parameters: {'C': 5, 'kernel': 'rbf'}
 Accuracy: 0.6324
 AUC Score: 0.6190


In [6]:
def predict_mood_disorder(new_data_path):
    """
    Loads new EEG data, scales it, and predicts Mood Disorder probability.
    """
    # Load new data
    new_data = pd.read_csv(new_data_path, usecols=relevant_columns[1:])  # Exclude "main.disorder"
    
    # Load saved scaler and model
    scaler = joblib.load("svm_mood_scaler.pkl")
    svm = joblib.load("svm_mood_disorder_tuned.pkl")

    # Load new data
    new_data = pd.read_csv(new_data_path, usecols=relevant_columns[1:])  # Exclude "main.disorder"

# Convert DataFrame to NumPy before applying StandardScaler
    new_data_scaled = scaler.transform(new_data.values)  # <-- FIXED


    # Predict probabilities
    predictions = svm.predict_proba(new_data_scaled)[:, 1]  # Probability of Mood Disorder

    return predictions

In [7]:
preds = predict_mood_disorder("data.csv")


In [8]:
from sklearn.metrics import roc_curve

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Find the optimal threshold (Youden’s J statistic: max(TPR - FPR))
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

print(f"🔹 Optimal Decision Threshold: {optimal_threshold:.4f}")

# Convert probabilities to labels using the optimal threshold
pred_labels = (y_pred_prob >= optimal_threshold).astype(int)


🔹 Optimal Decision Threshold: 0.7400


In [9]:
print(preds)

[0.80284479 0.71326441 0.80601528 0.72000826 0.62107228 0.65523013
 0.68912596 0.7188927  0.81573845 0.57650467 0.78639811 0.7830009
 0.52469755 0.74461155 0.65940643 0.83890919 0.72688012 0.72961519
 0.72143543 0.8370777  0.73296879 0.77077455 0.75014044 0.82070822
 0.8565958  0.57917032 0.64450552 0.80605042 0.72240639 0.69297869
 0.5398011  0.873065   0.67479468 0.85300638 0.7125315  0.73745027
 0.83342636 0.67288709 0.78033434 0.68679918 0.69852461 0.66770325
 0.60505112 0.80208643 0.81836087 0.73299376 0.72688461 0.77612505
 0.62999582 0.69297666 0.73081949 0.7593808  0.660928   0.78844193
 0.60690609 0.62724485 0.85915603 0.73187531 0.61616016 0.68980373
 0.70473059 0.5770585  0.65627269 0.69012829 0.75953479 0.79585003
 0.62269323 0.75302305 0.64654744 0.61686774 0.75325727 0.81674016
 0.60733192 0.87339636 0.62768524 0.74141004 0.79484153 0.77264536
 0.70173254 0.79404162 0.66918506 0.80350843 0.68482846 0.80137758
 0.45574204 0.73694254 0.70647883 0.77762778 0.62984611 0.71469

In [10]:
def decode_predictions(preds):
    """
    Convert predictions (0,1) back to disorder labels.
    """
    label_map = {0: "Healthy control", 1: "Mood disorder"}
    return [label_map[pred] for pred in preds]

# Example usage
pred_labels = (preds >= 0.7400).astype(int)  # Convert probabilities to binary labels
decoded_labels = decode_predictions(pred_labels)

print("\nPredicted Labels:\n", decoded_labels)


Predicted Labels:
 ['Mood disorder', 'Healthy control', 'Mood disorder', 'Healthy control', 'Healthy control', 'Healthy control', 'Healthy control', 'Healthy control', 'Mood disorder', 'Healthy control', 'Mood disorder', 'Mood disorder', 'Healthy control', 'Mood disorder', 'Healthy control', 'Mood disorder', 'Healthy control', 'Healthy control', 'Healthy control', 'Mood disorder', 'Healthy control', 'Mood disorder', 'Mood disorder', 'Mood disorder', 'Mood disorder', 'Healthy control', 'Healthy control', 'Mood disorder', 'Healthy control', 'Healthy control', 'Healthy control', 'Mood disorder', 'Healthy control', 'Mood disorder', 'Healthy control', 'Healthy control', 'Mood disorder', 'Healthy control', 'Mood disorder', 'Healthy control', 'Healthy control', 'Healthy control', 'Healthy control', 'Mood disorder', 'Mood disorder', 'Healthy control', 'Healthy control', 'Mood disorder', 'Healthy control', 'Healthy control', 'Healthy control', 'Mood disorder', 'Healthy control', 'Mood disorder