For a run-through of the general methodology, refer to "general_training_methodology.md".





Let's import the libraries we'll be needing:

In [None]:
#cross-validation & tuning
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

#models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

#metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

#dimensionality-reduction & tools
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

Let's load our data and define our features and target. The 'label' column is the encoded output (0 for healthy control, 1 for Parkinson's disease). We'll then split the data into train and test sets, holding out 20% as a test set (stratified by label):

In [None]:
# Location of the feature table (a Google Drive path, as mounted under /content/drive).
csv_path = "/content/drive/MyDrive/openSMILE_GeMAPSv01b.csv"
df = pd.read_csv(csv_path)

# Everything except the target and the Sex / ID / Age columns is used as a
# model input (presumably the acoustic features - confirm against the CSV).
non_feature_columns = ['label', 'Sex', 'ID', 'Age']
X = df.drop(columns=non_feature_columns)
y = df['label']

# 80/20 split, stratified on the label so both parts keep the class balance;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

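Parameter grid definition: for each model, the hyperparameter values (and the number of PCA components) that the grid search will explore.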

In [8]:
# Candidate numbers of principal components; the same list is searched for every model.
pca_components = [5, 10, 15, 20, 25, 30]

# Model-specific hyperparameter candidates, keyed by the short model name.
# Keys use the "<step>__<parameter>" form so they address the 'model' step
# of the pipeline the grid search is run on.
model_search_spaces = {
    "lr": {
        "model__C": [0.01, 0.1, 1, 10],
        "model__penalty": ["l2"],
        "model__solver": ["lbfgs"],
    },
    "svm": {
        "model__C": [0.1, 1, 10],
        "model__kernel": ["linear", "rbf"],
        "model__gamma": ["scale", "auto"],
    },
    "rf": {
        "model__n_estimators": [50, 100],
        "model__max_depth": [3, 5],
        "model__min_samples_split": [5, 10],
        "model__min_samples_leaf": [1, 5],
    },
    "xgb": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [3, 6, 10],
        "model__learning_rate": [0.01, 0.1, 0.2],
    },
    "knn": {
        "model__n_neighbors": [3, 5, 7, 9],
        "model__weights": ["uniform", "distance"],
    },
    "gb": {
        "model__n_estimators": [50, 100],
        "model__max_depth": [3, 5],
        "model__learning_rate": [0.05, 0.1],
    },
}

# Final grid: each model's own candidates, preceded by its own copy of the
# PCA candidates (a copy, so the per-model lists stay independent objects).
param_grid = {
    name: {"pca__n_components": list(pca_components), **search_space}
    for name, search_space in model_search_spaces.items()
}

We now define our models dictionary, keyed by the same short names used in the parameter grid.

In [9]:
# Single seed passed to every estimator that takes a random_state.
model_seed = 42

# Base estimators, keyed by short model name. The hyperparameters listed in
# param_grid are overridden during the grid search; everything set here stays fixed.
models = {
    "lr": LogisticRegression(max_iter=1000, random_state=model_seed),
    "rf": RandomForestClassifier(random_state=model_seed),
    # probability=True enables predict_proba on the SVM
    "svm": SVC(random_state=model_seed, probability=True),
    "xgb": XGBClassifier(
        eval_metric='logloss',
        colsample_bytree=0.8,
        gamma=0,
        min_child_weight=1,
        subsample=0.6,
        random_state=model_seed,
    ),
    "knn": KNeighborsClassifier(),
    "gb": GradientBoostingClassifier(random_state=model_seed),
}

Training and Validation.

In [None]:
# Cross-validation scheme used for every model: 8 stratified folds repeated
# 15 times, with a fixed seed so each grid search is scored on the same splits.
cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=15, random_state=123)
results = []  # one dict per model: test metrics, CV/train AUC and best params
best_models = {}  # model name -> pipeline refit on X_train with the best params
# Kept in sync with `results` after every model, so that interrupting this
# (long-running) cell still leaves a usable table of the models finished so far.
results_df = pd.DataFrame(results)

for model_name, params in param_grid.items():
    print(f"Running grid search for {model_name}")

    # One pipeline per model: scale -> PCA -> classifier. The grid search
    # tunes the 'pca' and 'model' steps through the "<step>__<param>" keys.
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),  # feature scaling
        ('pca', PCA()),  # dimensionality reduction (n_components comes from the grid)
        ('model', models[model_name]),
    ])

    grid = GridSearchCV(
        pipeline,  # estimator to tune (implements fit and predict)
        params,  # candidate values to test
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,  # parallel computing, use all cores
        verbose=1,
        return_train_score=True,  # train scores are recorded below next to the CV scores
    )

    grid.fit(X_train, y_train)

    # Pipeline refit with the best parameter combination
    best_pipeline = grid.best_estimator_
    best_idx = grid.best_index_
    cv_results = grid.cv_results_

    # Predictions on the hold-out set; column 1 of predict_proba is the
    # probability of class 1
    y_pred = best_pipeline.predict(X_test)
    y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]

    # Metrics on the test set.
    # NOTE: the hold-out set is only 20% of the data, so these are noisy;
    # the cross-validated AUC stored below is the steadier basis for
    # comparing models.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    test_auc = roc_auc_score(y_test, y_pred_proba)

    best_models[model_name] = best_pipeline
    results.append({
        'model': model_name,
        'test_accuracy': accuracy,
        'test_precision': precision,
        'test_recall': recall,
        'test_f1': f1,
        'test_auc': test_auc,
        'best_params': grid.best_params_,
        # Scores of the winning candidate across the CV splits; a train AUC far
        # above the CV AUC points to overfitting.
        'mean_cv_auc': grid.best_score_,
        'std_cv_auc': cv_results['std_test_score'][best_idx],
        'mean_train_auc': cv_results['mean_train_score'][best_idx],
    })
    results_df = pd.DataFrame(results)

    # display best parameters
    print(f"\nBest parameters for {model_name}:")
    for param, value in grid.best_params_.items():
        print(f"  {param}: {value}\n")

    # display classification report
    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test, y_pred))

    # display AUC
    print(f"\nTest AUC for {model_name}: {test_auc:.4f}")


Running grid search for lr
Fitting 65 folds for each of 24 candidates, totalling 1560 fits

Best parameters for lr:
  model__C: 0.1

  model__penalty: l2

  model__solver: lbfgs

  pca__n_components: 30


Classification Report for lr:
              precision    recall  f1-score   support

           0       0.70      0.78      0.74         9
           1       0.71      0.62      0.67         8

    accuracy                           0.71        17
   macro avg       0.71      0.70      0.70        17
weighted avg       0.71      0.71      0.70        17


Test AUC for lr: 0.8333
Running grid search for svm
Fitting 65 folds for each of 72 candidates, totalling 4680 fits

Best parameters for svm:
  model__C: 0.1

  model__gamma: scale

  model__kernel: linear

  pca__n_components: 30


Classification Report for svm:
              precision    recall  f1-score   support

           0       0.78      0.78      0.78         9
           1       0.75      0.75      0.75         8

    accur

KeyboardInterrupt: 

In [None]:
# Display the models ordered from highest to lowest test-set AUC
ranking_columns = ['model', 'test_auc', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1']
ranked_by_test_auc = results_df.sort_values('test_auc', ascending=False)

print(f"\n{'='*80}")
print("MODELS RANKED BY AUC SCORE")
print(ranked_by_test_auc[ranking_columns].to_string(index=False))


MODELS RANKED BY AUC SCORE
model  test_auc  test_accuracy  test_precision  test_recall  test_f1
   lr  0.833333       0.705882        0.706723     0.705882 0.703818
  svm  0.798611       0.764706        0.764706     0.764706 0.764706
   rf  0.708333       0.647059        0.650624     0.647059 0.639496
  xgb  0.694444       0.705882        0.706723     0.705882 0.703818


Calculating the mean cross-validated AUC of each tuned model on the training set.

In [None]:
# Same cross-validation scheme as the grid search (identical seed, so identical splits)
cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=15, random_state=123)
cv_auc_scores = {}  # model name -> {'mean_cv_auc': ..., 'std_cv_auc': ...}

for model_name, best_pipeline in best_models.items():
    # The tuned pipeline is passed directly: cross_val_score fits a fresh clone
    # on every split, so the fitted pipeline in best_models is left untouched.
    # Rebuilding a pipeline around models[model_name] and calling set_params on
    # it would instead overwrite the hyperparameters of that shared estimator.
    scores = cross_val_score(
        best_pipeline,
        X_train,
        y_train,
        scoring='roc_auc',
        cv=cv,
        n_jobs=-1,
    )

    cv_auc_scores[model_name] = {
        'mean_cv_auc': scores.mean(),
        'std_cv_auc': scores.std(),
    }

    print(f"{model_name}: Mean CV AUC = {scores.mean():.4f} (±{scores.std():.4f})")

# add CV AUC metrics to results_df so we can access them later
# (a model missing from cv_auc_scores gets NaN)
results_df['mean_cv_auc'] = results_df['model'].map(
    {name: stats['mean_cv_auc'] for name, stats in cv_auc_scores.items()}
)
results_df['std_cv_auc'] = results_df['model'].map(
    {name: stats['std_cv_auc'] for name, stats in cv_auc_scores.items()}
)

print(f"\n{'='*80}")
print("MODELS RANKED BY MEAN CV AUC")
print(results_df.sort_values('mean_cv_auc', ascending=False)[['model', 'mean_cv_auc', 'std_cv_auc', 'test_auc']].to_string(index=False))

lr: Mean CV AUC = 0.7145 (±0.1705)
svm: Mean CV AUC = 0.6599 (±0.1857)
rf: Mean CV AUC = 0.6659 (±0.1653)
xgb: Mean CV AUC = 0.6382 (±0.1733)

MODELS RANKED BY MEAN CV AUC
model  mean_cv_auc  std_cv_auc  test_auc
   lr     0.714543    0.170452  0.833333
   rf     0.665865    0.165307  0.708333
  svm     0.659856    0.185653  0.798611
  xgb     0.638221    0.173273  0.694444
