for the run over the general methodology, refer to "general_training_methodology.md"





Let's import the libraries we'll be needing:

In [None]:
#cross-validation & tuning
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

#models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

#metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score


#dimensionality-reduction & tools
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE

import pandas as pd
import numpy as np


from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

Let's import our data and define our train and test values. The 'label' column is the encoded output (0 for healthy control, 1 for Parkinson's disease). We'll split out train/test data, leaving a 20% holdout test:

In [9]:
csv_path = '/content/drive/MyDrive/openSMILE_GeMAPSv01b.csv'

df = pd.read_csv(csv_path)

X = df.drop(columns=['label', 'Sex', 'ID'], axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

parameter grid definition.

In [None]:
param_grid = {
    "lr": {
        "rfe__n_features_to_select": [5, 10, 15, 20, 25, 30],
        "model__C": [0.01, 0.1, 1, 10],
        "model__penalty": ["l2"],
        "model__solver": ["lbfgs"]
    },
    "svm": {
        "rfe__n_features_to_select": [5, 10, 15, 20, 25, 30],
        "model__C": [0.1, 1, 10],
        "model__kernel": ["linear", "rbf"],
        "model__gamma": ["scale", "auto"]
    },
    "rf": {
        "rfe__n_features_to_select": [5, 10, 20, 30, 40, 50],
        "model__n_estimators": [50, 100],
        "model__max_depth": [3, 5],
        "model__min_samples_split": [5, 10],
        "model__min_samples_leaf": [1, 5]
    },
    "xgb": {
        "rfe__n_features_to_select": [5, 10, 20, 30, 40, 50, 'all'],
        "model__n_estimators": [100, 200],
        "model__max_depth": [3, 6, 10],
        "model__learning_rate": [0.01, 0.1, 0.2]
    },
    # rfe requires coeff_ or feature importance, kNN doesnt have that
    # "knn": {
    #     "rfe__n_features_to_select": [5, 10, 15, 20, 25, 30],
    #     "model__n_neighbors": [3, 5, 7, 9],
    #     "model__weights": ["uniform", "distance"]
    # },
    "gb": {
        "rfe__n_features_to_select": [ 30, 40, 50, 'all'],
        "model__n_estimators": [50],
        "model__max_depth": [3],
        "model__learning_rate": [0.05]
    }
}

we now define our models array.

In [4]:
models = {
    "lr": LogisticRegression(max_iter=1000, random_state=42),
    "rf": RandomForestClassifier(random_state=42),
    "svm": SVC(random_state=42, probability=True),
    "xgb": XGBClassifier(eval_metric='logloss', colsample_bytree=0.8, gamma=0, min_child_weight=1, subsample=0.6, random_state=42),
    "knn": KNeighborsClassifier(),
    "gb": GradientBoostingClassifier(random_state=42)
}

Training and Validation.

In [None]:
cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=15, random_state=123) #the cross validation method we'll use across models
results = [] #to store metrics, accuracy scores and all other
best_models={}


for model_name, params in param_grid.items():
    print(f"Running grid search for {model_name}")

    base_model = models[model_name]

    # rfe_estimator = LogisticRegression(max_iter=1000, random_state=42) if model_name == "svm" else clone(base_model)

    if model_name in ["lr", "svm"]: #we directly excluded knn
        # we use LR as the estimator
        rfe_estimator = LogisticRegression(max_iter=1000, random_state=42) if model_name == "svm" else clone(base_model)
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('rfe', RFE(estimator=rfe_estimator)),
            ('model', clone(base_model))
        ])
    elif model_name in ["rf", "gb", "xgb"]:
        #no scaling for tree-based scaling
        rfe_estimator = LinearSVC(random_state=42) #test, rfe_base_model vs. rfe_linear_svc
        pipeline = Pipeline(steps=[
            ('rfe', RFE(estimator=rfe_estimator)),
            ('model', clone(base_model))
        ])


    grid = GridSearchCV(
        pipeline, #this implements fit and predict functions
        params, #our params to test
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1, # parallel computing, use all cores
        verbose=1,
        return_train_score=True, # to get training scores as well

    )

    grid.fit(X_train, y_train)

    #make predictions with the best model (meaning with best params)
    y_pred = grid.best_estimator_.predict(X_test)
    y_pred_proba = grid.best_estimator_.predict_proba(X_test)[:, 1]


    # calculate metrics on the test set
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    test_auc = roc_auc_score(y_test, y_pred_proba)


    results.append({
        'model': model_name,
        'test_accuracy': accuracy,
        'test_precision': precision,
        'test_recall': recall,
        'test_f1': f1,
        'test_auc': test_auc,
        'best_params': grid.best_params_,
    })

    best_models[model_name] = grid.best_estimator_

    # display best parameters
    print(f"\nBest parameters for {model_name}:")
    for param, value in grid.best_params_.items():
        print(f"  {param}: {value}\n")

    # display classification report
    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test, y_pred))

    #display AUC
    print(f"\nTest AUC for {model_name}: {test_auc:.4f}")

    #display CV scores for each fold and overall
    print(f"\nCross-validation scores for {model_name}:")
    cv_results = grid.cv_results_
    mean_scores = cv_results['mean_test_score'][grid.best_index_]
    std_scores = cv_results['std_test_score'][grid.best_index_]
    print(f"Overall CV score: {mean_scores:.4f} ± {std_scores:.4f}")



results_df = pd.DataFrame(results)

# dispaly models sorted by auc
print(f"\n{'='*80}")
print("MODELS RANKED BY AUC SCORE")
print(results_df.sort_values('test_auc', ascending=False)[['model', 'test_auc', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1']].to_string(index=False))

Running grid search for lr
Fitting 65 folds for each of 24 candidates, totalling 1560 fits

Best parameters for lr:
  model__C: 0.1

  model__penalty: l2

  model__solver: lbfgs

  rfe__n_features_to_select: 5


Classification Report for lr:
              precision    recall  f1-score   support

           0       0.80      0.89      0.84         9
           1       0.86      0.75      0.80         8

    accuracy                           0.82        17
   macro avg       0.83      0.82      0.82        17
weighted avg       0.83      0.82      0.82        17


Cross-validation scores for lr:
Overall CV score: 0.8507 ± 0.1212
Running grid search for svm
Fitting 65 folds for each of 72 candidates, totalling 4680 fits

Best parameters for svm:
  model__C: 0.1

  model__gamma: scale

  model__kernel: linear

  rfe__n_features_to_select: 5


Classification Report for svm:
              precision    recall  f1-score   support

           0       0.78      0.78      0.78         9
        




Best parameters for rf:
  model__max_depth: 5

  model__min_samples_leaf: 1

  model__min_samples_split: 5

  model__n_estimators: 100

  rfe__n_features_to_select: 30


Classification Report for rf:
              precision    recall  f1-score   support

           0       0.73      0.89      0.80         9
           1       0.83      0.62      0.71         8

    accuracy                           0.76        17
   macro avg       0.78      0.76      0.76        17
weighted avg       0.78      0.76      0.76        17


Cross-validation scores for rf:
Overall CV score: 0.9190 ± 0.0708
Running grid search for xgb
Fitting 65 folds for each of 126 candidates, totalling 8190 fits


1170 fits failed out of a total of 8190.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1170 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/l


Best parameters for xgb:
  model__learning_rate: 0.01

  model__max_depth: 6

  model__n_estimators: 100

  rfe__n_features_to_select: 30


Classification Report for xgb:
              precision    recall  f1-score   support

           0       0.78      0.78      0.78         9
           1       0.75      0.75      0.75         8

    accuracy                           0.76        17
   macro avg       0.76      0.76      0.76        17
weighted avg       0.76      0.76      0.76        17


Cross-validation scores for xgb:
Overall CV score: 0.9426 ± 0.0586
Running grid search for gb
Fitting 65 folds for each of 4 candidates, totalling 260 fits


65 fits failed out of a total of 260.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
65 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/py


Best parameters for gb:
  model__learning_rate: 0.05

  model__max_depth: 3

  model__n_estimators: 50

  rfe__n_features_to_select: 40


Classification Report for gb:
              precision    recall  f1-score   support

           0       0.70      0.78      0.74         9
           1       0.71      0.62      0.67         8

    accuracy                           0.71        17
   macro avg       0.71      0.70      0.70        17
weighted avg       0.71      0.71      0.70        17


Cross-validation scores for gb:
Overall CV score: 0.8400 ± 0.0911

MODELS RANKED BY TEST ACCURACY
model  test_accuracy  test_precision  test_recall  test_f1  test_auc
   lr       0.823529        0.826891     0.823529 0.822291  0.888889
  svm       0.764706        0.764706     0.764706 0.764706  0.875000
   rf       0.764706        0.777184     0.764706 0.759664  0.930556
  xgb       0.764706        0.764706     0.764706 0.764706  0.847222
   gb       0.705882        0.706723     0.705882 0.703818  

In [None]:
cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=15, random_state=123) #the cross validation method we'll use across models
cv_auc_scores = {}  #dict to store CV AUC scores for each model

for model_name in best_models.keys():
    #best parameters for the current model
    best_params = results_df[results_df['model'] == model_name]['best_params'].iloc[0]

    if model_name in ["lr", "svm"]:
        rfe_estimator = LogisticRegression(max_iter=1000, random_state=42) if base_model_name == "svm" else clone(base_model)
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('rfe', RFE(estimator=rfe_estimator)),
            ('model', clone(base_model))
        ])
    else:
        rfe_estimator = LinearSVC(random_state=42)
        pipeline = Pipeline(steps=[
            ('rfe', RFE(estimator=rfe_estimator)),
            ('model', clone(base_model))
        ])
    pipeline.set_params(**best_params)

    scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        scoring='roc_auc',
        cv=cv,
        n_jobs=-1
    )

    cv_auc_scores[model_name] = {
        'mean_cv_auc': scores.mean(),
        'std_cv_auc': scores.std()
    }

    print(f"{model_name}: Mean CV AUC = {scores.mean():.4f} (±{scores.std():.4f})")

#add CV AUC metrics to results_df so we can access them later
results_df['mean_cv_auc'] = results_df['model'].map(lambda x: cv_auc_scores[x]['mean_cv_auc'])
results_df['std_cv_auc'] = results_df['model'].map(lambda x: cv_auc_scores[x]['std_cv_auc'])

print(f"\n{'='*80}")
print("MODELS RANKED BY MEAN CV AUC")
print(results_df.sort_values('mean_cv_auc', ascending=False)[['model', 'mean_cv_auc', 'std_cv_auc', 'test_auc']].to_string(index=False))

lr: Mean CV AUC = 0.8507 (±0.1212)
svm: Mean CV AUC = 0.8604 (±0.1034)
rf: Mean CV AUC = 0.9190 (±0.0708)
xgb: Mean CV AUC = 0.9426 (±0.0586)
gb: Mean CV AUC = 0.8400 (±0.0911)

MODELS RANKED BY MEAN CV AUC
model  mean_cv_auc  std_cv_auc  test_auc
  xgb     0.942552    0.058616  0.847222
   rf     0.918987    0.070795  0.930556
  svm     0.860440    0.103359  0.875000
   lr     0.850733    0.121163  0.888889
   gb     0.840049    0.091062  0.743056
