for the run over the general methodology, refer to "general_training_methodology.md"





first, for mRMR, we need to install the python wrapper pymrmr:

In [1]:
%pip install pymrmr

Collecting pymrmr
  Downloading pymrmr-0.1.11.tar.gz (69 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/69.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m61.4/69.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.5/69.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pymrmr
  Building wheel for pymrmr (setup.py) ... [?25l[?25hdone
  Created wheel for pymrmr: filename=pymrmr-0.1.11-cp312-cp312-linux_x86_64.whl size=411145 sha256=facd06839563fc32af5eb07e1b565e7170baf15654ff39922c82fe2f1a5f99c2
  Stored in directory: /root/.cache/pip/wheels/d8/8b/01/15ccb01b7f3703042aef0458e113d6d3568c22d0a54c2974fb
Successfully built pymrmr
Installing collected packages: pymrmr
Successfully installed pymrmr-0.1.11


Let's import the libraries we'll be needing:

In [None]:
#cross-validation & tuning
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

#models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

#metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, make_scorer

#dimensionality-reduction & tools
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


import pandas as pd
import numpy as np
import pymrmr

from sklearn.base import clone

Let's import our data and define our train and test values. The 'label' column is the encoded output (0 for healthy control, 1 for Parkinson's disease). We'll split out train/test data, leaving a 20% holdout test:

In [3]:
csv_path = '/content/drive/MyDrive/openSMILE_GeMAPSv01b.csv'

df = pd.read_csv(csv_path)

X = df.drop(columns=['label', 'Sex', 'ID'], axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

parameter grid definition.

In [None]:
param_grid = {
    "lr": {
        "mrmr__n_features": [10, 20, 30, 40, 50, 60], # Changed to mrmr parameter
        "model__C": [0.01, 0.1, 1, 10],
        "model__penalty": ["l2"],
        "model__solver": ["lbfgs"]
    },
    "svm": {
        "mrmr__n_features": [10, 20, 30, 40, 50, 60], # Changed to mrmr parameter
        "model__C": [0.1, 1, 10],
        "model__kernel": ["linear", "rbf"],
        "model__gamma": ["scale", "auto"]
    },
    "rf": {
        "mrmr__n_features": [10, 20, 30, 40, 50, 60], # Changed to mrmr parameter
        "model__n_estimators": [100],
        "model__max_depth": [5],
        "model__min_samples_split": [5],
        "model__min_samples_leaf": [1]
    },
    "xgb": {
        "mrmr__n_features": [10, 20, 30, 40, 50, 60], # Changed to mrmr parameter
        "model__n_estimators": [100],
        "model__max_depth": [3],
        "model__learning_rate": [0.1]
    },
    "knn": {
        "mrmr__n_features": [10, 20, 30, 40, 50, 60], # Changed to mrmr parameter
        "model__n_neighbors": [3, 5, 7, 9],
        "model__weights": ["uniform", "distance"]
    },
    "gb": {
        "mrmr__n_features": [10, 20, 30, 40, 50, 60], # Changed to mrmr parameter
        "model__n_estimators": [50],
        "model__max_depth": [3],
        "model__learning_rate": [0.05]
    }
}

we now define our models array.

In [5]:
models = {
    "lr": LogisticRegression(max_iter=1000, random_state=42),
    "rf": RandomForestClassifier(random_state=42),
    "svm": SVC(random_state=42, probability=True),
    "xgb": XGBClassifier(eval_metric='logloss', colsample_bytree=0.8, gamma=0, min_child_weight=1, subsample=0.6, random_state=42),
    "knn": KNeighborsClassifier(),
    "gb": GradientBoostingClassifier(random_state=42)
}

Training and Validation. Here we change our methodology a little from before. We will study each feature selection param first anf then run the pipeline instead of running FS params inside gridsearch

In [None]:
#mRMR requires pandas DataFrames
X_train_df = X_train
y_train_series = y_train

#the cross validation method we'll use across models
cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=15, random_state=123)

results = [] #to store metrics, accuracy scores and all other
best_models = {}

# Iterate through each model to perform GridSearchCV on mRMR features
for model_name, model in models.items():
    print(f"\nRunning GridSearchCV for {model_name}")

    # Get the parameter grid for the current model
    model_param_grid = param_grid[model_name].copy()

    #extract and remove mRMR parameters from the grid
    n_features_list = model_param_grid.pop('mrmr__n_features')

    #iterate through the different number of mrmr params
    for n_features_mrmr in n_features_list:
        print(f"\n  Testing with {n_features_mrmr} mRMR features for {model_name}")

        #do mrmr on the full training data
        selected_features_mrmr = pymrmr.mRMR(X_train_df, 'MIQ', n_features_mrmr)

        #create reduced training and test datasets with selected features
        X_train_mrmr = X_train[selected_features_mrmr]
        X_test_mrmr = X_test[selected_features_mrmr]

        base_model = clone(model)

        # create pipeline
        if model_name not in ["rf", "xgb", "gb"]:
            pipeline = Pipeline(steps=[
                ('scaler', StandardScaler()),
                ('model', base_model)
            ])
        else:
            pipeline = Pipeline(steps=[
                ('model', base_model)
            ])


        grid = GridSearchCV(
            pipeline,
            model_param_grid,
            cv=cv,
            scoring='roc_auc',
            n_jobs=-1,
            verbose=0
        )

        #fit grid search on the mrmr selected training data
        grid.fit(X_train_mrmr, y_train)

        #make predictions with the best model
        y_pred = grid.best_estimator_.predict(X_test_mrmr)
        y_pred_proba = grid.best_estimator_.predict_proba(X_test_mrmr)

        auc_score = roc_auc_score(y_test, y_pred_proba[:, 1])

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        results.append({
            'model': model_name,
            'n_features_mrmr': n_features_mrmr,
            'best_score': grid.best_score_,
            'test_accuracy': accuracy,
            'test_precision': precision,
            'test_recall': recall,
            'test_f1': f1,
            'test_auc': auc_score,
            'best_params': grid.best_params_,
            'confusion_matrix': confusion_matrix(y_test, y_pred)
        })

        #store the best model for this number of features and model type
        best_models[(model_name, n_features_mrmr)] = grid.best_estimator_

        print(f"Best parameters for {model_name} with {n_features_mrmr} features:")
        for param, value in grid.best_params_.items():
            print(f"{param}: {value}")

        print(f"Best CV score: {grid.best_score_:.4f}")
        print(f"Test Metrics for {model_name} with {n_features_mrmr} features:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"AUC: {auc_score:.4f}")

results_df = pd.DataFrame(results)



Running GridSearchCV for lr

  Testing with 10 mRMR features for lr
    Best parameters for lr with 10 features:
      model__C: 0.1
      model__penalty: l2
      model__solver: lbfgs
    Best CV score: 0.6273
    Test Metrics for lr with 10 features:
      Accuracy: 0.5882
      Precision: 0.5923
      Recall: 0.5882
      F1-score: 0.5882
      AUC: 0.5694

  Testing with 20 mRMR features for lr
    Best parameters for lr with 20 features:
      model__C: 1
      model__penalty: l2
      model__solver: lbfgs
    Best CV score: 0.8785
    Test Metrics for lr with 20 features:
      Accuracy: 0.8235
      Precision: 0.8269
      Recall: 0.8235
      F1-score: 0.8223
      AUC: 0.9444

  Testing with 30 mRMR features for lr
    Best parameters for lr with 30 features:
      model__C: 0.1
      model__penalty: l2
      model__solver: lbfgs
    Best CV score: 0.8432
    Test Metrics for lr with 30 features:
      Accuracy: 0.8235
      Precision: 0.8269
      Recall: 0.8235
      F1-sco

In [None]:

print(f"\n{'='*80}")
print("MODELS RANKED BY AUC SCORE")
ranked_results = results_df.sort_values('test_auc', ascending=False)
print(ranked_results[['model', 'test_auc', 'n_features_mrmr', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1']].to_string(index=False))


MODELS RANKED BY AUC SCORE
model  test_auc  n_features_mrmr  test_accuracy  test_precision  test_recall  test_f1
   lr  0.944444               20       0.823529        0.826891     0.823529 0.822291
   lr  0.930556               40       0.764706        0.777184     0.764706 0.759664
  svm  0.916667               20       0.764706        0.764706     0.764706 0.764706
   rf  0.916667               60       0.764706        0.777184     0.764706 0.759664
   lr  0.916667               30       0.823529        0.826891     0.823529 0.822291
  svm  0.875000               60       0.647059        0.678733     0.647059 0.620321
  svm  0.875000               50       0.647059        0.678733     0.647059 0.620321
   lr  0.875000               50       0.705882        0.706723     0.705882 0.703818
   rf  0.861111               30       0.764706        0.777184     0.764706 0.759664
   rf  0.861111               40       0.764706        0.777184     0.764706 0.759664
   lr  0.861111           

calculating mean CV AUCs. This retrains with params grid again cause original code was very error prone :P

In [None]:
X_train_df = X_train
y_train_series = y_train

cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=15, random_state=123)

results = []

for model_name, model in models.items():
    print(f"\nRunning GridSearchCV for {model_name}")

    model_param_grid = param_grid[model_name].copy()

    n_features_list = model_param_grid.pop('mrmr__n_features')

    for n_features_mrmr in n_features_list:
        print(f"\n  Testing with {n_features_mrmr} mRMR features for {model_name}")

        selected_features_mrmr = pymrmr.mRMR(X_train_df, 'MIQ', n_features_mrmr)

        X_train_mrmr = X_train[selected_features_mrmr]
        X_test_mrmr = X_test[selected_features_mrmr]

        base_model = clone(model)

        if model_name not in ["rf", "xgb", "gb"]:
            pipeline = Pipeline(steps=[
                ('scaler', StandardScaler()),
                ('model', base_model)
            ])
        else:
            pipeline = Pipeline(steps=[
                ('model', base_model)
            ])


        grid = GridSearchCV(
            pipeline,
            model_param_grid,
            cv=cv,
            scoring='roc_auc',
            n_jobs=-1,
            verbose=0
        )

       #fit grid search on the mrmr selected training data
        grid.fit(X_train_mrmr, y_train)

        best_index = grid.best_index_
        mean_cv_auc = grid.cv_results_['mean_test_score'][best_index]
        std_cv_auc = grid.cv_results_['std_test_score'][best_index]

        print(f"Mean CV AUC: {mean_cv_auc:.4f} +/- {std_cv_auc:.4f}")


Running GridSearchCV for lr

  Testing with 10 mRMR features for lr
Mean CV AUC: 0.6273 +/- 0.1435

  Testing with 20 mRMR features for lr
Mean CV AUC: 0.8785 +/- 0.0855

  Testing with 30 mRMR features for lr
Mean CV AUC: 0.8432 +/- 0.1000

  Testing with 40 mRMR features for lr
Mean CV AUC: 0.8604 +/- 0.1040

  Testing with 50 mRMR features for lr
Mean CV AUC: 0.8694 +/- 0.0927

  Testing with 60 mRMR features for lr
Mean CV AUC: 0.8618 +/- 0.0959

Running GridSearchCV for rf

  Testing with 10 mRMR features for rf
Mean CV AUC: 0.4931 +/- 0.1673

  Testing with 20 mRMR features for rf
Mean CV AUC: 0.9176 +/- 0.0754

  Testing with 30 mRMR features for rf
Mean CV AUC: 0.9076 +/- 0.0696

  Testing with 40 mRMR features for rf
Mean CV AUC: 0.8753 +/- 0.0934

  Testing with 50 mRMR features for rf
Mean CV AUC: 0.8893 +/- 0.0851

  Testing with 60 mRMR features for rf
Mean CV AUC: 0.8665 +/- 0.1002

Running GridSearchCV for svm

  Testing with 10 mRMR features for svm
Mean CV AUC: 0.6298