# Re-analysis for "A novel cortical biomarker signature predicts individual pain sensitivity"

Libraries

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, roc_auc_score
import os
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

Set your base path (the project folder that contains `data/`, `results/` and `figures/`)

In [18]:
# Define base path: the project root that holds the data/, results/ and figures/ folders.
# When the PAF_REANALYSIS_BASEPATH environment variable is set it overrides the default
# below, so the notebook can run on another machine without editing this cell.
basepath = os.environ.get("PAF_REANALYSIS_BASEPATH", "/home/ole/projects/PAF_reanalysis")

## Main analysis

This does the following: \
\
(1) loads the complete dataset with PAF, CME and class for all subjects \
(2) defines the models and their parameter spaces \
(3) splits the data into independent training and test sets \
(4) uses grid-search cross-validation on the training data to fit each model \
(5) uses the trained models to predict the test set \
(6) reports accuracy and AUC for both the training and the test set \

This is done repeatedly, each time with a different random train/test split, to avoid over- or underestimating the final metrics because of the relatively low sample size for a machine learning pipeline.

The number of repetitions is set by the parameter `repetitions`.

Set the number of repetitions

In [15]:
# Number of shuffle/split/fit repetitions; one random seed is drawn per repetition.
repetitions = 20

Pipeline

In [16]:
# Load data
# Three Excel sheets are read from <basepath>/data and joined on their shared "ID" column below.
paf_file = os.path.join(basepath, "data/PAF_all.xlsx")
cme_file = os.path.join(basepath, "data/map_volume_all.xlsx")
class_file = os.path.join(basepath, "data/class_IDs_all.xlsx")

df_paf = pd.read_excel(paf_file)
df_cme = pd.read_excel(cme_file)
df_class = pd.read_excel(class_file)

# Calculate CME values
# CME is 1 when the map volume increased from day 0 to day 5 and 0 otherwise.
# A missing volume on either day gives a NaN difference; it is kept as NaN rather
# than silently coded as 0, so that the IterativeImputer step of the modelling
# pipeline handles it like any other missing predictor value.
volume_change = df_cme.Volume_Day5 - df_cme.Volume_Day0
df_cme["CME"] = volume_change.gt(0).astype(float).where(volume_change.notna())

# Merge data on ID
# Inner joins: only IDs present in all three tables are kept.
data = df_class.merge(df_paf, on="ID", how="inner").merge(df_cme[["ID", "CME"]], on="ID", how="inner")

# Define predictors and target
X = data[["sensorimotor_paf", "CME"]]
y = data["class"]

# Models and hyperparameters
# Maps each model name to a pair (unfitted estimator, parameter grid). Every grid
# key carries the "model__" prefix so that it addresses the pipeline step named
# "model" when the grid is handed to the grid search.
models_and_params = dict(
    LogisticRegression=(
        LogisticRegression(),
        dict(
            model__C=np.logspace(-3, 3, 30),
            model__solver=['newton-cg', 'lbfgs'],
            model__max_iter=[200, 400, 2000, 5000],
        ),
    ),
    RandomForest=(
        RandomForestClassifier(),
        dict(
            model__n_estimators=[300, 500, 1000],
            model__max_depth=[None, 5, 10],
            model__min_samples_split=[2, 5, 10],
            model__bootstrap=[True, False],
        ),
    ),
    GradientBoosting=(
        GradientBoostingClassifier(),
        dict(
            model__learning_rate=[0.01, 0.1, 0.5],
            model__max_depth=[None, 2, 5],
            model__min_samples_split=[2, 5, 10],
            model__n_estimators=[100, 200],
        ),
    ),
    SVC=(
        # probability=True so that predict_proba is available for the AUC
        SVC(probability=True),
        dict(
            model__C=[0.01, 0.1, 1, 10],
            model__gamma=['scale', 'auto'],
        ),
    ),
    MLPClassifier=(
        MLPClassifier(),
        dict(
            model__alpha=[1e-4, 1e-3, 1e-2, 1e-1],
            model__hidden_layer_sizes=[(100,), (100, 100), (100, 100, 100)],
            model__max_iter=[5000],
        ),
    ),
)

# Function to perform analysis
def run_analysis(random_seeds, n_runs=None):
    """Repeat the split / grid-search / evaluate procedure once per seed.

    For every seed the module-level predictors ``X`` and target ``y`` are
    shuffled and split into a training and a test set (35 % test). Each entry
    of ``models_and_params`` is then tuned on the training set with a 5-fold
    cross-validated grid search scored by accuracy, and the best pipeline is
    evaluated on both the training and the test set.

    Parameters
    ----------
    random_seeds : iterable of int
        One seed per repetition. The seed drives the shuffle, the train/test
        split, the imputer and every estimator that exposes ``random_state``,
        so a repetition can be reproduced from its recorded seed.
    n_runs : int, optional
        Not used; the number of repetitions is the number of seeds. Kept so
        that existing calls passing it continue to work.

    Returns
    -------
    pandas.DataFrame
        One row per seed, model and dataset, with the columns ``Seed``,
        ``Model``, ``Dataset`` ("Training" or "Test"), ``Accuracy`` and ``AUC``.
    """
    results = []

    for seed in random_seeds:
        print(f"Running analysis with seed {seed}...")

        # Shuffle data
        X_shuffled, y_shuffled = shuffle(X, y, random_state=seed)

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X_shuffled, y_shuffled, test_size=0.35, random_state=seed
        )
        evaluation_sets = (
            ("Training", X_train, y_train),
            ("Test", X_test, y_test),
        )

        for model_name, (model, param_grid) in models_and_params.items():

            # Create pipeline
            pipeline = Pipeline([
                ("imputer", IterativeImputer(max_iter=100, random_state=seed)),
                ("scaler", StandardScaler()),
                ("model", model)
            ])

            # Pin the estimator's own randomness to this repetition's seed.
            # Without it, stochastic models give different scores on every
            # execution even for an identical split. A single-value grid entry
            # leaves the number of candidates unchanged and does not mutate the
            # shared estimator in `models_and_params`.
            search_grid = dict(param_grid)
            if "random_state" in model.get_params():
                search_grid.setdefault("model__random_state", [int(seed)])

            # Perform grid search
            search = GridSearchCV(
                pipeline, param_grid=search_grid,
                cv=5, scoring="accuracy", verbose=1, n_jobs=-1
            )
            search.fit(X_train, y_train)

            # Get the best model
            best_model = search.best_estimator_

            # Evaluate the model on training and test sets
            for label, X_eval, y_eval in evaluation_sets:
                y_pred = best_model.predict(X_eval)
                # Probability of the second class; assumes `y` is binary -- TODO confirm
                y_pred_proba = best_model.predict_proba(X_eval)[:, 1]
                results.append({
                    "Seed": seed,
                    "Model": model_name,
                    "Dataset": label,
                    "Accuracy": accuracy_score(y_eval, y_pred),
                    "AUC": roc_auc_score(y_eval, y_pred_proba)
                })

    return pd.DataFrame(results)

# Run analysis
# The per-repetition seeds come from a seeded generator so that re-running the
# notebook reproduces the same splits, and are drawn without replacement from
# 0..9999 so that no repetition is an exact duplicate of another
# (requires repetitions <= 10000).
master_seed = 0
seed_rng = np.random.default_rng(master_seed)
random_seeds = seed_rng.choice(10000, size=repetitions, replace=False)
n_runs = repetitions
results_df = run_analysis(random_seeds, n_runs=n_runs)

# Aggregate results
# Select the metric columns before averaging so that only they are aggregated.
summary = (
    results_df
    .groupby(["Model", "Dataset"])[["Accuracy", "AUC"]]
    .mean()
    .reset_index()
)

# Save results
# Create the output folder first; to_csv does not create missing directories.
results_dir = os.path.join(basepath, "results")
os.makedirs(results_dir, exist_ok=True)
results_df.to_csv(os.path.join(results_dir, "results_all_runs.csv"), index=False)
summary.to_csv(os.path.join(results_dir, "summary_results.csv"), index=False)

print("Analysis complete. Results saved.")


Running analysis with seed 3548...
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Running analysis with seed 4090...
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Running analysis with seed 3925...
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 1

## Create plots for accuracy and AUC for all models 
\
The plots display only the average of each metric across all repetitions of the pipeline.

In [17]:
# Define the order of the Dataset categories
hue_order = ["Training", "Test"]

# Melt the DataFrame for easier plotting
plot_df = summary.melt(id_vars=["Model", "Dataset"], value_vars=["Accuracy", "AUC"],
                       var_name="Metric", value_name="Score")

# Make sure the output folder exists; savefig does not create missing directories
figures_dir = os.path.join(basepath, "figures")
os.makedirs(figures_dir, exist_ok=True)

# Create the bar plot for Accuracy and AUC (one horizontal bar chart per metric)
for metric in ["Accuracy", "AUC"]:
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.barplot(
        data=plot_df[plot_df["Metric"] == metric],
        x="Score",
        y="Model",
        hue="Dataset",
        hue_order=hue_order,  # Training bar is drawn before the Test bar for each model
        palette={"Training": "#9fc8c8", "Test": "#298c8c"},  # Adjust colors
        ax=ax
    )

    # Add values on bars (bars are horizontal, so the bar width is the score)
    for container in ax.containers:
        labels = [f"{bar.get_width():.2f}" for bar in container]
        ax.bar_label(container, labels=labels, label_type='edge', fontsize=14)

    # Add title and labels
    ax.set_title(f"{metric} by Model", fontsize=20, fontweight="bold")
    ax.set_xlabel(metric, fontsize=16)
    ax.set_ylabel("Model", fontsize=16)
    ax.tick_params(axis="both", labelsize=14)
    ax.legend(title="Dataset", fontsize=14, title_fontsize=16, loc="lower right")

    # Remove grid lines
    ax.grid(False)
    ax.spines["left"].set_linewidth(0.5)
    ax.spines["bottom"].set_linewidth(0.5)

    # Save the figure as an SVG
    filename = os.path.join(figures_dir, f"{metric}_by_Model.svg")
    fig.tight_layout()
    fig.savefig(filename, format="svg")
    plt.close(fig)  # Close the figure so the next metric starts from a clean canvas

    print(f"Saved {metric} plot as SVG: {filename}")


Saved Accuracy plot as SVG: /home/ole/projects/PAF/figures/Accuracy_by_Model.svg
Saved AUC plot as SVG: /home/ole/projects/PAF/figures/AUC_by_Model.svg
