# Select key Biomarkers

## 1. Import necessary libraries, have function definitions

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Train a LASSO logistic regression model with cross-validation and stability selection
def train_lasso_model(X_train, y_train, num_bootstraps=50, selection_threshold=0.5):

    # Standardize the training data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Define a LASSO model (LogisticRegression with L1 penalty)
    lasso = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000)

    # Grid search for the best regularization strength
    param_grid = {"C": np.logspace(-8, -1, 40)}
    grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    best_C = grid_search.best_params_["C"]
    best_model = LogisticRegression(penalty="l1", solver="liblinear", C=best_C, max_iter=1000)
    
    # Perform Stability Selection (Bootstrap)
    selection_counts = np.zeros(X_train.shape[1])
    for _ in range(num_bootstraps):
        X_resampled, y_resampled = resample(X_train_scaled, y_train, random_state=_)
        best_model.fit(X_resampled, y_resampled)
        selection_counts += (best_model.coef_[0] != 0)

    # Compute selection frequency
    selection_freq = selection_counts / num_bootstraps

    # Select biomarkers that appear in at least `selection_threshold` % of bootstrap runs
    selected_biomarkers = X_train.columns[selection_freq >= selection_threshold]

    # Store LASSO results
    lasso_results = pd.DataFrame({
        "Biomarker": X_train.columns,
        "Coefficient": best_model.coef_[0],
        "Selection_Frequency": selection_freq
    })

    return best_model, scaler, selected_biomarkers, lasso_results


In [3]:
# Evaluate the trained LASSO model on the validation set using ROC AUC.
def evaluate_lasso_model(model, scaler, X_val, y_val):
    X_val_scaled = scaler.transform(X_val)
    y_val_pred_prob = model.predict_proba(X_val_scaled)
    
    # If binary classification , use the probability of the positive class
    if y_val_pred_prob.shape[1] == 2:
        auc_val = roc_auc_score(y_val, y_val_pred_prob[:, 1])
    else:
        auc_val = roc_auc_score(y_val, y_val_pred_prob, multi_class="ovr")
    return auc_val

## 2. Load the data

In [4]:
# Load the biomarker data that we cleaned and scaled in N1_biomarker_data_cleaning.py
biomarker_df_file_path = "../data/clean_data/scaled_biomarker_data_with_subjectid.csv"
biomarker_df = pd.read_csv(biomarker_df_file_path)
biomarker_df["subject_id"] = biomarker_df["subject_id"].astype(str)

print(biomarker_df.shape)
biomarker_df.head()

(466, 21)


Unnamed: 0,subject_id,balf_Amphiregulin_V1_imputed,balf_Calprotectin_V1_imputed,balf_CD163_V1_imputed,balf_IL-8_chemo_V1_imputed,balf_IP-10_chemo_V1_imputed,balf_MCP-1_chemo_V1_imputed,balf_MCP-4_chemo_V1_imputed,balf_TARC_chemo_V1_imputed,balf_GM-CSF_V1_imputed,balf_IL-12/IL-23p40_V1_imputed,balf_IL-15_V1_imputed,balf_IL-16_V1_imputed,balf_IL-17A_V1_imputed,balf_IL-7_V1_imputed,balf_VEGF_V1_imputed,balf_G-CSF_V1_imputed,balf_PD-L1_V1_imputed,balf_IL-6_proinf_V1_imputed,balf_sRAGE_V1_imputed,balf_TNF-RI_V1_imputed
0,3901,-1.47013,-1.498284,-0.246658,-1.129279,-0.61407,-0.030338,-0.866099,-0.440824,-0.655745,-1.162979,0.375604,-0.695447,-1.099571,0.286584,0.352247,-1.095234,-0.887325,-1.598728,-0.288758,-1.412866
1,3695,0.80377,0.911863,1.228127,-3.101678,-2.73449,-2.711906,-1.388472,-1.679053,-0.042735,1.72837,0.789723,1.435168,2.252914,0.530009,0.845505,1.136749,1.368696,1.13896,0.049796,0.753702
2,4097,0.553875,0.576147,-0.224706,0.996641,-0.032357,0.08944,-0.522827,-0.620921,-0.935185,0.06412,-1.59051,0.092549,-0.101333,-0.558272,0.403123,0.351343,-0.339183,1.136237,0.310366,0.560863
3,3738,-1.671626,-0.889214,-0.380679,-0.493922,-0.726075,0.811831,0.090998,-0.737614,-0.720736,-0.498054,0.68541,-1.272986,-0.815376,-0.952069,-0.559623,-0.522124,-0.689435,-1.015348,-0.57365,-1.044524
4,3791,1.005517,1.19963,-0.85112,0.776083,0.854145,-0.590079,-0.351586,-0.13926,-0.981345,0.360495,-1.226081,1.447499,0.130569,0.644662,1.273656,0.558921,-0.019721,0.442616,-1.53627,0.599358


In [5]:
# Load the clusters from LCA analysis
file_path = "../data/clean_data/vap_cluster_assignments_k4_lca.csv"
df_cluster = pd.read_csv(file_path)
df_cluster["subject_id"] = df_cluster["subject_id"].astype(str)
df_cluster.drop(columns=["Unnamed: 0"], inplace=True)

print(df_cluster.shape)
df_cluster.head()

(466, 2)


Unnamed: 0,subject_id,cluster
0,3901,3
1,3695,4
2,4097,4
3,3738,3
4,3791,4


In [6]:
# Merge the biomarker data with the cluster assignments
biomarker_df_with_clusters = biomarker_df.merge(df_cluster, on="subject_id", how="inner")
print(biomarker_df_with_clusters.shape)
biomarker_df_with_clusters

(465, 22)


Unnamed: 0,subject_id,balf_Amphiregulin_V1_imputed,balf_Calprotectin_V1_imputed,balf_CD163_V1_imputed,balf_IL-8_chemo_V1_imputed,balf_IP-10_chemo_V1_imputed,balf_MCP-1_chemo_V1_imputed,balf_MCP-4_chemo_V1_imputed,balf_TARC_chemo_V1_imputed,balf_GM-CSF_V1_imputed,balf_IL-12/IL-23p40_V1_imputed,balf_IL-15_V1_imputed,balf_IL-16_V1_imputed,balf_IL-17A_V1_imputed,balf_IL-7_V1_imputed,balf_VEGF_V1_imputed,balf_G-CSF_V1_imputed,balf_PD-L1_V1_imputed,balf_IL-6_proinf_V1_imputed,balf_sRAGE_V1_imputed,balf_TNF-RI_V1_imputed,cluster
0,3901,-1.470130,-1.498284,-0.246658,-1.129279,-0.614070,-0.030338,-0.866099,-0.440824,-0.655745,-1.162979,0.375604,-0.695447,-1.099571,0.286584,0.352247,-1.095234,-0.887325,-1.598728,-0.288758,-1.412866,3
1,3695,0.803770,0.911863,1.228127,-3.101678,-2.734490,-2.711906,-1.388472,-1.679053,-0.042735,1.728370,0.789723,1.435168,2.252914,0.530009,0.845505,1.136749,1.368696,1.138960,0.049796,0.753702,4
2,4097,0.553875,0.576147,-0.224706,0.996641,-0.032357,0.089440,-0.522827,-0.620921,-0.935185,0.064120,-1.590510,0.092549,-0.101333,-0.558272,0.403123,0.351343,-0.339183,1.136237,0.310366,0.560863,4
3,3738,-1.671626,-0.889214,-0.380679,-0.493922,-0.726075,0.811831,0.090998,-0.737614,-0.720736,-0.498054,0.685410,-1.272986,-0.815376,-0.952069,-0.559623,-0.522124,-0.689435,-1.015348,-0.573650,-1.044524,3
4,3791,1.005517,1.199630,-0.851120,0.776083,0.854145,-0.590079,-0.351586,-0.139260,-0.981345,0.360495,-1.226081,1.447499,0.130569,0.644662,1.273656,0.558921,-0.019721,0.442616,-1.536270,0.599358,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,3564,0.136118,-0.716736,0.361324,-0.185581,-0.091699,1.287248,0.860970,1.088775,3.524383,0.091102,3.127050,-0.394790,-0.099334,-0.445344,-0.390419,0.420936,0.669298,0.360024,1.161601,0.345492,1
461,3884,-0.572252,0.305615,-0.346746,0.261856,1.546520,-1.042111,-0.455674,0.423276,-0.145223,-0.338022,0.394652,0.751150,0.256962,0.189636,0.203630,-0.326163,1.398747,-0.540334,-0.163602,-0.764395,1
462,4057,-1.358552,-2.328733,-1.124490,-3.101678,-1.321181,-1.943298,-1.326807,-1.420940,-0.561037,-0.953613,0.730921,-0.936780,-0.706578,-1.013454,-0.361471,-1.130783,-0.663959,-1.344822,-0.343330,-1.466420,3
463,3682,-1.160260,-0.085557,0.162542,-0.215466,0.283231,-0.999277,-0.944271,-1.039545,0.250593,-0.532099,0.585695,0.318892,0.040128,-1.048667,-0.112062,-0.607697,-0.012010,-0.555040,-0.805376,-0.526094,1


## 4. LASSO modeling to select the key biomarkers

Next, let us see which are the key biomarkers.


In [7]:
X = biomarker_df_with_clusters.drop(columns=["cluster", "subject_id"])
y = biomarker_df_with_clusters["cluster"]

# Perform a 80-20 train val split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=30)

# Train the LASSO model and extract selected biomarkers
best_model, scaler, selected_biomarkers, lasso_results = train_lasso_model(X_train, y_train)

print("\nSelected ", len(selected_biomarkers), " Biomarkers:")
print(selected_biomarkers)

print("\nLASSO Results:")
print(lasso_results)



Selected  5  Biomarkers:
Index(['balf_IP-10_chemo_V1_imputed', 'balf_TARC_chemo_V1_imputed',
       'balf_GM-CSF_V1_imputed', 'balf_IL-15_V1_imputed',
       'balf_TNF-RI_V1_imputed'],
      dtype='object')

LASSO Results:
                         Biomarker  Coefficient  Selection_Frequency
0     balf_Amphiregulin_V1_imputed     0.000000                 0.08
1     balf_Calprotectin_V1_imputed     0.000000                 0.20
2            balf_CD163_V1_imputed     0.000000                 0.00
3       balf_IL-8_chemo_V1_imputed     0.000000                 0.34
4      balf_IP-10_chemo_V1_imputed     0.000000                 0.76
5      balf_MCP-1_chemo_V1_imputed     0.076553                 0.42
6      balf_MCP-4_chemo_V1_imputed     0.090808                 0.40
7       balf_TARC_chemo_V1_imputed     0.010921                 0.50
8           balf_GM-CSF_V1_imputed     0.000000                 0.50
9   balf_IL-12/IL-23p40_V1_imputed     0.000000                 0.48
10           balf

The LASSO logistic regression with stability selection identified five key biomarkers, with `balf_TNF-RI_V1_imputed` (100%) as the most stable predictor, showing a strong inverse relationship with mortality (coefficient = -0.43). Other frequently selected biomarkers included `balf_IP-10_chemo_V1_imputed` (76%), `balf_TARC_chemo_V1_imputed` (50%), `balf_GM-CSF_V1_imputed` (50%), and `balf_IL-15_V1_imputed` (60%), suggesting their potential clinical relevance.

Meanwhile, biomarkers like `balf_Amphiregulin_V1_imputed` and `balf_CD163_V1_imputed` were rarely or never selected, indicating limited predictive value.

In [8]:
# Evaluate the model
auc_val = evaluate_lasso_model(best_model, scaler, X_val, y_val)
print("\nValidation AUC:", auc_val)


Validation AUC: 0.9576380544206787


With a Validation AUC of 0.96, our model seems to be performing really well in distinguishing outcomes. We’ll now dive deeper into statistical analysis in the coming notebooks to understand how these biomarkers and clusters are linked to hospital mortality and other key clinical factors.