# In this notebook we:
  - Evaluate models on a combined dataset of 1905 samples and 49 common proteins (control group included, blood cancer types combined).
  - Handle missing values using KNN imputation.
  - Apply SMOTE for data balancing within each fold.
  - Train Random Forest and Extra Trees classifiers.
  - Use stratified k-fold cross-validation to assess model performance.
  - Report the performance metrics accuracy, F1 score (weighted), and ROC AUC.
  - Encapsulate the entire process (imputation, SMOTE, classification) in a machine learning pipeline.

In [1]:
# Import the packages we may need
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from seaborn import set_style
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import is_classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.utils import resample
from sklearn.multiclass import OneVsRestClassifier  # Correct location for OneVsRestClassifier
from scipy.stats import sem
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix



In [2]:
# Tree classifier import (original note: random forest for feature importances)
from sklearn.tree import DecisionTreeClassifier


# Plot styling: use seaborn's "whitegrid" theme,
# i.e. a grid drawn on a white background.
sns.set_style("whitegrid")


In [3]:
# Load the combined data set and display it
combined_csv_path = 'DataCleaning_and_ExploratoryAnalysis/Combined_df2.csv'
df_full = pd.read_csv(combined_csv_path)
df_full

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Sample_ID,Cancer,Q15389,P29965,P49763,Q02763,P01127,P09341,...,P09382,Q16790,P26842,P14210,P43489,O75144,O43927,P32970,Q8WXI7,P10144
0,0,0.0,PM910,Ctrl,10.12895,5.62622,7.72902,8.47530,10.08926,10.58751,...,8.54519,4.20874,10.61404,8.37166,4.99296,7.78219,7.03584,4.81481,3.58965,4.13563
1,1,1.0,PM396,Ctrl,8.67289,5.18821,8.25523,9.06271,8.89866,9.22360,...,8.25401,3.46839,10.77271,8.36820,4.92422,7.47997,8.05700,3.98900,4.78155,3.14840
2,2,2.0,PM190,Ctrl,9.99567,6.38876,8.44263,8.42102,10.08508,10.43894,...,8.75887,4.65936,11.03062,9.18464,5.60743,7.92803,8.77261,4.80189,5.16350,4.29062
3,3,3.0,PM270,Ctrl,8.26407,5.06228,8.13429,8.66165,8.75925,9.24310,...,8.75741,4.44633,10.47952,8.65548,5.54289,9.29458,8.64028,4.04045,5.59217,3.75295
4,4,4.0,PM656,Ctrl,9.08833,5.95005,8.37830,8.29127,9.43936,9.83732,...,8.48018,3.81634,10.59295,8.63758,5.16271,7.41098,8.29143,4.59594,4.91665,4.10381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1900,162,,C29ZZ80Y4.01,HODG,9.87819,7.15139,7.71189,7.72042,10.23705,7.66455,...,7.19963,4.26576,7.80544,9.24521,5.59883,5.16395,9.03873,4.24414,1.53814,5.30002
1901,163,,C29ZS2ML8.01,HODG,9.42403,6.16559,7.48199,7.30005,10.14143,7.84437,...,6.90334,4.14795,7.74788,8.80026,5.18470,7.53391,7.97077,3.84446,3.41778,3.84928
1902,164,,C29ZFFZ0Q.01,HODG,10.26858,8.11759,7.78457,7.46200,10.25920,8.15014,...,6.69773,3.60838,7.67268,7.83666,5.35574,5.27098,7.55698,3.11025,1.56641,4.23022
1903,165,,C29ZQOG4N.01,HODG,10.15491,7.09910,7.78078,7.45732,10.27589,8.17975,...,7.01314,5.67411,8.44027,8.79263,5.66616,7.99492,8.71554,5.02993,3.94930,4.90380


In [4]:
#df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
#df

In [5]:
# Combine all the blood cancers into one category.
# Work on a deep copy so df_full keeps the original labels, and restrict the
# replacement to the 'Cancer' column: a frame-wide replace would also rewrite
# any other cell (e.g. a Sample_ID) that happened to equal one of these codes.
blood_cancer_codes = ['AML', 'CLL', 'LYMPH', 'MYEL']
df_bloodcombined = df_full.copy(deep=True)
df_bloodcombined['Cancer'] = df_bloodcombined['Cancer'].replace(to_replace=blood_cancer_codes, value='BLOOD')
#df_bloodcombined = df_bloodcombined[df_bloodcombined['Cancer'] != 'Ctrl'] #Exclude the control group Ctrl

In [6]:
# Sanity check: list the distinct labels left after merging the blood cancers
set(df_bloodcombined['Cancer'].unique())

{'BLOOD',
 'BRC',
 'CRC',
 'CVX',
 'Ctrl',
 'ENDC',
 'ESO',
 'GLIOM',
 'HODG',
 'LUNGC',
 'OVC',
 'PRC'}

In [7]:
# The protein measurements are every column from position 4 onward
first_protein_position = 4
proteins = df_bloodcombined.columns[first_protein_position:]
# Show how many protein columns were selected
len(proteins)

49

In [8]:
# Display the frame after the blood-cancer labels were merged into 'BLOOD'
df_bloodcombined

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Sample_ID,Cancer,Q15389,P29965,P49763,Q02763,P01127,P09341,...,P09382,Q16790,P26842,P14210,P43489,O75144,O43927,P32970,Q8WXI7,P10144
0,0,0.0,PM910,Ctrl,10.12895,5.62622,7.72902,8.47530,10.08926,10.58751,...,8.54519,4.20874,10.61404,8.37166,4.99296,7.78219,7.03584,4.81481,3.58965,4.13563
1,1,1.0,PM396,Ctrl,8.67289,5.18821,8.25523,9.06271,8.89866,9.22360,...,8.25401,3.46839,10.77271,8.36820,4.92422,7.47997,8.05700,3.98900,4.78155,3.14840
2,2,2.0,PM190,Ctrl,9.99567,6.38876,8.44263,8.42102,10.08508,10.43894,...,8.75887,4.65936,11.03062,9.18464,5.60743,7.92803,8.77261,4.80189,5.16350,4.29062
3,3,3.0,PM270,Ctrl,8.26407,5.06228,8.13429,8.66165,8.75925,9.24310,...,8.75741,4.44633,10.47952,8.65548,5.54289,9.29458,8.64028,4.04045,5.59217,3.75295
4,4,4.0,PM656,Ctrl,9.08833,5.95005,8.37830,8.29127,9.43936,9.83732,...,8.48018,3.81634,10.59295,8.63758,5.16271,7.41098,8.29143,4.59594,4.91665,4.10381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1900,162,,C29ZZ80Y4.01,HODG,9.87819,7.15139,7.71189,7.72042,10.23705,7.66455,...,7.19963,4.26576,7.80544,9.24521,5.59883,5.16395,9.03873,4.24414,1.53814,5.30002
1901,163,,C29ZS2ML8.01,HODG,9.42403,6.16559,7.48199,7.30005,10.14143,7.84437,...,6.90334,4.14795,7.74788,8.80026,5.18470,7.53391,7.97077,3.84446,3.41778,3.84928
1902,164,,C29ZFFZ0Q.01,HODG,10.26858,8.11759,7.78457,7.46200,10.25920,8.15014,...,6.69773,3.60838,7.67268,7.83666,5.35574,5.27098,7.55698,3.11025,1.56641,4.23022
1903,165,,C29ZQOG4N.01,HODG,10.15491,7.09910,7.78078,7.45732,10.27589,8.17975,...,7.01314,5.67411,8.44027,8.79263,5.66616,7.99492,8.71554,5.02993,3.94930,4.90380


In [59]:
# Map each cancer-type label to an integer code

from sklearn.preprocessing import LabelEncoder

# Fit the encoder first, then overwrite the column with the integer codes;
# le.classes_ keeps the original label for every code.
le = LabelEncoder().fit(df_bloodcombined['Cancer'])
df_bloodcombined['Cancer'] = le.transform(df_bloodcombined['Cancer'])

In [60]:
# Show the first 15 rows to inspect the encoded 'Cancer' column
df_bloodcombined.iloc[:15]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Sample_ID,Cancer,Q15389,P29965,P49763,Q02763,P01127,P09341,...,P09382,Q16790,P26842,P14210,P43489,O75144,O43927,P32970,Q8WXI7,P10144
0,0,0.0,PM910,4,10.12895,5.62622,7.72902,8.4753,10.08926,10.58751,...,8.54519,4.20874,10.61404,8.37166,4.99296,7.78219,7.03584,4.81481,3.58965,4.13563
1,1,1.0,PM396,4,8.67289,5.18821,8.25523,9.06271,8.89866,9.2236,...,8.25401,3.46839,10.77271,8.3682,4.92422,7.47997,8.057,3.989,4.78155,3.1484
2,2,2.0,PM190,4,9.99567,6.38876,8.44263,8.42102,10.08508,10.43894,...,8.75887,4.65936,11.03062,9.18464,5.60743,7.92803,8.77261,4.80189,5.1635,4.29062
3,3,3.0,PM270,4,8.26407,5.06228,8.13429,8.66165,8.75925,9.2431,...,8.75741,4.44633,10.47952,8.65548,5.54289,9.29458,8.64028,4.04045,5.59217,3.75295
4,4,4.0,PM656,4,9.08833,5.95005,8.3783,8.29127,9.43936,9.83732,...,8.48018,3.81634,10.59295,8.63758,5.16271,7.41098,8.29143,4.59594,4.91665,4.10381
5,5,5.0,PM736,4,8.64457,5.79507,8.27732,8.37578,8.60373,9.03092,...,8.76805,5.53215,10.41412,8.70804,5.83762,7.71496,7.50007,5.59281,5.25662,3.88111
6,6,6.0,PM842,4,10.06218,5.0638,7.74244,8.41381,9.86604,10.05079,...,8.56072,4.13384,10.50253,8.85963,4.95377,7.49059,7.39059,4.0135,5.06355,3.70582
7,7,7.0,PM767,4,9.86577,7.3084,8.29889,8.60954,11.17774,10.68756,...,8.95749,4.62731,11.00948,9.47875,5.84993,7.84098,8.23261,4.39093,5.66446,3.84306
8,8,8.0,PM1497,4,8.37279,5.91951,8.51155,8.31815,8.95261,9.1053,...,8.58994,4.68348,10.49201,8.60244,5.16184,7.31519,7.61259,4.02262,4.4467,3.3108
9,9,9.0,PM746,4,10.34566,5.71417,8.05251,8.38463,10.61735,10.4489,...,8.64534,4.59777,10.50475,8.67217,5.3083,7.66071,7.24119,4.71803,4.29578,3.78925


In [61]:
# Feature matrix (protein columns) and target vector (encoded cancer label)
X = df_bloodcombined.loc[:, proteins]
y = df_bloodcombined.loc[:, 'Cancer']

In [62]:
# Show the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,Q15389,P29965,P49763,Q02763,P01127,P09341,O00182,Q14116,P09601,Q92583,...,P09382,Q16790,P26842,P14210,P43489,O75144,O43927,P32970,Q8WXI7,P10144
0,10.12895,5.62622,7.72902,8.4753,10.08926,10.58751,8.65712,7.81074,11.03461,8.07038,...,8.54519,4.20874,10.61404,8.37166,4.99296,7.78219,7.03584,4.81481,3.58965,4.13563
1,8.67289,5.18821,8.25523,9.06271,8.89866,9.2236,8.46706,8.18828,11.91449,8.15394,...,8.25401,3.46839,10.77271,8.3682,4.92422,7.47997,8.057,3.989,4.78155,3.1484
2,9.99567,6.38876,8.44263,8.42102,10.08508,10.43894,9.0983,9.15899,11.66469,8.5858,...,8.75887,4.65936,11.03062,9.18464,5.60743,7.92803,8.77261,4.80189,5.1635,4.29062
3,8.26407,5.06228,8.13429,8.66165,8.75925,9.2431,8.7225,8.57028,11.16933,7.91727,...,8.75741,4.44633,10.47952,8.65548,5.54289,9.29458,8.64028,4.04045,5.59217,3.75295
4,9.08833,5.95005,8.3783,8.29127,9.43936,9.83732,8.80904,8.18094,11.81232,7.69365,...,8.48018,3.81634,10.59295,8.63758,5.16271,7.41098,8.29143,4.59594,4.91665,4.10381


In [63]:
# Hold out 20% of the samples as a test set, stratified on the class label

from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=100,
)
X_train, X_test, y_train, y_test = split_parts

In [64]:
# Candidate classifiers, keyed by the display name used in the printed reports.
# Alternatives that are currently disabled:
#   "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=100, solver='lbfgs'))
#   "k-Nearest Neighbors (n=5)": KNeighborsClassifier(n_neighbors=5)
#   "k-Nearest Neighbors (n=13)": KNeighborsClassifier(n_neighbors=13)
#   "XGBoost": XGBClassifier(n_estimators=100)
models = dict([
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=100)),
    ("Extra Trees", ExtraTreesClassifier(n_estimators=100, random_state=100)),
])

In [65]:
# Settings shared by the cross-validation and bootstrapping cells
n_splits = 5         # number of folds
n_bootstraps = 1000  # bootstrap repetitions for confidence interval calculation
kf = StratifiedKFold(
    n_splits=n_splits,
    random_state=100,
    shuffle=True,
)



In [66]:
#removed bootstrap to run the code faster
# Manual cross-validation: for every model, imputation and SMOTE are fitted on
# the training part of each fold only, then the model is scored on the held-out
# part of that fold.
for model_name, model in models.items():
    # Print the model evaluation message in bold (ANSI escape codes)
    print(f"\033[1m\nEvaluating {model_name} for the common 49 proteins, combined alldataset(included control, combined BLOOD)\033[0m")

    # KNN imputer for missing values; it is re-fitted on every fold below
    imputer = KNNImputer(n_neighbors=5, weights="uniform")

    # Containers for the per-fold metrics
    fold_accuracies = []
    fold_f1_scores = []
    fold_roc_auc_scores = []
    fold_confusion_matrices = []

    # k-Fold Cross-Validation Loop
    for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
        # Extract train and test sets for the current fold
        X_fold_train = X_train.iloc[train_index]
        X_fold_test = X_train.iloc[test_index]
        y_fold_train = y_train.iloc[train_index]
        y_fold_test = y_train.iloc[test_index]

        # Fit the imputer on the fold's training part only, then apply it to
        # the held-out part
        X_fold_train_imputed = imputer.fit_transform(X_fold_train)
        X_fold_test_imputed = imputer.transform(X_fold_test)

        # Apply SMOTE to balance the training data within each fold
        smote = SMOTE(random_state=100)
        X_resampled, y_resampled = smote.fit_resample(X_fold_train_imputed, y_fold_train)

        # Train the model on the SMOTE-balanced training data of this fold
        model.fit(X_resampled, y_resampled)

        # Predict and evaluate metrics for this fold
        y_pred = model.predict(X_fold_test_imputed)
        y_proba = model.predict_proba(X_fold_test_imputed)
        fold_accuracies.append(accuracy_score(y_fold_test, y_pred))
        fold_f1_scores.append(f1_score(y_fold_test, y_pred, average='weighted'))
        fold_roc_auc_scores.append(roc_auc_score(y_fold_test, y_proba, multi_class='ovr'))
        fold_confusion_matrices.append(confusion_matrix(y_fold_test, y_pred))

        # Print accuracy and other metrics for this fold
        print(f"Fold {i+1} - Accuracy: {fold_accuracies[-1]:.4f}, F1: {fold_f1_scores[-1]:.4f}, ROC AUC: {fold_roc_auc_scores[-1]:.4f}")

        ## Plot the confusion matrix as a heatmap
        #plt.figure(figsize=(8, 6))
        #sns.heatmap(fold_confusion_matrices[-1], annot=True, fmt='d', cmap='Blues',
        #            xticklabels=np.unique(y_train), yticklabels=np.unique(y_train))
        #plt.xlabel('Predicted Label')
        #plt.ylabel('True Label')
        #plt.title(f"{model_name} - Fold {i+1} Confusion Matrix")
        #plt.show()


    # Display averaged accuracy and metrics for the cross-validation
    mean_accuracy = np.mean(fold_accuracies)
    mean_f1_score = np.mean(fold_f1_scores)
    mean_roc_auc = np.mean(fold_roc_auc_scores)
    # Half-width of a normal-approximation interval (1.96 * standard error of
    # the fold accuracies); only a rough indication with n_splits folds.
    confidence_interval = 1.96 * sem(fold_accuracies)

    print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} ± {confidence_interval:.4f}")
    print(f"{model_name} - Mean F1 Score: {mean_f1_score:.4f}")
    print(f"{model_name} - Mean ROC AUC: {mean_roc_auc:.4f}")

    ## You could plot the confusion matrix for the last fold
    #plt.figure(figsize=(8, 6))
    #sns.heatmap(fold_confusion_matrices[-1], annot=True, fmt='d', cmap='Blues',
    #            xticklabels=le.classes_, yticklabels=le.classes_)
    #plt.xlabel('Predicted Label')
    #plt.ylabel('True Label')
    #plt.title(f"{model_name} - Confusion Matrix (Fold {n_splits})")
    #plt.show()

# After cross-validation, we impute on the entire training and test set and train the best model

# Assuming the best model was Random Forest (or another model if selected)
best_model = RandomForestClassifier(n_estimators=100, random_state=100)
imputer = KNNImputer(n_neighbors=5, weights="uniform")

# Impute on entire training set
X_train_imputed = imputer.fit_transform(X_train)

# Impute on the test set (do not fit the imputer again, avoid data leakage)
X_test_imputed = imputer.transform(X_test)

# Balance the training set with SMOTE, exactly as inside every CV fold, so the
# held-out score measures the same impute -> SMOTE -> classifier procedure that
# was cross-validated. The test set is never resampled.
smote = SMOTE(random_state=100)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_imputed, y_train)

# Train on the entire (balanced) training set
best_model.fit(X_train_balanced, y_train_balanced)

# Evaluate on test set
y_test_pred = best_model.predict(X_test_imputed)

# Evaluate final performance metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
test_roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test_imputed), multi_class='ovr')

print(f"\nFinal Test Accuracy: {test_accuracy:.4f}")
print(f"Final Test F1 Score: {test_f1_score:.4f}")
print(f"Final Test ROC AUC: {test_roc_auc:.4f}")


[1m
Evaluating Random Forest for the common 49 proteins, combined alldataset(included control, combined BLOOD)[0m
Fold 1 - Accuracy: 0.6164, F1: 0.6152, ROC AUC: 0.9153
Fold 2 - Accuracy: 0.5902, F1: 0.5931, ROC AUC: 0.8919
Fold 3 - Accuracy: 0.5672, F1: 0.5652, ROC AUC: 0.8974
Fold 4 - Accuracy: 0.5705, F1: 0.5735, ROC AUC: 0.8987
Fold 5 - Accuracy: 0.5921, F1: 0.5946, ROC AUC: 0.9018

Random Forest - Mean Accuracy: 0.5873 ± 0.0173
Random Forest - Mean F1 Score: 0.5883
Random Forest - Mean ROC AUC: 0.9010
[1m
Evaluating Extra Trees for the common 49 proteins, combined alldataset(included control, combined BLOOD)[0m
Fold 1 - Accuracy: 0.6164, F1: 0.6140, ROC AUC: 0.9055
Fold 2 - Accuracy: 0.5836, F1: 0.5830, ROC AUC: 0.8860
Fold 3 - Accuracy: 0.6033, F1: 0.6010, ROC AUC: 0.8941
Fold 4 - Accuracy: 0.5902, F1: 0.5934, ROC AUC: 0.8983
Fold 5 - Accuracy: 0.5658, F1: 0.5705, ROC AUC: 0.9051

Extra Trees - Mean Accuracy: 0.5918 ± 0.0169
Extra Trees - Mean F1 Score: 0.5924
Extra Trees - M

In [67]:
#using pipeline structure for imputation and SMOTE

from imblearn.pipeline import Pipeline  # Use imblearn's pipeline



# Evaluate every candidate model on the same stratified folds
for model_name, model in models.items():
    # Bold header line for this model (ANSI escape codes)
    print(f"\033[1m\nEvaluating {model_name} for the common 49 proteins, combined alldataset(included control, combined BLOOD)\033[0m")

    # Per-fold metric containers
    fold_accuracies, fold_f1_scores = [], []
    fold_roc_auc_scores, fold_confusion_matrices = [], []

    # k-Fold Cross-Validation Loop
    for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
        # Rows of the training set used to fit / to validate in this fold
        X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
        y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]

        # Imputation -> SMOTE -> Classifier, all fitted on the fold's training rows
        pipeline = Pipeline(steps=[
            ('imputer', KNNImputer(n_neighbors=5, weights="uniform")),
            ('smote', SMOTE(random_state=100)),
            ('classifier', model),
        ])
        pipeline.fit(X_fold_train, y_fold_train)

        # Hard predictions and class probabilities for the validation rows
        y_pred = pipeline.predict(X_fold_test)
        y_proba = pipeline.predict_proba(X_fold_test)

        # Record this fold's metrics
        fold_accuracies.append(accuracy_score(y_fold_test, y_pred))
        fold_f1_scores.append(f1_score(y_fold_test, y_pred, average='weighted'))
        fold_roc_auc_scores.append(roc_auc_score(y_fold_test, y_proba, multi_class='ovr'))
        fold_confusion_matrices.append(confusion_matrix(y_fold_test, y_pred))

        # Report the metrics just recorded
        print(f"Fold {i+1} - Accuracy: {fold_accuracies[-1]:.4f}, F1: {fold_f1_scores[-1]:.4f}, ROC AUC: {fold_roc_auc_scores[-1]:.4f}")

    # Averages over the folds; the ± term is 1.96 * standard error of the fold accuracies
    mean_accuracy = np.mean(fold_accuracies)
    mean_f1_score = np.mean(fold_f1_scores)
    mean_roc_auc = np.mean(fold_roc_auc_scores)
    confidence_interval = 1.96 * sem(fold_accuracies)

    print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} ± {confidence_interval:.4f}")
    print(f"{model_name} - Mean F1 Score: {mean_f1_score:.4f}")
    print(f"{model_name} - Mean ROC AUC: {mean_roc_auc:.4f}")

    ## Optionally plot the confusion matrix for the last fold
    #plt.figure(figsize=(8, 6))
    #sns.heatmap(fold_confusion_matrices[-1], annot=True, fmt='d', cmap='Blues',
    #            xticklabels=le.classes_, yticklabels=le.classes_)
    #plt.xlabel('Predicted Label')
    #plt.ylabel('True Label')
    #plt.title(f"{model_name} - Confusion Matrix (Fold {n_splits})")
    #plt.show()

# Final fit: train on the whole training set and score once on the held-out test set

# Assuming the best model was Random Forest (or another model if selected)
best_model = RandomForestClassifier(n_estimators=100, random_state=100)

# Same three steps as in cross-validation, wrapped around the chosen model
final_steps = [
    ('imputer', KNNImputer(n_neighbors=5, weights="uniform")),
    ('smote', SMOTE(random_state=100)),
    ('classifier', best_model),
]
pipeline = Pipeline(final_steps)

# Fit the pipeline on the entire training set
pipeline.fit(X_train, y_train)

# Predictions for the test set
y_test_pred = pipeline.predict(X_test)

# Final performance metrics on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
test_roc_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test), multi_class='ovr')

print(f"\nFinal Test Accuracy: {test_accuracy:.4f}")
print(f"Final Test F1 Score: {test_f1_score:.4f}")
print(f"Final Test ROC AUC: {test_roc_auc:.4f}")


[1m
Evaluating Random Forest for the common 49 proteins, combined alldataset(included control, combined BLOOD)[0m
Fold 1 - Accuracy: 0.6164, F1: 0.6152, ROC AUC: 0.9153
Fold 2 - Accuracy: 0.5902, F1: 0.5931, ROC AUC: 0.8919
Fold 3 - Accuracy: 0.5672, F1: 0.5652, ROC AUC: 0.8974
Fold 4 - Accuracy: 0.5705, F1: 0.5735, ROC AUC: 0.8987
Fold 5 - Accuracy: 0.5921, F1: 0.5946, ROC AUC: 0.9018

Random Forest - Mean Accuracy: 0.5873 ± 0.0173
Random Forest - Mean F1 Score: 0.5883
Random Forest - Mean ROC AUC: 0.9010
[1m
Evaluating Extra Trees for the common 49 proteins, combined alldataset(included control, combined BLOOD)[0m
Fold 1 - Accuracy: 0.6164, F1: 0.6140, ROC AUC: 0.9055
Fold 2 - Accuracy: 0.5836, F1: 0.5830, ROC AUC: 0.8860
Fold 3 - Accuracy: 0.6033, F1: 0.6010, ROC AUC: 0.8941
Fold 4 - Accuracy: 0.5902, F1: 0.5934, ROC AUC: 0.8983
Fold 5 - Accuracy: 0.5658, F1: 0.5705, ROC AUC: 0.9051

Extra Trees - Mean Accuracy: 0.5918 ± 0.0169
Extra Trees - Mean F1 Score: 0.5924
Extra Trees - M

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer
from sklearn.utils import resample  # bootstrap draws below
from imblearn.pipeline import Pipeline  # Use imblearn's pipeline
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import sem

# Encode the cancer types as numbers (if necessary).
# When an earlier cell has already encoded the column, fitting a new encoder on
# the integer codes would turn le.classes_ into those integers and the heatmap
# below would lose the cancer names; in that case the existing `le` is reused.
if not pd.api.types.is_numeric_dtype(df_bloodcombined['Cancer']):
    le = LabelEncoder()
    df_bloodcombined['Cancer'] = le.fit_transform(df_bloodcombined['Cancer'])
# Looked up here so a missing encoder fails immediately, not after all the fits
class_names = le.classes_

X = df_bloodcombined[proteins]  # Ensure proteins is defined in your code
y = df_bloodcombined['Cancer']

# Split data into training and testing sets (the test set is not used in this cell)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y, test_size=0.2, random_state=100)

# Initialize models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=100),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=100),
}

# Cross-validation parameters
n_splits = 5  # Number of folds
# Number of bootstrap fits per fold; the whole pipeline is re-fitted for each
# one, so the run time grows with n_splits * n_bootstraps per model.
n_bootstraps = 1000
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=100)

# Loop over models
for model_name, model in models.items():
    print(f"\nEvaluating {model_name} with a pipeline...")

    # Container for metrics
    fold_accuracies = []
    fold_f1_scores = []
    fold_roc_auc_scores = []
    fold_confusion_matrices = []  # This needs to be initialized before the fold loop

    # k-Fold Cross-Validation Loop
    for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
        # Extract train and test sets for the current fold
        X_fold_train = X_train.iloc[train_index]
        X_fold_test = X_train.iloc[test_index]
        y_fold_train = y_train.iloc[train_index]
        y_fold_test = y_train.iloc[test_index]

        # Create the pipeline: Imputation -> SMOTE -> Classifier
        pipeline = Pipeline([
            ('imputer', KNNImputer(n_neighbors=5, weights="uniform")),  # kNN Imputation
            ('smote', SMOTE(random_state=100)),                        # SMOTE
            ('classifier', model)                                       # Model
        ])

        # Bootstrapping - lists to store the metrics of every bootstrap fit
        bootstrap_accuracies = []
        bootstrap_f1_scores = []
        bootstrap_roc_auc_scores = []

        # Bootstrapping within the fold
        for b in range(n_bootstraps):
            # Draw a bootstrap sample (with replacement) of the fold's training
            # data. The seed changes with b: a constant seed would return the
            # very same sample on every iteration and make the loop pointless.
            X_resampled, y_resampled = resample(X_fold_train, y_fold_train, random_state=100 + b)

            # Fit the pipeline (imputation, SMOTE, classifier) on the bootstrap sample
            pipeline.fit(X_resampled, y_resampled)

            # Predict and evaluate performance on the validation set (test_index)
            y_pred = pipeline.predict(X_fold_test)
            y_proba = pipeline.predict_proba(X_fold_test)
            bootstrap_accuracies.append(accuracy_score(y_fold_test, y_pred))
            bootstrap_f1_scores.append(f1_score(y_fold_test, y_pred, average='weighted'))
            bootstrap_roc_auc_scores.append(roc_auc_score(y_fold_test, y_proba, multi_class='ovr'))

        # Calculate mean accuracy, F1 score, and ROC AUC for this fold from bootstraps
        fold_accuracy = np.mean(bootstrap_accuracies)
        fold_f1_score = np.mean(bootstrap_f1_scores)
        fold_roc_auc = np.mean(bootstrap_roc_auc_scores)

        fold_accuracies.append(fold_accuracy)
        fold_f1_scores.append(fold_f1_score)
        fold_roc_auc_scores.append(fold_roc_auc)

        # Confusion matrix for this fold; y_pred holds the predictions of the
        # last bootstrap fit only
        fold_conf_matrix = confusion_matrix(y_fold_test, y_pred)
        fold_confusion_matrices.append(fold_conf_matrix)

        # Print metrics for this fold
        print(f"Fold {i+1} - Accuracy: {fold_accuracy:.4f}, F1: {fold_f1_score:.4f}, ROC AUC: {fold_roc_auc:.4f}")

    # Display averaged accuracy and metrics for the cross-validation
    mean_accuracy = np.mean(fold_accuracies)
    mean_f1_score = np.mean(fold_f1_scores)
    mean_roc_auc = np.mean(fold_roc_auc_scores)
    # 1.96 * standard error of the per-fold mean accuracies
    confidence_interval = 1.96 * sem(fold_accuracies)

    print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} ± {confidence_interval:.4f}")
    print(f"{model_name} - Mean F1 Score: {mean_f1_score:.4f}")
    print(f"{model_name} - Mean ROC AUC: {mean_roc_auc:.4f}")

    # Optionally plot the confusion matrix for the last fold
    if fold_confusion_matrices:
        plt.figure(figsize=(8, 6))
        sns.heatmap(fold_confusion_matrices[-1], annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names)
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title(f"{model_name} - Confusion Matrix (Last Fold)")
        plt.show()
    else:
        print(f"No confusion matrix to plot for {model_name}.")
