# In this notebook we:
  - Evaluate models on a pancancer dataset of 1477 samples and 49 common proteins (blood cancer types combined into a single class).
  - Handle missing values with KNN imputation.
  - Apply SMOTE within each fold to balance the classes.
  - Train Random Forest and Extra Trees classifiers.
  - Use stratified k-fold cross-validation to assess model performance.
  - Report accuracy, weighted F1 score, and ROC AUC as performance metrics.
  - Encapsulate the entire process (imputation, SMOTE, classification) in a machine learning pipeline.

In [11]:
# Import the packages we may need
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from seaborn import set_style
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import is_classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.utils import resample
from sklearn.multiclass import OneVsRestClassifier  # Correct location for OneVsRestClassifier
from scipy.stats import sem
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix



In [12]:
# Decision tree classifier import. The original note here mentioned random-forest
# feature importances; the class is not referenced in the cells visible in this view.
from sklearn.tree import DecisionTreeClassifier


## This sets the plot style
## to have a grid on a white background
set_style("whitegrid")


In [13]:
# Load the cleaned pancancer table from the working directory and display it
df_full = pd.read_csv('pancancer_cleaned.csv')
df_full

Unnamed: 0,Sample_ID,Cancer,O00182,O43927,O75144,O75509,O76036,O95727,P01127,P01133,...,Q02763,Q13241,Q14116,Q15389,Q16790,Q8WXI7,Q92583,Q9BQ51,Q9NP84,Q9UQV4
0,AML_1,AML,1.59230,3.15170,-0.44360,0.12740,-0.57045,1.29635,4.97385,5.50420,...,0.85000,0.20940,1.08450,3.87015,-1.14620,2.42120,2.04445,0.50115,0.44920,-2.77780
1,AML_10,AML,1.16670,0.30650,0.10625,0.36865,-0.08820,0.14870,0.83920,1.60795,...,0.47620,-0.11250,0.04940,0.23345,0.00320,0.31530,0.83345,-0.12485,-0.07950,1.46690
2,AML_11,AML,1.41675,0.44485,0.00470,0.03025,0.17980,0.55920,1.62395,2.69820,...,0.61460,0.40890,0.16355,1.11750,-0.35315,-1.66810,0.38805,0.02615,-0.16465,1.05760
3,AML_12,AML,0.98105,1.09810,0.34380,0.32815,-0.13315,,-3.21070,-1.52715,...,0.76085,0.85425,0.03160,-1.28095,-0.07265,-0.44885,-0.62225,0.19930,-0.07115,-0.50410
4,AML_13,AML,2.13880,3.80050,-0.45920,0.04570,-0.42470,-0.94935,3.51680,4.11555,...,0.53765,-0.26875,1.02810,1.89000,-0.09185,-0.48200,0.88205,0.85705,0.54480,0.51920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472,PRC_95,PRC,,0.01985,0.17420,0.34940,1.15610,0.52560,,,...,,0.89455,,,-0.69030,0.48140,,0.16315,,-0.00240
1473,PRC_96,PRC,0.34665,0.55720,0.06700,0.35565,0.43470,-0.34745,1.29880,2.34565,...,0.03440,0.26440,-0.19275,1.40880,-0.16545,-0.45445,1.73215,-0.05205,0.62195,0.85200
1474,PRC_97,PRC,1.45545,0.47720,0.18355,1.04070,1.09255,0.58745,3.02390,3.96605,...,0.56755,1.29150,0.96960,2.58860,1.60425,2.85945,3.25395,1.01855,0.62515,1.23215
1475,PRC_98,PRC,1.10305,0.22715,0.07190,0.69890,0.41505,0.88095,4.91230,6.05290,...,0.40275,1.23105,1.34740,3.94170,0.09775,2.70775,3.66375,-0.05185,0.24315,0.95830


In [14]:
#df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
#df

In [15]:
# List the distinct cancer labels present in the raw table
print(df_full['Cancer'].unique())

['AML' 'BRC' 'CLL' 'CRC' 'CVX' 'ENDC' 'GLIOM' 'LUNGC' 'LYMPH' 'MYEL' 'OVC'
 'PRC']


In [62]:
# Combine all the blood cancers into one category.
# The replacement is applied to the 'Cancer' label column only, so a matching
# string in any other column (e.g. Sample_ID) can never be rewritten by accident.
blood_cancers = ['AML', 'CLL', 'LYMPH', 'MYEL']
df_bloodcombined = df_full.copy(deep=True)  # work on a copy so df_full stays untouched
df_bloodcombined['Cancer'] = df_bloodcombined['Cancer'].replace(blood_cancers, 'BLOOD')
#df_bloodcombined = df_bloodcombined[df_bloodcombined['Cancer'] != 'Ctrl'] #Exclude the control group Ctrl

In [63]:
# Verify that the groups were combined as expected: the four blood-cancer
# labels should no longer appear and a single 'BLOOD' label should be present
set(df_bloodcombined['Cancer'])

{'BLOOD', 'BRC', 'CRC', 'CVX', 'ENDC', 'GLIOM', 'LUNGC', 'OVC', 'PRC'}

In [64]:
# Specify the protein list: every column from position 2 onward.
# NOTE(review): assumes the first two columns are the non-protein ones
# (Sample_ID and Cancer in the preview above) — confirm if the CSV layout changes.
proteins=df_bloodcombined.columns[2:]
# Check that we have the right number of proteins
len(proteins)

49

In [65]:
# Preview the first rows after merging the blood cancers
df_bloodcombined.head()

Unnamed: 0,Sample_ID,Cancer,O00182,O43927,O75144,O75509,O76036,O95727,P01127,P01133,...,Q02763,Q13241,Q14116,Q15389,Q16790,Q8WXI7,Q92583,Q9BQ51,Q9NP84,Q9UQV4
0,AML_1,BLOOD,1.5923,3.1517,-0.4436,0.1274,-0.57045,1.29635,4.97385,5.5042,...,0.85,0.2094,1.0845,3.87015,-1.1462,2.4212,2.04445,0.50115,0.4492,-2.7778
1,AML_10,BLOOD,1.1667,0.3065,0.10625,0.36865,-0.0882,0.1487,0.8392,1.60795,...,0.4762,-0.1125,0.0494,0.23345,0.0032,0.3153,0.83345,-0.12485,-0.0795,1.4669
2,AML_11,BLOOD,1.41675,0.44485,0.0047,0.03025,0.1798,0.5592,1.62395,2.6982,...,0.6146,0.4089,0.16355,1.1175,-0.35315,-1.6681,0.38805,0.02615,-0.16465,1.0576
3,AML_12,BLOOD,0.98105,1.0981,0.3438,0.32815,-0.13315,,-3.2107,-1.52715,...,0.76085,0.85425,0.0316,-1.28095,-0.07265,-0.44885,-0.62225,0.1993,-0.07115,-0.5041
4,AML_13,BLOOD,2.1388,3.8005,-0.4592,0.0457,-0.4247,-0.94935,3.5168,4.11555,...,0.53765,-0.26875,1.0281,1.89,-0.09185,-0.482,0.88205,0.85705,0.5448,0.5192


In [66]:
# Encode the cancer types as numbers

from sklearn.preprocessing import LabelEncoder

# Only encode while the labels are still strings. Re-running this cell on an
# already-encoded column would refit the encoder on the integer codes, and
# le.classes_ would then hold 0..k-1 instead of the cancer names.
if not pd.api.types.is_numeric_dtype(df_bloodcombined['Cancer']):
    le = LabelEncoder()
    df_bloodcombined['Cancer'] = le.fit_transform(df_bloodcombined['Cancer'])

In [67]:
# Preview the frame after encoding; the 'Cancer' column should now hold integer codes
df_bloodcombined.head(15)

Unnamed: 0,Sample_ID,Cancer,O00182,O43927,O75144,O75509,O76036,O95727,P01127,P01133,...,Q02763,Q13241,Q14116,Q15389,Q16790,Q8WXI7,Q92583,Q9BQ51,Q9NP84,Q9UQV4
0,AML_1,0,1.5923,3.1517,-0.4436,0.1274,-0.57045,1.29635,4.97385,5.5042,...,0.85,0.2094,1.0845,3.87015,-1.1462,2.4212,2.04445,0.50115,0.4492,-2.7778
1,AML_10,0,1.1667,0.3065,0.10625,0.36865,-0.0882,0.1487,0.8392,1.60795,...,0.4762,-0.1125,0.0494,0.23345,0.0032,0.3153,0.83345,-0.12485,-0.0795,1.4669
2,AML_11,0,1.41675,0.44485,0.0047,0.03025,0.1798,0.5592,1.62395,2.6982,...,0.6146,0.4089,0.16355,1.1175,-0.35315,-1.6681,0.38805,0.02615,-0.16465,1.0576
3,AML_12,0,0.98105,1.0981,0.3438,0.32815,-0.13315,,-3.2107,-1.52715,...,0.76085,0.85425,0.0316,-1.28095,-0.07265,-0.44885,-0.62225,0.1993,-0.07115,-0.5041
4,AML_13,0,2.1388,3.8005,-0.4592,0.0457,-0.4247,-0.94935,3.5168,4.11555,...,0.53765,-0.26875,1.0281,1.89,-0.09185,-0.482,0.88205,0.85705,0.5448,0.5192
5,AML_14,0,4.5651,1.2373,1.2381,1.2057,-0.11685,-0.1653,0.7772,2.2749,...,1.29115,-0.83805,1.1933,1.31915,1.1544,0.34115,0.31695,0.2088,2.0033,-0.0272
6,AML_15,0,3.17215,0.3957,-0.51475,1.18285,1.5708,1.2836,-0.52645,0.25475,...,0.6901,1.3189,1.73905,0.7526,-0.475,1.3278,-0.37605,0.25185,0.4402,0.5736
7,AML_16,0,1.33105,0.4054,0.06235,1.0402,0.11895,2.03875,-3.72715,-1.9597,...,0.56625,0.84855,1.57205,-2.9437,-0.12315,1.1097,-0.37655,0.6684,0.3244,0.083
8,AML_17,0,3.26195,5.1031,,1.71945,1.86235,,1.65395,2.776,...,0.2785,2.1178,0.08635,1.6174,0.8036,,7.30745,,0.89465,1.6354
9,AML_18,0,3.37575,1.41015,0.3999,1.01035,0.7239,0.8102,-2.32635,-0.7101,...,0.9314,1.5402,2.40685,-1.3081,1.35865,1.2928,-2.10125,0.86385,2.26055,1.0129


In [68]:
# Separate the encoded cancer label (target) from the protein measurements (features)
y = df_bloodcombined['Cancer']
X = df_bloodcombined[proteins]

In [69]:
# Preview the feature matrix
X.head()

Unnamed: 0,O00182,O43927,O75144,O75509,O76036,O95727,P01127,P01133,P01730,P05113,...,Q02763,Q13241,Q14116,Q15389,Q16790,Q8WXI7,Q92583,Q9BQ51,Q9NP84,Q9UQV4
0,1.5923,3.1517,-0.4436,0.1274,-0.57045,1.29635,4.97385,5.5042,1.76855,0.12275,...,0.85,0.2094,1.0845,3.87015,-1.1462,2.4212,2.04445,0.50115,0.4492,-2.7778
1,1.1667,0.3065,0.10625,0.36865,-0.0882,0.1487,0.8392,1.60795,0.53895,0.72145,...,0.4762,-0.1125,0.0494,0.23345,0.0032,0.3153,0.83345,-0.12485,-0.0795,1.4669
2,1.41675,0.44485,0.0047,0.03025,0.1798,0.5592,1.62395,2.6982,1.2693,-0.79495,...,0.6146,0.4089,0.16355,1.1175,-0.35315,-1.6681,0.38805,0.02615,-0.16465,1.0576
3,0.98105,1.0981,0.3438,0.32815,-0.13315,,-3.2107,-1.52715,1.3116,1.12495,...,0.76085,0.85425,0.0316,-1.28095,-0.07265,-0.44885,-0.62225,0.1993,-0.07115,-0.5041
4,2.1388,3.8005,-0.4592,0.0457,-0.4247,-0.94935,3.5168,4.11555,2.06045,-0.4323,...,0.53765,-0.26875,1.0281,1.89,-0.09185,-0.482,0.88205,0.85705,0.5448,0.5192


In [70]:
# Hold out 20% of the samples as a test split, stratified on the cancer label

from sklearn.model_selection import train_test_split

split_options = {
    "test_size": 0.2,
    "shuffle": True,
    "stratify": y,
    "random_state": 100,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [71]:
# Candidate classifiers, keyed by the display name used in the printed reports.
# Both tree ensembles share the same number of trees and the same seed.
# Disabled alternatives:
#   "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=100, solver='lbfgs'))
#   "k-Nearest Neighbors (n=5)": KNeighborsClassifier(n_neighbors=5)
#   "k-Nearest Neighbors (n=13)": KNeighborsClassifier(n_neighbors=13)
#   "XGBoost": XGBClassifier(n_estimators=100)
ensemble_kwargs = {"n_estimators": 100, "random_state": 100}
models = {
    "Random Forest": RandomForestClassifier(**ensemble_kwargs),
    "Extra Trees": ExtraTreesClassifier(**ensemble_kwargs),
}

In [72]:
# Cross-validation and bootstrapping parameters
n_splits = 5  # Number of folds
n_bootstraps = 1000  # Number of bootstraps for confidence interval calculation
# Stratified, shuffled folds with a fixed seed, so every model is scored on the same splits
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=100)



In [73]:
# Manual k-fold evaluation of every candidate model
# (bootstrapping is left out here so the cell runs faster)
for model_name, model in models.items():
    print(f"\033[1m\nEvaluating {model_name} for the common 49 proteins, pancancer dataset (combined BLOOD)\033[0m")

    # One KNN imputer per model; it is refitted on the training part of every fold
    imputer = KNNImputer(n_neighbors=5, weights="uniform")

    # Per-fold results collected for this model
    fold_accuracies, fold_f1_scores = [], []
    fold_roc_auc_scores, fold_confusion_matrices = [], []

    for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
        # Rows of the training split that form this fold's train / validation parts
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

        # Learn the imputation from the fold's training part only, then apply it to both parts
        X_fold_train_imputed = imputer.fit_transform(X_fold_train)
        X_fold_test_imputed = imputer.transform(X_fold_test)

        # Oversample the imputed training part; the validation rows are never resampled
        smote = SMOTE(random_state=100)
        X_resampled, y_resampled = smote.fit_resample(X_fold_train_imputed, y_fold_train)

        # Fit the classifier on the balanced training data
        model.fit(X_resampled, y_resampled)

        # Score the validation part: hard predictions for accuracy/F1/confusion matrix,
        # class probabilities for the one-vs-rest ROC AUC
        y_pred = model.predict(X_fold_test_imputed)
        y_proba = model.predict_proba(X_fold_test_imputed)
        fold_accuracies.append(accuracy_score(y_fold_test, y_pred))
        fold_f1_scores.append(f1_score(y_fold_test, y_pred, average='weighted'))
        fold_roc_auc_scores.append(roc_auc_score(y_fold_test, y_proba, multi_class='ovr'))
        fold_confusion_matrices.append(confusion_matrix(y_fold_test, y_pred))

        print(f"Fold {i+1} - Accuracy: {fold_accuracies[-1]:.4f}, F1: {fold_f1_scores[-1]:.4f}, ROC AUC: {fold_roc_auc_scores[-1]:.4f}")

        ## Optional: plot this fold's confusion matrix as a heatmap
        #plt.figure(figsize=(8, 6))
        #sns.heatmap(fold_confusion_matrices[-1], annot=True, fmt='d', cmap='Blues',
        #            xticklabels=np.unique(y_train), yticklabels=np.unique(y_train))
        #plt.xlabel('Predicted Label')
        #plt.ylabel('True Label')
        #plt.title(f"{model_name} - Fold {i+1} Confusion Matrix")
        #plt.show()

    # Average the fold scores; the ± value is 1.96 standard errors of the fold accuracies
    mean_accuracy, mean_f1_score, mean_roc_auc = (
        np.mean(scores) for scores in (fold_accuracies, fold_f1_scores, fold_roc_auc_scores)
    )
    confidence_interval = 1.96 * sem(fold_accuracies)

    print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} ± {confidence_interval:.4f}")
    print(f"{model_name} - Mean F1 Score: {mean_f1_score:.4f}")
    print(f"{model_name} - Mean ROC AUC: {mean_roc_auc:.4f}")

    ## Optional: plot the confusion matrix of the last fold
    #plt.figure(figsize=(8, 6))
    #sns.heatmap(fold_confusion_matrices[-1], annot=True, fmt='d', cmap='Blues',
    #            xticklabels=le.classes_, yticklabels=le.classes_)
    #plt.xlabel('Predicted Label')
    #plt.ylabel('True Label')
    #plt.title(f"{model_name} - Confusion Matrix (Fold {n_splits})")
    #plt.show()

# After cross-validation, refit the chosen model on the entire training split and
# score it once on the held-out test split. The same steps as inside each CV fold
# are applied (KNN imputation -> SMOTE -> classifier), so the test score is
# comparable with the cross-validated estimate.

# Assuming the best model was Random Forest (or another model if selected)
best_model = RandomForestClassifier(n_estimators=100, random_state=100)
imputer = KNNImputer(n_neighbors=5, weights="uniform")

# Impute on entire training set
X_train_imputed = imputer.fit_transform(X_train)

# Impute on the test set (do not fit the imputer again, avoid data leakage)
X_test_imputed = imputer.transform(X_test)

# Balance the training data only; the test set is never resampled
smote = SMOTE(random_state=100)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_imputed, y_train)

# Train on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Evaluate on test set: hard predictions plus class probabilities for ROC AUC
y_test_pred = best_model.predict(X_test_imputed)
y_test_proba = best_model.predict_proba(X_test_imputed)

# Evaluate final performance metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
test_roc_auc = roc_auc_score(y_test, y_test_proba, multi_class='ovr')

print(f"\nFinal Test Accuracy: {test_accuracy:.4f}")
print(f"Final Test F1 Score: {test_f1_score:.4f}")
print(f"Final Test ROC AUC: {test_roc_auc:.4f}")


[1m
Evaluating Random Forest for the common 49 proteins, pancancer dataset (combined BLOOD)[0m
Fold 1 - Accuracy: 0.5063, F1: 0.5087, ROC AUC: 0.8209
Fold 2 - Accuracy: 0.4534, F1: 0.4551, ROC AUC: 0.8296
Fold 3 - Accuracy: 0.5169, F1: 0.5107, ROC AUC: 0.8564
Fold 4 - Accuracy: 0.4619, F1: 0.4652, ROC AUC: 0.8226
Fold 5 - Accuracy: 0.5000, F1: 0.5012, ROC AUC: 0.8394

Random Forest - Mean Accuracy: 0.4877 ± 0.0248
Random Forest - Mean F1 Score: 0.4882
Random Forest - Mean ROC AUC: 0.8338
[1m
Evaluating Extra Trees for the common 49 proteins, pancancer dataset (combined BLOOD)[0m
Fold 1 - Accuracy: 0.4684, F1: 0.4661, ROC AUC: 0.8261
Fold 2 - Accuracy: 0.4364, F1: 0.4390, ROC AUC: 0.8348
Fold 3 - Accuracy: 0.5339, F1: 0.5333, ROC AUC: 0.8473
Fold 4 - Accuracy: 0.4788, F1: 0.4813, ROC AUC: 0.8333
Fold 5 - Accuracy: 0.5042, F1: 0.5095, ROC AUC: 0.8411

Extra Trees - Mean Accuracy: 0.4843 ± 0.0323
Extra Trees - Mean F1 Score: 0.4858
Extra Trees - Mean ROC AUC: 0.8365

Final Test Accura

In [74]:
# Same cross-validation as before, with imputation, SMOTE and the classifier
# chained in a single pipeline object

# imblearn's Pipeline is required because it accepts resamplers such as SMOTE as steps;
# this import replaces the sklearn Pipeline name imported at the top of the notebook
from imblearn.pipeline import Pipeline

for model_name, model in models.items():
    print(f"\033[1m\nEvaluating {model_name} for the common 49 proteins, pancancer dataset (combined BLOOD)\033[0m")

    # Per-fold results collected for this model
    fold_accuracies, fold_f1_scores = [], []
    fold_roc_auc_scores, fold_confusion_matrices = [], []

    for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
        # Rows of the training split that form this fold's train / validation parts
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

        # A fresh pipeline per fold: kNN imputation -> SMOTE -> classifier.
        # SMOTE only acts during fit, so the validation rows are imputed but not resampled.
        fold_steps = [
            ('imputer', KNNImputer(n_neighbors=5, weights="uniform")),
            ('smote', SMOTE(random_state=100)),
            ('classifier', model),
        ]
        pipeline = Pipeline(fold_steps)
        pipeline.fit(X_fold_train, y_fold_train)

        # Score the validation part
        y_pred = pipeline.predict(X_fold_test)
        y_proba = pipeline.predict_proba(X_fold_test)
        fold_accuracies.append(accuracy_score(y_fold_test, y_pred))
        fold_f1_scores.append(f1_score(y_fold_test, y_pred, average='weighted'))
        fold_roc_auc_scores.append(roc_auc_score(y_fold_test, y_proba, multi_class='ovr'))
        fold_confusion_matrices.append(confusion_matrix(y_fold_test, y_pred))

        print(f"Fold {i+1} - Accuracy: {fold_accuracies[-1]:.4f}, F1: {fold_f1_scores[-1]:.4f}, ROC AUC: {fold_roc_auc_scores[-1]:.4f}")

    # Average the fold scores; the ± value is 1.96 standard errors of the fold accuracies
    mean_accuracy, mean_f1_score, mean_roc_auc = (
        np.mean(scores) for scores in (fold_accuracies, fold_f1_scores, fold_roc_auc_scores)
    )
    confidence_interval = 1.96 * sem(fold_accuracies)

    print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} ± {confidence_interval:.4f}")
    print(f"{model_name} - Mean F1 Score: {mean_f1_score:.4f}")
    print(f"{model_name} - Mean ROC AUC: {mean_roc_auc:.4f}")

    ## Optional: plot the confusion matrix of the last fold
    #plt.figure(figsize=(8, 6))
    #sns.heatmap(fold_confusion_matrices[-1], annot=True, fmt='d', cmap='Blues',
    #            xticklabels=le.classes_, yticklabels=le.classes_)
    #plt.xlabel('Predicted Label')
    #plt.ylabel('True Label')
    #plt.title(f"{model_name} - Confusion Matrix (Fold {n_splits})")
    #plt.show()

# Final model: refit the full pipeline on the whole training split and
# score it once on the held-out test split

# Assuming the best model was Random Forest (or another model if selected)
best_model = RandomForestClassifier(n_estimators=100, random_state=100)

# Same three steps as in cross-validation: kNN imputation -> SMOTE -> classifier
final_steps = [
    ('imputer', KNNImputer(n_neighbors=5, weights="uniform")),
    ('smote', SMOTE(random_state=100)),
    ('classifier', best_model),
]
pipeline = Pipeline(final_steps)
pipeline.fit(X_train, y_train)

# Hard predictions and class probabilities for the test split
y_test_pred = pipeline.predict(X_test)
y_test_proba = pipeline.predict_proba(X_test)

# Final performance metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
test_roc_auc = roc_auc_score(y_test, y_test_proba, multi_class='ovr')

print(f"\nFinal Test Accuracy: {test_accuracy:.4f}")
print(f"Final Test F1 Score: {test_f1_score:.4f}")
print(f"Final Test ROC AUC: {test_roc_auc:.4f}")


[1m
Evaluating Random Forest for the common 49 proteins, pancancer dataset (combined BLOOD)[0m
Fold 1 - Accuracy: 0.5063, F1: 0.5087, ROC AUC: 0.8209
Fold 2 - Accuracy: 0.4534, F1: 0.4551, ROC AUC: 0.8296
Fold 3 - Accuracy: 0.5169, F1: 0.5107, ROC AUC: 0.8564
Fold 4 - Accuracy: 0.4619, F1: 0.4652, ROC AUC: 0.8226
Fold 5 - Accuracy: 0.5000, F1: 0.5012, ROC AUC: 0.8394

Random Forest - Mean Accuracy: 0.4877 ± 0.0248
Random Forest - Mean F1 Score: 0.4882
Random Forest - Mean ROC AUC: 0.8338
[1m
Evaluating Extra Trees for the common 49 proteins, pancancer dataset (combined BLOOD)[0m
Fold 1 - Accuracy: 0.4684, F1: 0.4661, ROC AUC: 0.8261
Fold 2 - Accuracy: 0.4364, F1: 0.4390, ROC AUC: 0.8348
Fold 3 - Accuracy: 0.5339, F1: 0.5333, ROC AUC: 0.8473
Fold 4 - Accuracy: 0.4788, F1: 0.4813, ROC AUC: 0.8333
Fold 5 - Accuracy: 0.5042, F1: 0.5095, ROC AUC: 0.8411

Extra Trees - Mean Accuracy: 0.4843 ± 0.0323
Extra Trees - Mean F1 Score: 0.4858
Extra Trees - Mean ROC AUC: 0.8365

Final Test Accura

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer
from imblearn.pipeline import Pipeline  # Use imblearn's pipeline
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import sem

# Encode the cancer types as numbers, but only while they are still strings.
# The column was already encoded by the earlier LabelEncoder cell; fitting a new
# encoder on those integer codes would make le.classes_ hold 0..k-1 instead of the
# cancer names, and the heatmap below uses le.classes_ for its tick labels.
# When the column is already numeric, the previously fitted `le` is kept as is.
if not pd.api.types.is_numeric_dtype(df_bloodcombined['Cancer']):
    le = LabelEncoder()
    df_bloodcombined['Cancer'] = le.fit_transform(df_bloodcombined['Cancer'])

X = df_bloodcombined[proteins]  # Ensure proteins is defined in your code
y = df_bloodcombined['Cancer']

# Split data into training and testing sets (but we only use the test set later for final evaluation)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y, test_size=0.2, random_state=100)

# Candidate classifiers; both ensembles share the same number of trees and seed
ensemble_kwargs = {"n_estimators": 100, "random_state": 100}
models = {
    "Random Forest": RandomForestClassifier(**ensemble_kwargs),
    "Extra Trees": ExtraTreesClassifier(**ensemble_kwargs),
}

# Cross-validation parameters.
# Cost note: the loop below fits the pipeline once per bootstrap, per fold, per model,
# i.e. n_bootstraps * n_splits * len(models) fits with these settings.
n_splits = 5        # Number of folds
n_bootstraps = 1000 # Number of bootstraps for confidence interval calculation
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=100)

# Bootstrapped cross-validation: within every fold the pipeline is refitted on
# n_bootstraps resampled versions of the fold's training rows and scored on the
# fold's validation rows; the fold score is the mean over the bootstrap replicates.

# `resample` is missing from this cell's import list; import it here so the cell
# does not depend on the import cell at the top of the notebook.
from sklearn.utils import resample

# Loop over models
for model_name, model in models.items():
    print(f"\nEvaluating {model_name} with a pipeline...")

    # Container for metrics
    fold_accuracies = []
    fold_f1_scores = []
    fold_roc_auc_scores = []
    fold_confusion_matrices = []  # This needs to be initialized before the fold loop

    # k-Fold Cross-Validation Loop
    for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
        # Extract train and test sets for the current fold
        X_fold_train = X_train.iloc[train_index]
        X_fold_test = X_train.iloc[test_index]
        y_fold_train = y_train.iloc[train_index]
        y_fold_test = y_train.iloc[test_index]

        # Create the pipeline: Imputation -> SMOTE -> Classifier
        pipeline = Pipeline([
            ('imputer', KNNImputer(n_neighbors=5, weights="uniform")),  # kNN Imputation
            ('smote', SMOTE(random_state=100)),                        # SMOTE
            ('classifier', model)                                       # Model
        ])

        # Bootstrapping - lists to store the metrics of every bootstrap replicate
        bootstrap_accuracies = []
        bootstrap_f1_scores = []
        bootstrap_roc_auc_scores = []

        # Bootstrapping within the fold
        for b in range(n_bootstraps):
            # Draw a bootstrap sample (with replacement) of the fold's training rows.
            # The seed changes with b: a constant seed would return the identical
            # sample on every iteration and make all replicates the same.
            # Stratifying on the label keeps the class proportions of the fold, so a
            # replicate cannot drop a rare class (which would break SMOTE and the
            # multi-class ROC AUC).
            X_resampled, y_resampled = resample(
                X_fold_train,
                y_fold_train,
                replace=True,
                stratify=y_fold_train,
                random_state=100 + b,
            )

            # Fit the whole pipeline (imputation, SMOTE, classifier) on the bootstrap sample
            pipeline.fit(X_resampled, y_resampled)

            # Predict and evaluate performance on the validation set (test_index)
            y_pred = pipeline.predict(X_fold_test)
            y_proba = pipeline.predict_proba(X_fold_test)
            bootstrap_accuracies.append(accuracy_score(y_fold_test, y_pred))
            bootstrap_f1_scores.append(f1_score(y_fold_test, y_pred, average='weighted'))
            bootstrap_roc_auc_scores.append(roc_auc_score(y_fold_test, y_proba, multi_class='ovr'))

        # Calculate mean accuracy, F1 score, and ROC AUC for this fold from bootstraps
        fold_accuracy = np.mean(bootstrap_accuracies)
        fold_f1_score = np.mean(bootstrap_f1_scores)
        fold_roc_auc = np.mean(bootstrap_roc_auc_scores)

        fold_accuracies.append(fold_accuracy)
        fold_f1_scores.append(fold_f1_score)
        fold_roc_auc_scores.append(fold_roc_auc)

        # Confusion matrix for this fold. y_pred still holds the predictions of the
        # LAST bootstrap replicate, so the matrix describes that single replicate.
        fold_conf_matrix = confusion_matrix(y_fold_test, y_pred)
        fold_confusion_matrices.append(fold_conf_matrix)

        # Print metrics for this fold
        print(f"Fold {i+1} - Accuracy: {fold_accuracy:.4f}, F1: {fold_f1_score:.4f}, ROC AUC: {fold_roc_auc:.4f}")

    # Display averaged accuracy and metrics for the cross-validation.
    # The ± value is 1.96 standard errors of the per-fold mean accuracies.
    mean_accuracy = np.mean(fold_accuracies)
    mean_f1_score = np.mean(fold_f1_scores)
    mean_roc_auc = np.mean(fold_roc_auc_scores)
    confidence_interval = 1.96 * sem(fold_accuracies)

    print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} ± {confidence_interval:.4f}")
    print(f"{model_name} - Mean F1 Score: {mean_f1_score:.4f}")
    print(f"{model_name} - Mean ROC AUC: {mean_roc_auc:.4f}")

    # Optionally plot the confusion matrix for the last fold
    if fold_confusion_matrices:
        plt.figure(figsize=(8, 6))
        sns.heatmap(fold_confusion_matrices[-1], annot=True, fmt='d', cmap='Blues',
                    xticklabels=le.classes_, yticklabels=le.classes_)
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title(f"{model_name} - Confusion Matrix (Last Fold)")
        plt.show()
    else:
        print(f"No confusion matrix to plot for {model_name}.")
