# OS Status: classical ML models

In [None]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
# Importing necessary modules for preprocessing, model training, and evaluation
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Importing classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
# Importing metrics
from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, classification_report,
                             confusion_matrix, roc_curve, precision_recall_curve)
# For interpretability
import shap

# Global seed used for shuffling and for the estimators' random_state.
# 42 carries no special meaning; it is simply the conventional choice.
SEED = 42
# Per the Keras documentation this helper seeds Python's `random`, NumPy and TensorFlow in one call.
tf.keras.utils.set_random_seed(SEED)



In [None]:
# Read the raw cohort table from disk.
# The dataset is not distributed with the notebook: request it from the author.
data = pd.read_csv(
    'dataset_b.csv',
    encoding='latin-1',
    sep=',',
)

In [3]:
# Model inputs plus the binary target column "os_status".
relevant_columns = [
    'age', 'dcr', 'dnlr', 'histology', 'immuno_line', 'iorr',
    'ldhpre', 'leucotpre', 'nb_meta_beforeimmuno', 'neuttpre',
    'ps_befimmuno', 'sex', 'smoking_history', 'os_status',
]

# Columns that are cast to plain integers once incomplete rows are removed.
integer_columns = ['dcr', 'age', 'iorr', 'ps_befimmuno']

# Keep only the relevant columns, drop every row with a missing value
# (the integer cast would fail on NaN), then apply the casts in one step.
data = (
    data[relevant_columns]
    .dropna(axis=0)
    .astype({column: int for column in integer_columns})
)

In [4]:
# Normalise the casing of the categorical text columns so that the same
# category written with different capitalisation maps to a single level.
for text_column in ('histology', 'sex', 'smoking_history'):
    data[text_column] = data[text_column].str.lower()

In [None]:
# Drop any row that still contains a missing value, then show the
# remaining (rows, columns) as the cell output.
data = data.dropna(axis="index")
data.shape

In [6]:
# Shuffle the rows (reproducibly) before any splitting takes place.
data = data.sample(frac=1, random_state=SEED)

# Categorical columns that are expanded into one indicator column per level.
categorical_columns = ['histology', 'sex', 'smoking_history']

# Mapping from capitalised dummy-column names to their lower-case form.
# NOTE(review): the categorical values are lower-cased in an earlier cell, so
# these capitalised keys presumably never match and the rename is a no-op - confirm.
lowercase_dummy_names = {
    'histology_Adenocarcinoma': 'histology_adenocarcinoma',
    'histology_Squamous': 'histology_squamous',
    'histology_Nsclc_other': 'histology_nsclc_other',
    'histology_Large_cells': 'histology_large_cells',
    'sex_Male': 'sex_male',
    'sex_Female': 'sex_female',
    'smoking_history_Non_smoker': 'smoking_history_non_smoker',
    'smoking_history_Former': 'smoking_history_former',
    'smoking_history_Current': 'smoking_history_current',
    'smoking_history_Unk': 'smoking_history_unk',
}

# One-hot encode, then apply the (possibly empty) rename.
one_hot_data = pd.get_dummies(data, columns=categorical_columns)
one_hot_data = one_hot_data.rename(columns=lowercase_dummy_names)

In [None]:
# Convert the one-hot indicator columns from booleans to 0/1 integers.
# The columns are selected by the prefixes pd.get_dummies generated instead of
# a hard-coded list of category names: the previous list spelled some levels
# with spaces ("nsclc other") while the rename step uses underscores
# ("nsclc_other"), so whichever spelling is absent from the data raised a KeyError.
dummy_prefixes = ('histology_', 'sex_', 'smoking_history_')
dummy_columns = [
    column for column in one_hot_data.columns if column.startswith(dummy_prefixes)
]
# astype(int) maps False/True to 0/1 and avoids the silent-downcasting
# FutureWarning that Series.replace emits on boolean data in recent pandas.
one_hot_data[dummy_columns] = one_hot_data[dummy_columns].astype(int)

In [None]:
# Separate the predictors from the target.
# Index.difference returns the remaining column labels in sorted order, which
# therefore becomes the feature order every model is fitted on.
feature_columns = one_hot_data.columns.difference(['os_status'])
X = one_hot_data[feature_columns]
y = data['os_status']

# Step 1: hold out 20% as the test set, stratified on the target so the
# class balance is preserved. X_temp / y_temp hold the remaining 80%.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Step 2: split the remaining 80% into 75% train / 25% validation,
# i.e. 60% / 20% / 20% (train / validation / test) of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=SEED,
    stratify=y_temp,
)

In [None]:
# Standardise the numerical features so they contribute on a comparable scale.
numerical_features = ['age', 'dcr', 'dnlr', 'ldhpre', 'leucotpre',
                      'nb_meta_beforeimmuno', 'neuttpre', 'ps_befimmuno']

# Every remaining predictor is treated as a binary indicator and left unscaled.
binary_features = [col for col in X.columns if col not in numerical_features]

# Work on copies so the unscaled frames stay available for the tree models.
X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()
X_test_scaled = X_test.copy()
X_scaled = X.copy()
X_train_val_scaled = X_temp.copy()

# Scaler fitted on the complete dataset; only used for X_scaled.
full_scaler = StandardScaler()
X_scaled[numerical_features] = full_scaler.fit_transform(X_scaled[numerical_features])

# Scaler fitted on the training split only, applied to train and validation.
train_scaler = StandardScaler()
X_train_scaled[numerical_features] = train_scaler.fit_transform(X_train_scaled[numerical_features])
X_val_scaled[numerical_features] = train_scaler.transform(X_val_scaled[numerical_features])

# Scaler fitted on train+validation. The final models are fitted on
# X_train_val_scaled, so the test set must be transformed with these same
# statistics. Previously the test set was scaled with the train-only scaler,
# which was then re-fitted on train+validation, so the test features did not
# match the features the final models were trained on.
scaler = StandardScaler()
X_train_val_scaled[numerical_features] = scaler.fit_transform(X_train_val_scaled[numerical_features])
X_test_scaled[numerical_features] = scaler.transform(X_test_scaled[numerical_features])

In [None]:
def evaluate_model(y_true, y_pred, y_proba, class_names=None):
    """
    Print metrics and draw diagnostic plots for a binary classification model.

    Prints accuracy, precision, recall, F1 and the full classification report,
    then shows the confusion matrix, the ROC curve and the precision-recall curve.

    Parameters:
        y_true: array-like, true labels (0 or 1)
        y_pred: array-like, predicted labels (0 or 1)
        y_proba: array-like, scores from the model; either shape [n_samples]
            or [n_samples, 2], in which case column 1 is used as the score of
            the positive class
        class_names: optional list with the two display names for the classes;
            defaults to ['Class 0', 'Class 1']

    Returns:
        None. All results are printed or plotted.
    """

    # If class names are not provided
    if class_names is None:
        class_names = ['Class 0', 'Class 1']

    # Accept any array-like (list, Series, DataFrame, ndarray): plain lists
    # have no .ndim / .shape, so coerce before inspecting the dimensions.
    y_proba = np.asarray(y_proba)

    # If y_proba has 2 columns, take the probability of the positive class
    if y_proba.ndim > 1 and y_proba.shape[1] == 2:
        y_proba_pos = y_proba[:, 1]
    else:
        y_proba_pos = y_proba

    # --- Global metrics ---
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}\n")

    # --- Complete Report ---
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=class_names))

    # --- Confusion Matrix ---
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

    # Annotate each cell with its count; switch to white text on dark cells.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     ha="center", va="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.show()

    # --- ROC Curve & PR Curve ---
    # Best effort: if the score-based metrics cannot be computed, the error is
    # reported and the threshold-based metrics printed above still stand.
    try:
        auc_roc = roc_auc_score(y_true, y_proba_pos)
        avg_precision = average_precision_score(y_true, y_proba_pos)
        print(f"AUC-ROC: {auc_roc:.4f}")
        print(f"Average Precision (PR AUC): {avg_precision:.4f}")

        # ROC curve
        fpr, tpr, _ = roc_curve(y_true, y_proba_pos)
        plt.figure(figsize=(6, 5))
        plt.plot(fpr, tpr, label=f"AUC = {auc_roc:.4f}")
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve")
        plt.legend(loc="lower right")
        plt.show()

        # Precision-Recall curve
        precision_vals, recall_vals, _ = precision_recall_curve(y_true, y_proba_pos)
        plt.figure(figsize=(6, 5))
        plt.plot(recall_vals, precision_vals, label=f"AP = {avg_precision:.4f}")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title("Precision-Recall Curve")
        plt.legend(loc="lower left")
        plt.show()

    except Exception as e:
        print("ROC/PR Curve Error:", e)


# Logistic Regression

In [11]:
# Feature names in exactly the column order the models are fitted on.
# SHAP explainers convert DataFrames to arrays and index features by position.
# Building the list as `binary_features + numerical_features` re-ordered the
# columns of the frames below relative to the training data, so attributions
# (and plot labels) were assigned to the wrong features.
feature_names = list(X.columns)

# DataFrame views of the scaled splits used as SHAP background / explained rows.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)
X_train_val_scaled_df = pd.DataFrame(X_train_val_scaled, columns=feature_names)


In [12]:
# Metrics recorded for every grid-search candidate. Each key is the label used
# in the results and each value is the scikit-learn scorer name (identical here).
scoring = {
    metric: metric
    for metric in (
        'accuracy',
        'f1_macro',
        'precision_macro',
        'recall_macro',
        'neg_log_loss',
    )
}


In [None]:
# Hyper-parameter grid explored for logistic regression.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l2', None],
    fit_intercept=[True, False],
    class_weight=[None, 'balanced'],
)

# Metrics recorded for every candidate (label -> scorer name).
scoring = {
    metric: metric
    for metric in ('accuracy', 'f1_macro', 'precision_macro', 'recall_macro', 'neg_log_loss')
}

# 5-fold cross-validated search on the scaled train+validation data.
# The candidate with the best macro F1 is refitted on the full set.
grid = GridSearchCV(
    estimator=LogisticRegression(solver='lbfgs', max_iter=1000, random_state=SEED),
    param_grid=param_grid,
    scoring=scoring,
    refit='f1_macro',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train_val_scaled, y_temp)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


In [None]:
# Take the winning logistic regression and fit it on the scaled
# train+validation data before scoring it on the held-out test set.
lr_model_best = grid.best_estimator_
lr_model_best.fit(X_train_val_scaled, y_temp)

# Hard predictions and class probabilities for the test set.
y_pred_lr_best = lr_model_best.predict(X_test_scaled)
y_prob_lr_best = lr_model_best.predict_proba(X_test_scaled)

print("\nTest Set Evaluation:")
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_lr_best,
    y_proba=y_prob_lr_best,
)

In [None]:
# SHAP explanation of the logistic regression: the scaled train+validation
# frame is the background data and the scaled test rows are explained.
explainer = shap.Explainer(lr_model_best, X_train_val_scaled_df)
shap_values = explainer(X_test_scaled_df)

# Positive class (1)
print("SHAP plot for class 1:")
shap.plots.beeswarm(
    shap_values,
    max_display=len(feature_names),  # show every feature
    show=True,
)



# Random Forest

In [None]:
# Hyper-parameter grid explored for the random forest.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    class_weight=['balanced', None],
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
)

# 5-fold cross-validated search on the unscaled train+validation data.
# The candidate with the best macro F1 is refitted on the full set.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=SEED),
    param_grid,
    scoring=scoring,
    refit='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_temp, y_temp)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


In [None]:
# Score the refitted best random forest on the held-out (unscaled) test set.
rf_model_best = grid_search.best_estimator_

y_pred_rf_best = rf_model_best.predict(X_test)        # hard labels
y_prob_rf_best = rf_model_best.predict_proba(X_test)  # class probabilities
evaluate_model(y_true=y_test, y_pred=y_pred_rf_best, y_proba=y_prob_rf_best)

In [13]:
# Unscaled frames, with columns selected in `feature_names` order, used by the
# SHAP cells of the tree-based models: test rows to explain, and
# train+validation rows as background data.
X_test_df = pd.DataFrame(data=X_test, columns=feature_names)
X_train_df = pd.DataFrame(data=X_temp, columns=feature_names)


In [None]:
# Tree SHAP for the random forest, using the path-dependent algorithm
# (no background dataset is passed).
explainer = shap.TreeExplainer(
    rf_model_best,
    feature_perturbation="tree_path_dependent",
)
shap_values = explainer(X_test_df)

# The last axis indexes the output classes; plot the attributions for class 1.
shap.plots.beeswarm(
    shap_values[:, :, 1],
    max_display=len(feature_names),
    show=True,
)

# Gradient Boosting

In [None]:
# Hyper-parameter grid explored for gradient boosting.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    max_features=['sqrt', 'log2', None],
)

gb = GradientBoostingClassifier(random_state=SEED)

# 5-fold cross-validated search on the unscaled train+validation data.
# The candidate with the best macro F1 is refitted on the full set.
grid_search = GridSearchCV(
    gb,
    param_grid,
    scoring=scoring,
    refit='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_temp, y_temp)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
# Score the refitted best gradient-boosting model on the held-out test set.
gb_model_best = grid_search.best_estimator_

y_pred_gb_best = gb_model_best.predict(X_test)        # hard labels
y_prob_gb_best = gb_model_best.predict_proba(X_test)  # class probabilities
evaluate_model(y_true=y_test, y_pred=y_pred_gb_best, y_proba=y_prob_gb_best)

In [None]:
# Explain the very model that was evaluated on the test set. It was fitted by
# the grid search on the unscaled X_temp, so the unscaled train+validation
# rows are the background data and the unscaled test rows are explained.
# The previous version re-fitted the model on *scaled* data and then explained
# *unscaled* test rows, which both replaced the evaluated model and fed it
# inputs on a different scale than it was trained on.
explainer = shap.Explainer(gb_model_best.predict_proba, X_temp, feature_perturbation="interventional")
shap_values = explainer(X_test)

# The last axis indexes the output classes; plot the attributions for class 1.
shap.plots.beeswarm(shap_values[:, :, 1], max_display=len(feature_names), show=True)


# XGBoost

In [None]:
# Hyper-parameter grid explored for XGBoost.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 1],
)

# Binary logistic objective, evaluated internally with log-loss.
# NOTE(review): `use_label_encoder` looks deprecated in recent xgboost
# releases - confirm against the installed version.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=SEED,
)

# 5-fold cross-validated search on the unscaled train+validation data.
# The candidate with the best macro F1 is refitted on the full set.
grid_search = GridSearchCV(
    xgb_clf,
    param_grid,
    scoring=scoring,
    refit='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_temp, y_temp)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
 

In [None]:
# Score the refitted best XGBoost model on the held-out test set.
best_xgb_model = grid_search.best_estimator_

y_pred_xgb_best = best_xgb_model.predict(X_test)        # hard labels
y_prob_xgb_best = best_xgb_model.predict_proba(X_test)  # class probabilities
evaluate_model(y_true=y_test, y_pred=y_pred_xgb_best, y_proba=y_prob_xgb_best)

In [None]:
# SHAP explanation of the XGBoost model: the unscaled train+validation frame
# is the background data and the unscaled test rows are explained.
explainer = shap.Explainer(best_xgb_model, X_train_df)
shap_values = explainer(X_test_df)

shap.plots.beeswarm(
    shap_values,
    max_display=len(feature_names),  # show every feature
    show=True,
)

# Light Gradient Boosting Machine

In [None]:
# Hyper-parameter grid explored for LightGBM.
param_grid = dict(
    num_leaves=[31, 63],
    max_depth=[-1, 10],
    learning_rate=[0.01, 0.1],
    min_child_samples=[1, 10],
    min_split_gain=[0.0, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Metrics recorded for every candidate (label -> scorer name).
scoring = {
    metric: metric
    for metric in ('accuracy', 'f1_macro', 'precision_macro', 'recall_macro', 'neg_log_loss')
}

lgbm = LGBMClassifier(
    objective='binary',
    force_col_wise=True,
    random_state=SEED,
)

# 5-fold cross-validated search on the unscaled train+validation data.
# The candidate with the best macro F1 is refitted on the full set.
grid = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring=scoring,
    refit="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_temp, y_temp)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


In [None]:
# Score the refitted best LightGBM model on the held-out test set.
best_lgbm_model = grid.best_estimator_

y_pred_lgbm_best = best_lgbm_model.predict(X_test)        # hard labels
y_prob_lgbm_best = best_lgbm_model.predict_proba(X_test)  # class probabilities
evaluate_model(y_true=y_test, y_pred=y_pred_lgbm_best, y_proba=y_prob_lgbm_best)

In [None]:
# Tree SHAP for the LightGBM model, using the path-dependent algorithm
# (no background dataset is passed).
explainer = shap.TreeExplainer(
    best_lgbm_model,
    feature_perturbation="tree_path_dependent",
)
shap_values = explainer(X_test_df)

shap.plots.beeswarm(
    shap_values,
    max_display=len(feature_names),  # show every feature
    show=True,
)

# SVC

In [None]:
# Hyper-parameter grid explored for the support vector classifier.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.01, 0.1, 1],
    kernel=['rbf', 'linear', 'sigmoid'],
    class_weight=['balanced', None],
    shrinking=[True, False],
)

# probability=True is required so that predict_proba is available afterwards.
svc = SVC(probability=True, random_state=SEED)

# 5-fold cross-validated search on the scaled train+validation data.
# The candidate with the best macro F1 is refitted on the full set.
grid = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring=scoring,
    refit="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_val_scaled, y_temp)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


In [None]:
# Score the refitted best SVC on the held-out test set.
best_svc_model = grid.best_estimator_

# The SVC was fitted on the *scaled* train+validation features, so it must be
# evaluated on the scaled test features as well. It was previously given the
# unscaled X_test, which makes the reported test metrics meaningless.
y_pred_svc_best = best_svc_model.predict(X_test_scaled)
y_prob_svc_best = best_svc_model.predict_proba(X_test_scaled)
evaluate_model(y_test, y_pred_svc_best, y_prob_svc_best)

In [None]:
# Kernel SHAP for the SVC. The background set is the scaled train+validation
# data summarised by k-means with 10 clusters.
background = shap.kmeans(X_train_val_scaled_df, 10)
explainer = shap.KernelExplainer(best_svc_model.predict_proba, background)
shap_values = explainer.shap_values(X_test_scaled_df, nsamples=100)

# The last axis indexes the output classes; keep the attributions for class 1
# and wrap them in an Explanation so the beeswarm plot can show feature values.
positive_class_values = shap_values[:, :, 1]
explanation = shap.Explanation(
    values=positive_class_values,
    data=X_test_scaled_df,
    feature_names=feature_names,
)
shap.plots.beeswarm(explanation, max_display=len(feature_names))

# K Neighbors

In [None]:
# Hyper-parameter grid explored for k-nearest neighbours.
# p is the Minkowski power: 1 = Manhattan distance, 2 = Euclidean distance.
param_grid = dict(
    n_neighbors=[10, 12, 15, 17, 20],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

knn = KNeighborsClassifier(algorithm='auto')

# 5-fold cross-validated search on the scaled train+validation data.
# The candidate with the best macro F1 is refitted on the full set.
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring=scoring,
    refit="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_val_scaled, y_temp)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

In [None]:
# Score the refitted best KNN model on the held-out test set.
best_knn_model = grid.best_estimator_

# KNN is distance based and was fitted on the *scaled* train+validation
# features, so the test features must be scaled the same way. It was
# previously given the unscaled X_test, which distorts every neighbour lookup.
y_pred_knn_best = best_knn_model.predict(X_test_scaled)
y_prob_knn_best = best_knn_model.predict_proba(X_test_scaled)
evaluate_model(y_test, y_pred_knn_best, y_prob_knn_best)

In [None]:
# Kernel SHAP for the KNN model. To keep the computation fast, the background
# is a random sample of 50 scaled training rows and only the first 20 scaled
# test rows are explained.
background = X_train_scaled_df.sample(n=50, random_state=SEED)
X_subset = X_test_scaled_df.iloc[:20]

explainer = shap.KernelExplainer(best_knn_model.predict_proba, background)
shap_values = explainer.shap_values(X_subset)

# The last axis indexes the output classes; keep the attributions for class 1.
positive_class_values = shap_values[:, :, 1]
explanation = shap.Explanation(
    values=positive_class_values,
    data=X_subset,
    feature_names=feature_names,
)

shap.plots.beeswarm(explanation, max_display=len(feature_names), show=True)

# Ridge Classifier

In [None]:
# Hyper-parameter grid explored for the ridge classifier.
param_grid = dict(
    alpha=[0.1, 1.0, 10, 100],
    class_weight=[None, 'balanced'],
    solver=['auto', 'sag', 'lsqr', 'sparse_cg'],
)

# Same metrics as the other models, minus neg_log_loss.
scoring_rc = {
    metric: metric
    for metric in ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')
}

rc = RidgeClassifier(random_state=SEED)

# 5-fold cross-validated search on the scaled train+validation data.
# The candidate with the best macro F1 is refitted on the full set.
grid = GridSearchCV(
    estimator=rc,
    param_grid=param_grid,
    scoring=scoring_rc,
    refit="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_val_scaled, y_temp)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


In [None]:
# Score the refitted best ridge classifier on the scaled held-out test set.
best_rc_model = grid.best_estimator_

y_pred_rc_best = best_rc_model.predict(X_test_scaled)
# The signed decision-function values are passed as the score argument here,
# in place of class probabilities.
y_prob_rc_best = best_rc_model.decision_function(X_test_scaled)
evaluate_model(y_true=y_test, y_pred=y_pred_rc_best, y_proba=y_prob_rc_best)

In [None]:
# Linear SHAP for the ridge classifier, with the scaled train+validation data
# as an independent-feature masker (background distribution).
masker = shap.maskers.Independent(X_train_val_scaled_df)
explainer = shap.LinearExplainer(best_rc_model, masker)
shap_values = explainer.shap_values(X_test_scaled_df)

# Wrap the raw values in an Explanation so the beeswarm plot can show the
# feature values alongside the attributions.
explanation = shap.Explanation(
    values=shap_values,
    data=X_test_scaled_df,
    feature_names=feature_names,
)
shap.plots.beeswarm(
    explanation,
    max_display=len(feature_names),
    show=True,
)


# Naive Bayes

In [None]:
# Gaussian naive Bayes has no hyper-parameters tuned here: fit it directly on
# the scaled train+validation data and score it on the scaled test set.
nb_model = GaussianNB()
nb_model.fit(X_train_val_scaled, y_temp)

y_pred_nb = nb_model.predict(X_test_scaled)        # hard labels
y_prob_nb = nb_model.predict_proba(X_test_scaled)  # class probabilities

print("\nTest Set Evaluation:")
evaluate_model(y_true=y_test, y_pred=y_pred_nb, y_proba=y_prob_nb)


In [None]:
# Kernel SHAP for naive Bayes. The background set is the scaled
# train+validation data summarised by k-means with 10 clusters.
background = shap.kmeans(X_train_val_scaled_df, 10)
explainer = shap.KernelExplainer(nb_model.predict_proba, background)
shap_values = explainer.shap_values(X_test_scaled_df, nsamples=100)

# The last axis indexes the output classes; keep the attributions for class 1.
positive_class_values = shap_values[:, :, 1]
explanation = shap.Explanation(
    values=positive_class_values,
    data=X_test_scaled_df,
    feature_names=feature_names,
)
shap.plots.beeswarm(explanation, max_display=len(feature_names))

# Decision Tree Classifier

In [None]:
# Hyper-parameter grid explored for the decision tree.
param_grid = dict(
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=['balanced', None],
    splitter=['best', 'random'],
)

tree = DecisionTreeClassifier(random_state=SEED)

# 5-fold cross-validated search on the unscaled train+validation data.
# The candidate with the best macro F1 is refitted on the full set.
grid = GridSearchCV(
    tree,
    param_grid,
    scoring=scoring,
    refit="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_temp, y_temp)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

In [None]:
# Score the refitted best decision tree on the held-out test set.
best_dt_model = grid.best_estimator_

y_pred_dt_best = best_dt_model.predict(X_test)        # hard labels
y_prob_dt_best = best_dt_model.predict_proba(X_test)  # class probabilities
evaluate_model(y_true=y_test, y_pred=y_pred_dt_best, y_proba=y_prob_dt_best)

In [None]:
# Tree SHAP for the decision tree, using the path-dependent algorithm
# (no background dataset is passed).
explainer = shap.TreeExplainer(best_dt_model, feature_perturbation="tree_path_dependent")

# Calling the explainer returns an Explanation that carries the column names
# of X_test itself. The previous version labelled the plot with
# `feature_names`, whose order differs from X_test's columns, so attributions
# were shown under the wrong feature.
shap_values = explainer(X_test)

# The last axis indexes the output classes; plot class 1, as the random-forest
# SHAP cell does, instead of passing all classes to the plot at once.
shap.plots.beeswarm(shap_values[:, :, 1], max_display=X_test.shape[1], show=True)