# RespGroup_fromTTP: classical ML models

In [None]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
#Importing libraries for data preprocessing and fine-tuning
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
#Importing libraries for building models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
#Importing libraries to evaluate the models
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, classification_report,
                             confusion_matrix)
#Importing for interpretability
import shap

# Single seed shared by the notebook for reproducibility; 42 is an arbitrary, conventional value.
SEED = 42
# Seed the global random state through the Keras helper so repeated runs start from the same state.
tf.keras.utils.set_random_seed(SEED)

In [None]:
# Load the raw dataset from a comma-separated, Latin-1 encoded CSV located in the working directory.
# The file is not shipped with the notebook.
data = pd.read_csv('dataset_d.csv', encoding='latin-1', sep=',') # request the dataset from the author
#data.head()

In [None]:
# Multi-class classification problem: the target is "progression_cat" (last entry below);
# every other entry is a candidate predictor.
relevant_columns = [
    'age', 'sex', 'smoking', 'ps_at_diagnosis_ad', 'n#_mets_sites',
    'lung_only_m1', 'pleural', 'pericard', 'lymph_nodes_only_m1', 'soft_tissue',
    'leptomingeal', 'skin', 'peritoneal', 'renal', 'pancreas',
    'brain', 'liver', 'bone', 'adrenal', 'histology',
    'hbbaselineio', 'leucotbaselineio',
    'neut_abs...143', 'linfo_abs...144', 'baso_abs...145', 'mono_abs...147',
    'plaqtbaselineio', 'progression_cat',
]

# Independent copy restricted to the selected columns, so `data` itself is never modified.
df = data[relevant_columns].copy()
df.shape

In [None]:
# Complete-case analysis: drop every row that has a missing value in any selected column.
df = df.dropna(axis=0)
# Display the shape again to see how many rows survived.
df.shape

In [None]:
# Shuffle the rows once, reproducibly.
df = df.sample(frac=1, random_state=SEED)

# Columns that must be stored as integers.
var_int = ['ps_at_diagnosis_ad', 'n#_mets_sites', 'lung_only_m1', 'pleural', 'pericard', 'lymph_nodes_only_m1', 'soft_tissue',
           'leptomingeal','skin','peritoneal','renal','pancreas', 'brain', 'liver', 'bone', 'adrenal']
df = df.astype({column: int for column in var_int})

# Normalise casing so that e.g. "Male" and "male" map to the same category.
df['sex'] = df['sex'].str.lower()

# 'sex' is encoded with its first level dropped (a single indicator column for two levels).
sex_dummies = pd.get_dummies(df['sex'], prefix='sex', drop_first=True)

# Full one-hot encoding for the other categorical columns (no drop_first).
other_dummies = pd.get_dummies(df[['histology', 'smoking']])

# Put everything together: original columns minus the raw categoricals, plus the dummies.
df_encoded = pd.concat([df.drop(columns=['sex', 'histology', 'smoking']),
                        sex_dummies, other_dummies], axis=1)

# Cast every generated dummy column to int. The list is derived from the dummies actually
# produced instead of being hard-coded, so a category that is absent from the data no longer
# raises a KeyError and an additional category (e.g. 'histology_adenosquamous') is not
# silently left as a boolean column.
cols_to_convert = [*sex_dummies.columns, *other_dummies.columns]

df_encoded[cols_to_convert] = df_encoded[cols_to_convert].astype(int)

In [None]:
# Features are all encoded columns except the target. Index.difference returns the
# remaining column labels sorted, so X's columns are in sorted order.
feature_columns = df_encoded.columns.difference(['progression_cat'])
X = df_encoded[feature_columns]
y = df_encoded['progression_cat']

# Map the target categories to consecutive integer codes; `le` keeps the mapping (le.classes_).
le = LabelEncoder()
y_encoded = le.fit(y).transform(y)

In [None]:
# --- Positions of every class in the encoded target -------------------------------------
y = np.array(y_encoded)
idx_class_0 = np.flatnonzero(y == 0)
idx_class_1 = np.flatnonzero(y == 1)
idx_class_2 = np.flatnonzero(y == 2)

# --- Guarantee five class-1 samples in the test set -------------------------------------
# The global NumPy RNG is seeded here; the two draws below must stay in this order.
np.random.seed(42)
idx_class_1_test = np.random.choice(idx_class_1, size=5, replace=False)

# --- Fill the test set up to 20% of the data --------------------------------------------
all_idx = np.arange(len(y))
# Candidates are all positions not already reserved above.
remaining_idx = np.setdiff1d(all_idx, idx_class_1_test)
test_size = int(0.2 * len(y))
# Number of additional samples required to reach the target test size.
other_needed = test_size - len(idx_class_1_test)
idx_remaining_test = np.random.choice(remaining_idx, size=other_needed, replace=False)

# --- Final index sets -------------------------------------------------------------------
idx_test = np.concatenate([idx_class_1_test, idx_remaining_test])
idx_train_val = np.setdiff1d(all_idx, idx_test)

# --- Materialise the test and train+validation partitions -------------------------------
X_test, y_test = X.iloc[idx_test], y[idx_test]
X_train_val, y_train_val = X.iloc[idx_train_val], y[idx_train_val]

# --- Stratified 80/20 split of train+validation into train and validation ---------------
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.2, random_state=42, stratify=y_train_val,
)


In [None]:
# Report the size of every partition.
for label, frame in (
    ("Shape of X_train_val:", X_train_val),
    ("Shape of X_train:", X_train),
    ("Shape of X_val:", X_val),
    ("Shape of X_test:", X_test),
):
    print(label, frame.shape)

In [None]:
# 0/1 indicator columns: left untouched by the scaler.
binary_features = ['lung_only_m1', 'pleural', 'pericard', 'lymph_nodes_only_m1', 'soft_tissue', 'leptomingeal','skin','peritoneal','renal',
                   'pancreas', 'brain', 'liver', 'bone', 'adrenal','histology_adenocarcinoma', 'histology_nsclc', 
                   'histology_squamous', 'sex_male','smoking_current', 'smoking_former', 'smoking_non-smoker' ]#'histology_adenosquamous',
# Columns that are standardised.
numeric_features = ['neut_abs...143','linfo_abs...144', 'plaqtbaselineio', 'age', 'ps_at_diagnosis_ad', 'n#_mets_sites', 'leucotbaselineio',
                    'hbbaselineio','baso_abs...145', 'mono_abs...147'] #'duration_l1', 'time_to_l1_start'


X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy() 
X_test_scaled = X_test.copy()
X_train_val_scaled = X_train_val.copy()

# Scaling statistics must come from training data only. Calling fit_transform on the
# validation or test split would standardise each split with its own mean/std, which
# leaks information and puts the splits on different scales than the fitted models expect.

# Scaler used by the final models (trained on train+validation, evaluated on test).
scaler = StandardScaler()
X_train_val_scaled[numeric_features] = scaler.fit_transform(X_train_val[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

# Separate scaler for the train/validation pair: fitted on train, applied to validation.
train_scaler = StandardScaler()
X_train_scaled[numeric_features] = train_scaler.fit_transform(X_train[numeric_features])
X_val_scaled[numeric_features] = train_scaler.transform(X_val[numeric_features])

In [None]:
def evaluate_model(y_true, y_pred, y_proba, class_names=le.classes_):
    """
    Print classification metrics, plot the confusion matrix and report AUC-ROC / average precision.

    y_true: true labels, integer-encoded so that label k corresponds to class_names[k]
    y_pred: predicted labels, same encoding as y_true
    y_proba: predicted probabilities (array [n_samples, n_classes]), columns in class order
    class_names: display names of the classes in encoded order (default: le.classes_)

    Returns nothing; all results are printed or plotted.
    """
    # Class names are used as text labels; cast so non-string categories also work.
    class_names = [str(name) for name in class_names]
    # Fix the label set explicitly: otherwise a class that is absent from this split would
    # shrink the confusion matrix / report and no longer line up with class_names.
    labels = np.arange(len(class_names))

    # Main metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision (weighted): {precision:.4f}")
    print(f"Recall (weighted): {recall:.4f}")
    print(f"F1-score (weighted): {f1:.4f}\n")
    
    print("Classification Report:")
    print(classification_report(y_true, y_pred, labels=labels,
                                target_names=class_names, zero_division=0))

    # Confusion matrix (rows: true class, columns: predicted class)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

    # Annotate each cell; switch to white text on dark cells for readability.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     ha="center", va="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.show()

    # ROC & PR only if available. The two metrics are computed independently so that a
    # failure of one (e.g. scores that are not valid probabilities) does not hide the other.
    try:
        # AUC-ROC Macro (one-vs-rest)
        auc_macro = roc_auc_score(y_true, y_proba, multi_class='ovr',
                                  average='macro', labels=labels)
        print(f"AUC-ROC (macro): {auc_macro:.4f}")
    except Exception as e:
        print("ROC/PR Curve Error:", e)

    try:
        # One-hot encode the true labels so average precision is computed per class and
        # macro-averaged regardless of whether the installed scikit-learn accepts
        # multiclass targets directly.
        y_true_onehot = (np.asarray(y_true).reshape(-1, 1) == labels).astype(int)
        avg_precision_macro = average_precision_score(y_true_onehot, y_proba, average='macro')
        print(f"Average Precision (macro): {avg_precision_macro:.4f}")
    except Exception as e:
        print("ROC/PR Curve Error:", e)


In [None]:
# Use the column order the models were actually trained on. X was built with
# Index.difference, which sorts the columns; re-indexing the frames to
# binary_features + numeric_features order would feed SHAP explainers (which address
# features by position) a different column order than the fitted models expect and would
# drop any column not listed there.
feature_names = list(X_train_val_scaled.columns)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)
X_train_val_scaled_df = pd.DataFrame(X_train_val_scaled, columns=feature_names)
X_test_df = pd.DataFrame(X_test, columns=feature_names)
X_train_df = pd.DataFrame(X_train, columns=feature_names)

# Class display names, in encoded-label order.
class_names = le.classes_

# Logistic Regression

In [None]:
# Search space for logistic regression. C only matters when a penalty is applied, so the
# unpenalised configuration gets its own sub-grid instead of being refitted (with a
# warning) once per C value.
param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'fit_intercept': [True, False],
        'class_weight': [None, 'balanced'],
    },
    {
        'penalty': [None],
        'fit_intercept': [True, False],
        'class_weight': [None, 'balanced'],
    },
]

# Metrics recorded for every candidate; the best model is selected on macro F1 (see refit).
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'neg_log_loss': 'neg_log_loss'
}

# `multi_class` is deprecated in recent scikit-learn; with the lbfgs solver and more than
# two classes the default already fits a multinomial model, so the argument is omitted.
grid = GridSearchCV(
    LogisticRegression(solver='lbfgs', max_iter=1000, random_state=SEED),
    param_grid,
    scoring=scoring,
    refit='f1_macro', 
    cv=5,
    verbose=1,
    n_jobs=-1
)


grid.fit(X_train_val_scaled, y_train_val)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


In [None]:
# Best logistic-regression configuration, fitted on the scaled train+validation data.
lr_model_best = grid.best_estimator_
lr_model_best.fit(X_train_val_scaled, y_train_val)

# Class probabilities and hard predictions for the scaled test set.
y_prob_lr_best = lr_model_best.predict_proba(X_test_scaled)
y_pred_lr_best = lr_model_best.predict(X_test_scaled)

print("\nTest Set Evaluation:")
evaluate_model(y_true=y_test, y_pred=y_pred_lr_best, y_proba=y_prob_lr_best)

In [None]:
# Explain the logistic regression on the test set, using train+validation as background data.
explainer = shap.Explainer(lr_model_best, X_train_val_scaled_df)
shap_values = explainer(X_test_scaled_df)

# One beeswarm plot per class: the last axis of the explanation indexes the classes.
n_features_shown = len(feature_names)
for class_idx, class_name in enumerate(class_names):
    print(f"SHAP plot for class {class_name}:")
    class_explanation = shap_values[:, :, class_idx]
    shap.plots.beeswarm(class_explanation, max_display=n_features_shown, show=True)

# Random Forest

In [None]:
# Random-forest search space.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    class_weight=['balanced', None],
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
)

# 5-fold grid search; every metric in `scoring` is recorded, macro F1 picks the winner.
rf_estimator = RandomForestClassifier(random_state=SEED)
grid_search = GridSearchCV(
    estimator=rf_estimator,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Tree ensembles are fitted on the unscaled features.
grid_search.fit(X_train_val, y_train_val)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


In [None]:
# Winner of the random-forest grid search (already refitted by GridSearchCV).
rf_model_best = grid_search.best_estimator_

# Test-set probabilities and hard predictions on the unscaled features.
y_prob_rf_best = rf_model_best.predict_proba(X_test)
y_pred_rf_best = rf_model_best.predict(X_test)

evaluate_model(y_true=y_test, y_pred=y_pred_rf_best, y_proba=y_prob_rf_best)

In [None]:
# Tree SHAP for the random forest; "tree_path_dependent" needs no background dataset.
explainer = shap.TreeExplainer(rf_model_best, feature_perturbation="tree_path_dependent")
shap_values = explainer(X_test_df)

# One beeswarm plot per class (classes are on the last axis of the explanation).
n_features_shown = len(feature_names)
for class_idx, class_name in enumerate(class_names):
    print(f"SHAP plot for class {class_name}:")
    class_explanation = shap_values[:, :, class_idx]
    shap.plots.beeswarm(class_explanation, max_display=n_features_shown, show=True)

# Gradient Boosting

In [None]:
# Gradient-boosting search space.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    max_features=['sqrt', 'log2', None],
)

gb = GradientBoostingClassifier(random_state=42)

# 5-fold grid search; macro F1 decides which configuration is refitted.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train_val, y_train_val)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
# Winner of the gradient-boosting grid search (already refitted by GridSearchCV).
gb_model_best = grid_search.best_estimator_

# Test-set probabilities and hard predictions on the unscaled features.
y_prob_gb_best = gb_model_best.predict_proba(X_test)
y_pred_gb_best = gb_model_best.predict(X_test)

evaluate_model(y_true=y_test, y_pred=y_pred_gb_best, y_proba=y_prob_gb_best)

In [None]:
# Model-agnostic explanation of the gradient-boosting probabilities, with the unscaled
# training frame as background data.
# NOTE(review): the model was fitted on train+validation while the background here is the
# training split only — confirm this is intended.
explainer = shap.Explainer(
    gb_model_best.predict_proba,
    X_train_df,
    feature_perturbation="interventional",
)
shap_values = explainer(X_test_df)

# One beeswarm plot per class (classes are on the last axis of the explanation).
n_features_shown = len(feature_names)
for class_idx, class_name in enumerate(class_names):
    print(f"SHAP plot for class {class_name}:")
    class_explanation = shap_values[:, :, class_idx]
    shap.plots.beeswarm(class_explanation, max_display=n_features_shown, show=True)

# XGBoost

In [None]:
# XGBoost search space.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1],

}

# `use_label_encoder` is obsolete in current XGBoost releases (it only triggers an
# "unused parameter" warning) and the scikit-learn wrapper derives the number of classes
# from the training labels, so neither it nor a hard-coded `num_class` is passed.
xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    random_state=42,
    eval_metric='mlogloss'
)

# 5-fold grid search; macro F1 decides which configuration is refitted.
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1_macro', 
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_val, y_train_val)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
# Winner of the XGBoost grid search (already refitted by GridSearchCV).
best_xgb_model = grid_search.best_estimator_

# Test-set probabilities and hard predictions on the unscaled features.
y_prob_xgb_best = best_xgb_model.predict_proba(X_test)
y_pred_xgb_best = best_xgb_model.predict(X_test)

evaluate_model(y_true=y_test, y_pred=y_pred_xgb_best, y_proba=y_prob_xgb_best)

In [None]:
# Tree SHAP for the XGBoost model; "tree_path_dependent" needs no background dataset.
explainer = shap.TreeExplainer(best_xgb_model, feature_perturbation="tree_path_dependent")
shap_values = explainer(X_test_df)

# One beeswarm plot per class (classes are on the last axis of the explanation).
n_features_shown = len(feature_names)
for class_idx, class_name in enumerate(class_names):
    print(f"SHAP plot for class {class_name}:")
    class_explanation = shap_values[:, :, class_idx]
    shap.plots.beeswarm(class_explanation, max_display=n_features_shown, show=True)

# Light Gradient Boosting Machine

In [None]:
# LightGBM search space.
param_grid = {
    'num_leaves': [31, 63],
    'max_depth': [-1, 10],
    'learning_rate': [0.01, 0.1],
    'min_child_samples': [1, 10],
    'min_split_gain': [0.0, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Metrics recorded for every candidate; the best model is selected on macro F1 (see refit).
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'neg_log_loss': 'neg_log_loss'
}

# Two corrections with respect to the previous configuration:
#  * no hard-coded `num_class` (it was 4 for a three-class target); the scikit-learn
#    wrapper derives the number of classes from the training labels;
#  * `is_unbalance` only applies to the binary / multiclassova objectives, so class
#    imbalance is handled with `class_weight='balanced'`, the option LightGBM documents
#    for multi-class tasks.
lgbm = LGBMClassifier(
    objective='multiclass',
    class_weight='balanced',
    random_state=42,
    force_col_wise=True
)

grid = GridSearchCV(lgbm, param_grid, cv=3, scoring=scoring,refit="f1_macro" , n_jobs=-1, verbose=1)
grid.fit(X_train_val, y_train_val)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

In [None]:
# Winner of the LightGBM grid search (already refitted by GridSearchCV).
best_lgbm_model = grid.best_estimator_

# Test-set probabilities and hard predictions on the unscaled features.
y_prob_lgbm_best = best_lgbm_model.predict_proba(X_test)
y_pred_lgbm_best = best_lgbm_model.predict(X_test)

evaluate_model(y_true=y_test, y_pred=y_pred_lgbm_best, y_proba=y_prob_lgbm_best)

In [None]:
# Tree SHAP for the LightGBM model; "tree_path_dependent" needs no background dataset.
explainer = shap.TreeExplainer(best_lgbm_model, feature_perturbation="tree_path_dependent")
shap_values = explainer(X_test_df)

# One beeswarm plot per class (classes are on the last axis of the explanation).
n_features_shown = len(feature_names)
for class_idx, class_name in enumerate(class_names):
    print(f"SHAP plot for class {class_name}:")
    class_explanation = shap_values[:, :, class_idx]
    shap.plots.beeswarm(class_explanation, max_display=n_features_shown, show=True)

# SVC

In [None]:
# Support-vector classifier search space.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.01, 0.1, 1],
    kernel=['rbf', 'linear', 'sigmoid'],
    class_weight=['balanced', None],
    shrinking=[True, False],
)

# probability=True enables predict_proba, which the evaluation and SHAP cells rely on.
svc = SVC(probability=True, random_state=42)

# 3-fold grid search on the scaled features; macro F1 decides which model is refitted.
grid = GridSearchCV(
    svc,
    param_grid,
    cv=3,
    scoring=scoring,
    refit="f1_macro",
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_val_scaled, y_train_val)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


In [None]:
# Winner of the SVC grid search (already refitted by GridSearchCV).
best_svc_model = grid.best_estimator_

# The SVC was fitted on standardised features, so it must be evaluated on the
# standardised test set; feeding it the raw X_test would give meaningless predictions.
y_pred_svc_best = best_svc_model.predict(X_test_scaled)
y_prob_svc_best = best_svc_model.predict_proba(X_test_scaled)
evaluate_model(y_test, y_pred_svc_best, y_prob_svc_best)

In [None]:
# Kernel SHAP on the SVC probabilities; the background is the scaled train+validation
# data summarised by shap.kmeans with k=10.
background_summary = shap.kmeans(X_train_val_scaled_df, 10)
explainer = shap.KernelExplainer(best_svc_model.predict_proba, background_summary)
shap_values = explainer.shap_values(X_test_scaled_df, nsamples=100)

# Wrap the values of each class in an Explanation so that beeswarm can plot them.
n_features_shown = len(feature_names)
for class_idx, class_name in enumerate(class_names):
    explanation = shap.Explanation(
        values=shap_values[:, :, class_idx],
        data=X_test_scaled_df,
        feature_names=feature_names,
    )
    print(f"SHAP plot for class {class_name}:")
    shap.plots.beeswarm(explanation, max_display=n_features_shown, show=True)

# K Neighbors

In [None]:
# k-nearest-neighbours search space (p: 1 = Manhattan, 2 = Euclidean distance).
param_grid = dict(
    n_neighbors=[10, 12, 15, 17, 20],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

knn = KNeighborsClassifier(algorithm='auto')

# 3-fold grid search on the scaled features; macro F1 decides which model is refitted.
grid = GridSearchCV(
    knn,
    param_grid,
    cv=3,
    scoring=scoring,
    refit="f1_macro",
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_val_scaled, y_train_val)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

In [None]:
# Winner of the k-NN grid search (already refitted by GridSearchCV).
best_knn_model = grid.best_estimator_

# k-NN was fitted on standardised features; distances to the raw X_test would be computed
# on a different scale, so the standardised test set is used here.
y_pred_knn_best = best_knn_model.predict(X_test_scaled)
y_prob_knn_best = best_knn_model.predict_proba(X_test_scaled)
evaluate_model(y_test, y_pred_knn_best, y_prob_knn_best)

In [None]:
# Kernel SHAP on the k-NN probabilities, with 50 sampled train+validation rows as background.
background = shap.utils.sample(X_train_val_scaled_df, 50, random_state=42)
explainer = shap.KernelExplainer(best_knn_model.predict_proba, background)

# Only the first 50 test rows are explained.
X_subset = X_test_scaled_df[:50]
shap_values = explainer.shap_values(X_subset)

# Wrap the values of each class in an Explanation so that beeswarm can plot them.
n_features_shown = len(feature_names)
for class_idx, class_name in enumerate(class_names):
    explanation = shap.Explanation(
        values=shap_values[:, :, class_idx],
        data=X_subset,
        feature_names=feature_names,
    )
    print(f"SHAP plot for class {class_name}:")
    shap.plots.beeswarm(explanation, max_display=n_features_shown, show=True)

# Ridge Classifier

In [None]:
# Ridge-classifier search space.
param_grid = dict(
    alpha=[0.1, 1.0, 10, 100],
    class_weight=[None, 'balanced'],
    solver=['auto', 'sag', 'lsqr', 'sparse_cg'],
)

# Same metrics as the other models except log-loss, which is left out for this classifier.
scoring_rc = dict(
    accuracy='accuracy',
    precision_macro='precision_macro',
    recall_macro='recall_macro',
    f1_macro='f1_macro',
)

rc = RidgeClassifier(random_state=42)

# 3-fold grid search on the scaled features; macro F1 decides which model is refitted.
grid = GridSearchCV(
    rc,
    param_grid,
    cv=3,
    scoring=scoring_rc,
    refit="f1_macro",
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_val_scaled, y_train_val)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


In [None]:
# Winner of the ridge-classifier grid search (already refitted by GridSearchCV).
best_rc_model = grid.best_estimator_

y_pred_rc_best = best_rc_model.predict(X_test_scaled)

# This cell obtains scores from decision_function, which are unbounded and do not sum to 1
# per row, so multiclass AUC-ROC would reject them. A numerically stable softmax maps them
# to per-row score vectors that sum to 1. These are NOT calibrated probabilities: they only
# make the ranking-based metrics computable, and AUC/AP are computed on the softmax output,
# not on the raw decision scores.
decision_scores = best_rc_model.decision_function(X_test_scaled)
shifted_scores = decision_scores - decision_scores.max(axis=1, keepdims=True)
exp_scores = np.exp(shifted_scores)
y_prob_rc_best = exp_scores / exp_scores.sum(axis=1, keepdims=True)

evaluate_model(y_test, y_pred_rc_best, y_prob_rc_best)

In [None]:
# Linear SHAP for the ridge classifier, with an independent masker built from the scaled
# train+validation data.
masker = shap.maskers.Independent(X_train_val_scaled_df)
explainer = shap.LinearExplainer(best_rc_model, masker)
shap_values = explainer.shap_values(X_test_scaled_df)

# Wrap the values of each class in an Explanation so that beeswarm can plot them.
n_features_shown = len(feature_names)
for class_idx, class_name in enumerate(class_names):
    explanation = shap.Explanation(
        values=shap_values[:, :, class_idx],
        data=X_test_scaled_df,
        feature_names=feature_names,
    )
    print(f"SHAP plot for class {class_name}:")
    shap.plots.beeswarm(explanation, max_display=n_features_shown, show=True)

# Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings (no grid search), fitted on the scaled
# train+validation data.
nb_model = GaussianNB().fit(X_train_val_scaled, y_train_val)

# Test-set probabilities and hard predictions on the scaled features.
y_prob_nb = nb_model.predict_proba(X_test_scaled)
y_pred_nb = nb_model.predict(X_test_scaled)

print("\nTest Set Evaluation:")
evaluate_model(y_true=y_test, y_pred=y_pred_nb, y_proba=y_prob_nb)


In [None]:
# Kernel SHAP on the naive-Bayes probabilities; the background is the scaled
# train+validation data summarised by shap.kmeans with k=10.
background = shap.kmeans(X_train_val_scaled_df, 10)
explainer = shap.KernelExplainer(nb_model.predict_proba, background)
shap_values = explainer.shap_values(X_test_scaled_df, nsamples=100)

# Wrap the values of each class in an Explanation so that beeswarm can plot them.
n_features_shown = len(feature_names)
for class_idx, class_name in enumerate(class_names):
    explanation = shap.Explanation(
        values=shap_values[:, :, class_idx],
        data=X_test_scaled_df,
        feature_names=feature_names,
    )
    print(f"SHAP plot per classe {class_name}:")
    shap.plots.beeswarm(explanation, max_display=n_features_shown)

# Decision Tree Classifier

In [None]:
# Decision-tree search space.
param_grid = dict(
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=['balanced', None],
    splitter=['best', 'random'],
)

tree = DecisionTreeClassifier(random_state=42)

# 3-fold grid search on the unscaled features.
grid = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring=scoring,
    refit="f1_macro",  # or any other metric relevant to the use case
    cv=3,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train_val, y_train_val)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

In [None]:
# Winner of the decision-tree grid search (already refitted by GridSearchCV).
best_dt_model = grid.best_estimator_

# Test-set probabilities and hard predictions on the unscaled features.
y_prob_dt_best = best_dt_model.predict_proba(X_test)
y_pred_dt_best = best_dt_model.predict(X_test)

evaluate_model(y_true=y_test, y_pred=y_pred_dt_best, y_proba=y_prob_dt_best)

In [None]:
# Tree SHAP for the decision tree, computed on the unscaled test features.
explainer = shap.TreeExplainer(best_dt_model)
shap_values = explainer.shap_values(X_test)

# One summary plot per class (classes are on the last axis of the SHAP array).
# The feature labels must come from the very frame the SHAP values were computed on:
# labelling them with the columns of a differently ordered frame would attach the
# contributions to the wrong feature names.
for i, class_name in enumerate(class_names):  
    print(f"SHAP plot for class {class_name}:")
    shap.summary_plot(shap_values[:,:,i], X_test, feature_names=list(X_test.columns))
