# Imports

In [25]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Open data and prepare it

In [16]:
file_path = './data/synthetic_data_lung_cancer.csv'
df = pd.read_csv(file_path)

df['DIED'] = 0

# Identify rows where the "death" state is present
death_rows = df[df['DEFINITION_ID'].str.lower() == 'death']

# Iterate through each death row and update DIED column for corresponding rows
for _, death_row in death_rows.iterrows():
    patient_id = death_row['SUBJECT_ID']
    death_time = death_row['TIME']
    
    if death_time < 5:
        # Update DIED to 1 for rows with the same PATIENT_ID and TIME within 1 year
        df.loc[(df['SUBJECT_ID'] == patient_id), 'DIED'] = 1

# Delete death rows
df = df[df['DEFINITION_ID'].str.lower() != 'death']
df= df.drop(['TIME'], axis=1)

# One-hot encode the original DataFrame
df = pd.get_dummies(df, columns=['DEFINITION_ID'], prefix='DEF')

# Group by 'SUBJECT_ID' and perform a logical OR on 'DIED'
df = df.groupby('SUBJECT_ID').max()

# Separate features and target variable
X = df.drop(['DIED'], axis=1)
y = df['DIED']

# Model test

Given our context and requirements, here are several models that we can consider for our binary classification problem in the medical field:

Logistic Regression:  
* Simple yet effective, especially for binary classification problems.
* Provides interpretable coefficients.
* Fast to train and suitable for your dataset size.  
  
Random Forest:  
* Ensemble model that can handle both numerical and categorical features.
* Robust and less prone to overfitting.
* Can provide feature importances.  
  
Gradient Boosting (e.g., XGBoost):  
* Ensemble model known for high performance.
* Handles complex relationships well.
* Can provide feature importances.  
  
Support Vector Machines (SVM):  
* Suitable for binary classification.
* Effective in high-dimensional spaces.
* Kernel trick can capture complex relationships.  
  
Naive Bayes:  
* Simple probabilistic model that can work well with sparse, high-dimensional data.
* Assumes independence between features.  
  
Neural Networks:  
* Deep learning models can capture complex patterns.
* May require more data and computational resources.
* Can be effective for feature learning.

Given the nature of our data and the problem, starting with Logistic Regression, Random Forest, and XGBoost seems to be a good start. These models cover a range of complexities, and we can evaluate their performance using cross-validation and metrics such as AUC-ROC.

In [17]:
# Logistic Regression
lr = LogisticRegression()
lr_scores = cross_val_score(lr, X, y, cv=5, scoring='roc_auc')

# Random Forest
rf = RandomForestClassifier()
rf_scores = cross_val_score(rf, X, y, cv=5, scoring='roc_auc')

# XGBoost
xgb = XGBClassifier()
xgb_scores = cross_val_score(xgb, X, y, cv=5, scoring='roc_auc')

# Gradient Boosting
gb = GradientBoostingClassifier()
gb_scores = cross_val_score(gb, X, y, cv=5, scoring='roc_auc')


# Support Vector Machines (SVM)
svm = SVC(probability=True)
svm_scores = cross_val_score(svm, X, y, cv=5, scoring='roc_auc')

# Naive Bayes
nb = BernoulliNB()
nb_scores = cross_val_score(nb, X, y, cv=5, scoring='roc_auc')

# Neural Network
nn = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
nn_scores = cross_val_score(nn, X, y, cv=5, scoring='roc_auc')

# Print AUC-ROC scores for comparison
print("Logistic Regression AUC-ROC:", lr_scores.mean())
print("Random Forest AUC-ROC:", rf_scores.mean())
print("XGBoost AUC-ROC:", xgb_scores.mean())
print("Gradient Boosting AUC-ROC:", gb_scores.mean())
print("SVM Cross-Validation AUC-ROC:", np.mean(svm_scores))
print("Naive Bayes Cross-Validation AUC-ROC:", np.mean(nb_scores))
print("Neural Network Cross-Validation AUC-ROC:", np.mean(nn_scores))

Logistic Regression AUC-ROC: 0.82577379231791
Random Forest AUC-ROC: 0.862573361654244
XGBoost AUC-ROC: 0.8633256681050799
Gradient Boosting AUC-ROC: 0.8681618378309555
SVM Cross-Validation AUC-ROC: 0.8453044239808947
Naive Bayes Cross-Validation AUC-ROC: 0.7835459337297573
Neural Network Cross-Validation AUC-ROC: 0.822034090048796


Based on the AUC-ROC scores we obtain, we can see:
* Gradient Boosting: Achieves the highest AUC-ROC score (0.8682), indicating good discriminatory power. This model seems promising.
* XGBoost: Performs well with an AUC-ROC score of 0.8633. Similar to Gradient Boosting, XGBoost is an ensemble method known for its effectiveness.
* Random Forest: Also performs well with an AUC-ROC score of 0.8602. Random Forest is another ensemble method that tends to handle complex relationships.
* SVM: Achieves a respectable AUC-ROC score of 0.8453. SVMs are known for their effectiveness in high-dimensional spaces.
* Logistic Regression: Provides a decent AUC-ROC score of 0.8258. Logistic Regression is a simple yet effective model.
* Neural Network: Shows a reasonable AUC-ROC score of 0.8220. Neural Networks have the potential to capture complex patterns.
* Naive Bayes: Achieves the lowest AUC-ROC score of 0.7835. Naive Bayes might be less suitable for this specific problem or could benefit from further tuning.  

# Hyperparameter tuning

In [18]:
# GRADIENT BoOSTING
# Define the parameter grid
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Create the Gradient Boosting model
gb = GradientBoostingClassifier()

# Perform grid search
grid_search_gb = GridSearchCV(gb, param_grid_gb, cv=5, scoring='roc_auc')
grid_search_gb.fit(X, y)

# Print the best hyperparameters
print("Best Hyperparameters for Gradient Boosting:", grid_search_gb.best_params_)

# XGBOOST
# Define the parameter grid
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Create the XGBoost model
xgb = XGBClassifier()

# Perform grid search
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring='roc_auc')
grid_search_xgb.fit(X, y)

# Print the best hyperparameters
print("Best Hyperparameters for XGBoost:", grid_search_xgb.best_params_)

# RANDOM FOREST
# Define the parameter grid
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Create the Random Forest model
rf = RandomForestClassifier()

# Perform grid search
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='roc_auc')
grid_search_rf.fit(X, y)

# Print the best hyperparameters
print("Best Hyperparameters for Random Forest:", grid_search_rf.best_params_)

# SVM
# Define the parameter grid
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Create the SVM model
svm = SVC(probability=True)

# Perform grid search
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='roc_auc')
grid_search_svm.fit(X, y)

# Print the best hyperparameters
print("Best Hyperparameters for SVM:", grid_search_svm.best_params_)

Best Hyperparameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Best Hyperparameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best Hyperparameters for Random Forest: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Best Hyperparameters for SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}


Best Hyperparameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}  
Best Hyperparameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}  
Best Hyperparameters for Random Forest: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}  
Best Hyperparameters for SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}  

# Fitting the model and computing roc_auc scores

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient Boosting
gb = GradientBoostingClassifier(learning_rate=0.05, max_depth=5, n_estimators=100)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict_proba(X_test)[:, 1]
auc_gb = roc_auc_score(y_test, y_pred_gb)
print("Gradient Boosting AUC-ROC on Test Set:", auc_gb)

# Model Evaluation Metrics
print("\nGradient Boosting Model Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, gb.predict(X_test)))
print("\nClassification Report:")
print(classification_report(y_test, gb.predict(X_test)))

# XGBoost
xgb = XGBClassifier(learning_rate=0.2, max_depth=5, n_estimators=200)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict_proba(X_test)[:, 1]
auc_xgb = roc_auc_score(y_test, y_pred_xgb)
print("\nXGBoost AUC-ROC on Test Set:", auc_xgb)

# Model Evaluation Metrics
print("\nXGBoost Model Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb.predict(X_test)))
print("\nClassification Report:")
print(classification_report(y_test, xgb.predict(X_test)))

# Random Forest
rf = RandomForestClassifier(max_depth=20, min_samples_split=5, n_estimators=200)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_test, y_pred_rf)
print("\nRandom Forest AUC-ROC on Test Set:", auc_rf)

# Model Evaluation Metrics
print("\nRandom Forest Model Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf.predict(X_test)))
print("\nClassification Report:")
print(classification_report(y_test, rf.predict(X_test)))

# SVM
svm = SVC(C=0.1, gamma='scale', kernel='rbf', probability=True)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict_proba(X_test)[:, 1]
auc_svm = roc_auc_score(y_test, y_pred_svm)
print("\nSVM AUC-ROC on Test Set:", auc_svm)

# Model Evaluation Metrics
print("\nSVM Model Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, svm.predict(X_test)))
print("\nClassification Report:")
print(classification_report(y_test, svm.predict(X_test)))

Gradient Boosting AUC-ROC on Test Set: 0.8608265027322405
XGBoost AUC-ROC on Test Set: 0.8609972677595628
Random Forest AUC-ROC on Test Set: 0.8551912568306012
SVM AUC-ROC on Test Set: 0.832308743169399


# Trying to use an ensemble of our top performing models

In [24]:
# Initialize individual models
gb = GradientBoostingClassifier(learning_rate=0.05, max_depth=5, n_estimators=100)
xgb = XGBClassifier(learning_rate=0.2, max_depth=5, n_estimators=200)
rf = RandomForestClassifier(max_depth=20, min_samples_split=5, n_estimators=200)
svm = SVC(C=0.1, gamma='scale', kernel='rbf', probability=True)

# Create an ensemble using VotingClassifier
ensemble = VotingClassifier(estimators=[
    ('gb', gb),
    ('xgb', xgb),
    ('rf', rf),
    ('svm', svm)
], voting='soft')  # 'soft' for averaging predicted probabilities

# Fit the ensemble model on the training data
ensemble.fit(X_train, y_train)

# Predict probabilities on the test set
y_pred_ensemble = ensemble.predict_proba(X_test)[:, 1]

# Evaluate AUC-ROC on the test set
auc_ensemble = roc_auc_score(y_test, y_pred_ensemble)
print("Ensemble AUC-ROC on Test Set:", auc_ensemble)

Ensemble AUC-ROC on Test Set: 0.8705601092896175
