In [None]:
import pandas as pd
import numpy as np
import time
import random
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from skopt import BayesSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from imblearn.pipeline import Pipeline
import shap
from sklearn.inspection import permutation_importance
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from xgboost import plot_importance
import pickle
import seaborn as sns
# Global seaborn look applied to every figure in the notebook
sns.set_theme(style="white", palette="rocket_r")

# For reproducibility: seeds NumPy's global RNG (used by the np.random.normal
# draws in the noise experiment). The imputers and the CV splitter below pass
# their own random_state explicitly.
np.random.seed(42)

In [None]:
# Stratified 5-fold splitter (shuffled, fixed seed) that every hyperparameter
# search in this notebook receives as its `cv` argument
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=33,
)

# 6-hour Death Prediction

In [None]:
# Load the 6-hour train/test arrays from the working directory
X_train, X_test, y_train, y_test = (
    np.load(path)
    for path in (
        'X_train_static_6.npy',
        'X_test_static_6.npy',
        'y_train_static_6.npy',
        'y_test_static_6.npy',
    )
)

In [None]:
# Remove APACHE: drop column index 2 from both feature matrices.
# NOTE: not idempotent - re-running this cell drops a further column.
X_train, X_test = (
    np.delete(matrix, obj=2, axis=1)
    for matrix in (X_train, X_test)
)

In [None]:
# Class balance of the training labels: (unique label values, their counts)
np.unique(y_train, return_counts=True)

In [None]:
# Ratio between the two class counts of the training labels (18463 / 2511 for
# the 6-hour data; the same value is used as scale_pos_weight further down).
# It is computed from y_train rather than from hand-copied counts so it cannot
# go stale when the data changes. np.unique returns the labels sorted, so
# index 0 belongs to the lower label.
# NOTE(review): assumes binary labels with 0 = negative and 1 = positive.
class_counts = np.unique(y_train, return_counts=True)[1]
float(class_counts[0] / class_counts[1])

# Impute Missing Values with MICE

In [None]:
# Iterative (MICE-style) imputer, seeded, with at most 10 imputation rounds
imp_mean = IterativeImputer(max_iter=10, random_state=0)

In [None]:
# Fit the imputer on the training matrix only, then fill both splits with it
X_train_res = imp_mean.fit(X_train).transform(X_train)
X_test_res = imp_mean.transform(X_test)

# XGBoost

In [None]:
# Estimator handed to the hyperparameter search: an imputation step ('i')
# followed by an untuned XGBoost classifier ('m'). Keeping the imputer inside
# the pipeline makes it part of what the search fits on each training fold.
imputer = IterativeImputer(max_iter=10, random_state=0)
XGBOOSTmodel = XGBClassifier(use_label_encoder=False)
pipeline = Pipeline(
    steps=[
        ('i', imputer),
        ('m', XGBOOSTmodel),
    ]
)

In [None]:
# Search space for the Bayesian hyperparameter search (6-hour model).
# Candidate values are listed by hand. Every key carries the 'm__' prefix so
# that it addresses the 'm' (XGBoost) step of the pipeline.

# Tree depth: 1, 2, ..., 12
max_depth = [round(depth) for depth in np.linspace(1, 12, 12, endpoint=True)]
# Number of boosting rounds (trees): 50, 100, ..., 350
n_estimators = [*range(50, 400, 50)]
# Learning rate
lr = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
# Maximum delta step - regularisation for imbalanced data
max_delta_step = [0, 1, 3, 5, 7, 10]
# Minimum split loss (gamma) for overfitting control
min_split_loss = [0, 0.5]
# Positive-class weight for the imbalanced classes; a single fixed candidate
scale_pos_weight = [7.352847471127041]

# Assemble the space in the same key order as before
param_grid = dict(
    m__max_depth=max_depth,
    m__n_estimators=n_estimators,
    m__learning_rate=lr,
    m__max_delta_step=max_delta_step,
    m__min_split_loss=min_split_loss,
    m__scale_pos_weight=scale_pos_weight,
)

In [None]:
# Bayesian optimisation over the search space: 50 sampled configurations, each
# scored by cross-validated AUROC on the folds from `cv`. refit=False keeps only
# the search results; the final model is trained separately further down.
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    refit=False,
    verbose=1,
)
opt.fit(X_train, np.ravel(y_train))

In [None]:
# Report the winning parameter combination and its cross-validated score
search_summary = (
    ("Best parameter combo:", opt.best_params_),
    ("Best AUROC:", opt.best_score_),
)
for label, value in search_summary:
    print(label, value)

In [None]:
# Final 6-hour model, to be tested on the stand-alone set.
# The hyperparameters are hard-coded - presumably the best combination reported
# by the search above; confirm against opt.best_params_ after a re-run.
final_params = dict(
    scale_pos_weight=7.352847471127041,
    max_depth=10,
    n_estimators=200,
    learning_rate=0.1,
    min_split_loss=0.5,
    max_delta_step=0,
)
XGBOOSTmodel = XGBClassifier(use_label_encoder=False, **final_params)

In [None]:
# Train on the imputed training matrix; labels are flattened to 1-D
XGBOOSTmodel.fit(X=X_train_res, y=np.ravel(y_train))

In [None]:
# Evaluate on the held-out test set. The predictions are computed once and
# reused, instead of calling predict / predict_proba again for every metric.
y_pred = XGBOOSTmodel.predict(X_test_res)
y_score = XGBOOSTmodel.predict_proba(X_test_res)[:, 1]  # column 1: score of the second class

print('Results:')
print('AUROC is:', metrics.roc_auc_score(y_test, y_score))
print('Accuracy is:', metrics.accuracy_score(y_test, y_pred))
print('Average Precision is:', metrics.average_precision_score(y_test, y_score))
print('Weighted F1 is:', metrics.f1_score(y_test, y_pred, average='weighted'))
print('Sensitivity is:', metrics.recall_score(y_test, y_pred))
print('Balanced accuracy is:', metrics.balanced_accuracy_score(y_test, y_pred))
# Specificity is computed as the recall of label 0
print('Specificity is:', metrics.recall_score(y_test, y_pred, pos_label=0))
print('#################################################')

In [None]:
# Make metric plots (ROC and precision-recall curves on the test set).
# plot_roc_curve / plot_precision_recall_curve were deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer the Display API and fall back
# to the old helpers only on installs that predate it.
if hasattr(metrics.RocCurveDisplay, 'from_estimator'):
    metrics.RocCurveDisplay.from_estimator(XGBOOSTmodel, X_test_res, y_test)
    metrics.PrecisionRecallDisplay.from_estimator(XGBOOSTmodel, X_test_res, y_test)
else:
    metrics.plot_roc_curve(XGBOOSTmodel, X_test_res, y_test)
    metrics.plot_precision_recall_curve(XGBOOSTmodel, X_test_res, y_test)
plt.show()

In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap.
# fmt='d' prints the cell counts as plain integers; seaborn's default '.2g'
# would render counts above 99 in scientific notation (e.g. 4.5e+03).
ax = sns.heatmap(
    confusion_matrix(y_test, XGBOOSTmodel.predict(X_test_res)),
    annot=True,
    fmt='d',
    cmap='Reds',
)

ax.set_title('Confusion Matrix\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

## Tick labels - must follow the sorted order of the class labels
## (presumably 0 -> 'False', 1 -> 'True'; confirm against the label encoding)
ax.xaxis.set_ticklabels(['False','True'])
ax.yaxis.set_ticklabels(['False','True'])

## Display the visualization of the Confusion Matrix.
plt.show()

In [None]:
# Human-readable feature labels, passed as `feature_names` to shap.summary_plot
# in the SHAP cells of the 6-, 24-, 18- and 12-hour sections.
# NOTE(review): presumably listed in the column order of the imputed matrices
# (i.e. after column 2 has been dropped) - verify against the preprocessing code.
features = ['secondary MI',
 'Time Since Admission',
 'gender',
 'age',
 'admissionheight',
 'admissionweight',
 'hour',
 'teachingstatus',
 'ethnicity_African American',
 'ethnicity_Asian',
 'ethnicity_Caucasian',
 'ethnicity_Hispanic',
 'ethnicity_Native American',
 'ethnicity_Other/Unknown',
 'unittype_CCU-CTICU',
 'unittype_CSICU',
 'unittype_CTICU',
 'unittype_Cardiac ICU',
 'unittype_MICU',
 'unittype_Med-Surg ICU',
 'unittype_Neuro ICU',
 'unittype_SICU',
 'unitadmitsource_Acute Care/Floor',
 'unitadmitsource_Direct Admit',
 'unitadmitsource_Emergency Department',
 'unitadmitsource_Floor',
 'unitadmitsource_Operating Room',
 'unitadmitsource_Other Hospital',
 'unitadmitsource_PACU',
 'unitadmitsource_Recovery Room',
 'unitadmitsource_Step-Down Unit (SDU)',
 'unitadmitsource_misc',
 'unitvisitnumber_1',
 'unitvisitnumber_2',
 'unitvisitnumber_3',
 'unitvisitnumber_misc',
 'unitstaytype_admit',
 'unitstaytype_readmit',
 'unitstaytype_transfer',
 'physicianspeciality_Specialty Not Specified',
 'physicianspeciality_cardiology',
 'physicianspeciality_critical care medicine (CCM)',
 'physicianspeciality_family practice',
 'physicianspeciality_hospitalist',
 'physicianspeciality_internal medicine',
 'physicianspeciality_misc',
 'physicianspeciality_neurology',
 'physicianspeciality_other',
 'physicianspeciality_pulmonary',
 'physicianspeciality_pulmonary/CCM',
 'physicianspeciality_surgery-cardiac',
 'physicianspeciality_surgery-general',
 'physicianspeciality_surgery-neuro',
 'physicianspeciality_surgery-trauma',
 'physicianspeciality_surgery-vascular',
 'physicianspeciality_unknown',
 'numbedscategory_100 - 249',
 'numbedscategory_250 - 499',
 'numbedscategory_<100',
 'numbedscategory_>= 500',
 'region_Midwest',
 'region_Northeast',
 'region_South',
 'region_West',
 'noninvasivesystolic_mean',
 'noninvasivediastolic_mean',
 'noninvasivemean_mean',
 'noninvasivesystolic_std',
 'noninvasivediastolic_std',
 'noninvasivemean_std',
 '-basos_mean',
 '-eos_mean',
 '-lymphs_mean',
 '-monos_mean',
 '-polys_mean',
 'ALT (SGPT)_mean',
 'AST (SGOT)_mean',
 'BUN_mean',
 'Base Excess_mean',
 'FiO2_mean',
 'HCO3_mean',
 'Hct_mean',
 'Hgb_mean',
 'MCH_mean',
 'MCHC_mean',
 'MCV_mean',
 'MPV_mean',
 'O2 Sat (%)_mean',
 'PT_mean',
 'PT - INR_mean',
 'PTT_mean',
 'RBC_mean',
 'RDW_mean',
 'WBC x 1000_mean',
 'albumin_mean',
 'alkaline phos._mean',
 'anion gap_mean',
 'bedside glucose_mean',
 'bicarbonate_mean',
 'calcium_mean',
 'chloride_mean',
 'creatinine_mean',
 'glucose_mean',
 'lactate_mean',
 'magnesium_mean',
 'pH_mean',
 'paCO2_mean',
 'paO2_mean',
 'phosphate_mean',
 'platelets x 1000_mean',
 'potassium_mean',
 'sodium_mean',
 'total bilirubin_mean',
 'total protein_mean',
 'troponin - I_mean',
 'urinary specific gravity_mean',
 '-basos_std',
 '-eos_std',
 '-lymphs_std',
 '-monos_std',
 '-polys_std',
 'ALT (SGPT)_std',
 'AST (SGOT)_std',
 'BUN_std',
 'Base Excess_std',
 'FiO2_std',
 'HCO3_std',
 'Hct_std',
 'Hgb_std',
 'MCH_std',
 'MCHC_std',
 'MCV_std',
 'MPV_std',
 'O2 Sat (%)_std',
 'PT_std',
 'PT - INR_std',
 'PTT_std',
 'RBC_std',
 'RDW_std',
 'WBC x 1000_std',
 'albumin_std',
 'alkaline phos._std',
 'anion gap_std',
 'bedside glucose_std',
 'bicarbonate_std',
 'calcium_std',
 'chloride_std',
 'creatinine_std',
 'glucose_std',
 'lactate_std',
 'magnesium_std',
 'pH_std',
 'paCO2_std',
 'paO2_std',
 'phosphate_std',
 'platelets x 1000_std',
 'potassium_std',
 'sodium_std',
 'total bilirubin_std',
 'total protein_std',
 'troponin - I_std',
 'urinary specific gravity_std',
 'mechanical_ventilation']

In [None]:
# Feature importance: SHAP values of the fitted booster on the imputed test set
explainer = shap.TreeExplainer(model=XGBOOSTmodel)
shap_values = explainer.shap_values(X=X_test_res)

In [None]:
# SHAP summary plot, recoloured with the 'Reds' colormap
my_cmap = plt.get_cmap('Reds')

f = plt.figure()
shap.summary_plot(
    shap_values,
    X_test_res,
    feature_names=features,
    show=False,
    color_bar=False,
)

# Walk the grandchildren of the current figure and recolour every artist that
# supports a colormap
figure_artists = (
    artist
    for container in plt.gcf().get_children()
    for artist in container.get_children()
)
for artist in figure_artists:
    if hasattr(artist, "set_cmap"):
        artist.set_cmap(my_cmap)
        artist.set_clim(vmin=-0.2)
plt.colorbar(fraction=0.01)

# 24-hour Prediction

In [None]:
# Load the 24-hour train/test arrays from the working directory
X_train, X_test, y_train, y_test = (
    np.load(path)
    for path in (
        'X_train_static_24.npy',
        'X_test_static_24.npy',
        'y_train_static_24.npy',
        'y_test_static_24.npy',
    )
)

In [None]:
# Remove APACHE: drop column index 2 from both feature matrices.
# NOTE: not idempotent - re-running this cell drops a further column.
X_train, X_test = (
    np.delete(matrix, obj=2, axis=1)
    for matrix in (X_train, X_test)
)

In [None]:
# Iterative imputation for the 24-hour split: fit on the training matrix only,
# then fill both splits with the fitted imputer
imp_mean = IterativeImputer(max_iter=10, random_state=0).fit(X_train)
X_train_res, X_test_res = (
    imp_mean.transform(split)
    for split in (X_train, X_test)
)

In [None]:
# Class balance of the training labels: (unique label values, their counts)
np.unique(y_train, return_counts=True)

# XGBoost

In [None]:
# Estimator handed to the hyperparameter search: an imputation step ('i')
# followed by an untuned XGBoost classifier ('m'). Keeping the imputer inside
# the pipeline makes it part of what the search fits on each training fold.
imputer = IterativeImputer(max_iter=10, random_state=0)
XGBOOSTmodel = XGBClassifier(use_label_encoder=False)
pipeline = Pipeline(
    steps=[
        ('i', imputer),
        ('m', XGBOOSTmodel),
    ]
)

In [None]:
# Search space for the Bayesian hyperparameter search (24-hour model).
# Candidate values are listed by hand. Every key carries the 'm__' prefix so
# that it addresses the 'm' (XGBoost) step of the pipeline.

# Tree depth: 1, 2, ..., 12
max_depth = [round(depth) for depth in np.linspace(1, 12, 12, endpoint=True)]
# Number of boosting rounds (trees): 50, 100, ..., 350
n_estimators = [*range(50, 400, 50)]
# Learning rate
lr = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
# Maximum delta step - regularisation for imbalanced data
max_delta_step = [0, 1, 3, 5, 7, 10]
# Minimum split loss (gamma) for overfitting control
min_split_loss = [0, 0.5]
# Positive-class weight for the imbalanced classes; a single fixed candidate
scale_pos_weight = [7.08745445081]

# Assemble the space in the same key order as before
param_grid = dict(
    m__max_depth=max_depth,
    m__n_estimators=n_estimators,
    m__learning_rate=lr,
    m__max_delta_step=max_delta_step,
    m__min_split_loss=min_split_loss,
    m__scale_pos_weight=scale_pos_weight,
)

In [None]:
# Bayesian optimisation over the search space: 50 sampled configurations, each
# scored by cross-validated AUROC on the folds from `cv`. refit=False keeps only
# the search results; the final model is trained separately further down.
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    refit=False,
    verbose=1,
)
opt.fit(X_train, np.ravel(y_train))

In [None]:
# Report the winning parameter combination and its cross-validated score
search_summary = (
    ("Best parameter combo:", opt.best_params_),
    ("Best AUROC:", opt.best_score_),
)
for label, value in search_summary:
    print(label, value)

In [None]:
# Final 24-hour model, to be tested on the stand-alone set.
# The hyperparameters are hard-coded - presumably the best combination reported
# by the search above; confirm against opt.best_params_ after a re-run.
final_params = dict(
    scale_pos_weight=7.08745445081,
    max_depth=12,
    n_estimators=350,
    learning_rate=0.1,
    min_split_loss=0.0,
    max_delta_step=10,
)
XGBOOSTmodel = XGBClassifier(use_label_encoder=False, **final_params)

In [None]:
# Train on the imputed training matrix; labels are flattened to 1-D
XGBOOSTmodel.fit(X=X_train_res, y=np.ravel(y_train))

In [None]:
# Evaluate on the held-out test set. The predictions are computed once and
# reused, instead of calling predict / predict_proba again for every metric.
y_pred = XGBOOSTmodel.predict(X_test_res)
y_score = XGBOOSTmodel.predict_proba(X_test_res)[:, 1]  # column 1: score of the second class

print('Results:')
print('AUROC is:', metrics.roc_auc_score(y_test, y_score))
print('Accuracy is:', metrics.accuracy_score(y_test, y_pred))
print('Average Precision is:', metrics.average_precision_score(y_test, y_score))
print('Weighted F1 is:', metrics.f1_score(y_test, y_pred, average='weighted'))
print('Sensitivity is:', metrics.recall_score(y_test, y_pred))
print('Balanced accuracy is:', metrics.balanced_accuracy_score(y_test, y_pred))
# Specificity is computed as the recall of label 0
print('Specificity is:', metrics.recall_score(y_test, y_pred, pos_label=0))
print('#################################################')

In [None]:
# Make metric plots (ROC and precision-recall curves on the test set).
# plot_roc_curve / plot_precision_recall_curve were deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer the Display API and fall back
# to the old helpers only on installs that predate it.
if hasattr(metrics.RocCurveDisplay, 'from_estimator'):
    metrics.RocCurveDisplay.from_estimator(XGBOOSTmodel, X_test_res, y_test)
    metrics.PrecisionRecallDisplay.from_estimator(XGBOOSTmodel, X_test_res, y_test)
else:
    metrics.plot_roc_curve(XGBOOSTmodel, X_test_res, y_test)
    metrics.plot_precision_recall_curve(XGBOOSTmodel, X_test_res, y_test)
plt.show()

In [None]:
# Feature importance: SHAP values of the fitted booster on the imputed test set
explainer = shap.TreeExplainer(model=XGBOOSTmodel)
shap_values = explainer.shap_values(X=X_test_res)

In [None]:
# SHAP summary plot, recoloured with the 'Reds' colormap
my_cmap = plt.get_cmap('Reds')

f = plt.figure()
shap.summary_plot(
    shap_values,
    X_test_res,
    feature_names=features,
    show=False,
    color_bar=False,
)

# Walk the grandchildren of the current figure and recolour every artist that
# supports a colormap
figure_artists = (
    artist
    for container in plt.gcf().get_children()
    for artist in container.get_children()
)
for artist in figure_artists:
    if hasattr(artist, "set_cmap"):
        artist.set_cmap(my_cmap)
        artist.set_clim(vmin=-0.2)
plt.colorbar(fraction=0.01)

# 18-hour Prediction

In [None]:
# Load the 18-hour train/test arrays from the working directory
X_train, X_test, y_train, y_test = (
    np.load(path)
    for path in (
        'X_train_static_18.npy',
        'X_test_static_18.npy',
        'y_train_static_18.npy',
        'y_test_static_18.npy',
    )
)

In [None]:
# Remove APACHE: drop column index 2 from both feature matrices.
# NOTE: not idempotent - re-running this cell drops a further column.
X_train, X_test = (
    np.delete(matrix, obj=2, axis=1)
    for matrix in (X_train, X_test)
)

In [None]:
# Iterative imputation for the 18-hour split: fit on the training matrix only,
# then fill both splits with the fitted imputer
imp_mean = IterativeImputer(max_iter=10, random_state=0).fit(X_train)
X_train_res, X_test_res = (
    imp_mean.transform(split)
    for split in (X_train, X_test)
)

In [None]:
# Class balance of the training labels: (unique label values, their counts)
np.unique(y_train, return_counts=True)

# XGBoost

In [None]:
# Estimator handed to the hyperparameter search: an imputation step ('i')
# followed by an untuned XGBoost classifier ('m'). Keeping the imputer inside
# the pipeline makes it part of what the search fits on each training fold.
imputer = IterativeImputer(max_iter=10, random_state=0)
XGBOOSTmodel = XGBClassifier(use_label_encoder=False)
pipeline = Pipeline(
    steps=[
        ('i', imputer),
        ('m', XGBOOSTmodel),
    ]
)

In [None]:
# Search space for the Bayesian hyperparameter search (18-hour model).
# Candidate values are listed by hand. Every key carries the 'm__' prefix so
# that it addresses the 'm' (XGBoost) step of the pipeline.

# Tree depth: 1, 2, ..., 12
max_depth = [round(depth) for depth in np.linspace(1, 12, 12, endpoint=True)]
# Number of boosting rounds (trees): 50, 100, ..., 350
n_estimators = [*range(50, 400, 50)]
# Learning rate
lr = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
# Maximum delta step - regularisation for imbalanced data
max_delta_step = [0, 1, 3, 5, 7, 10]
# Minimum split loss (gamma) for overfitting control
min_split_loss = [0, 0.5]
# Positive-class weight for the imbalanced classes; a single fixed candidate
scale_pos_weight = [7.65759312321]

# Assemble the space in the same key order as before
param_grid = dict(
    m__max_depth=max_depth,
    m__n_estimators=n_estimators,
    m__learning_rate=lr,
    m__max_delta_step=max_delta_step,
    m__min_split_loss=min_split_loss,
    m__scale_pos_weight=scale_pos_weight,
)

In [None]:
# Bayesian optimisation over the search space: 50 sampled configurations, each
# scored by cross-validated AUROC on the folds from `cv`. refit=False keeps only
# the search results; the final model is trained separately further down.
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    refit=False,
    verbose=1,
)
opt.fit(X_train, np.ravel(y_train))

In [None]:
# Report the winning parameter combination and its cross-validated score
search_summary = (
    ("Best parameter combo:", opt.best_params_),
    ("Best AUROC:", opt.best_score_),
)
for label, value in search_summary:
    print(label, value)

In [None]:
# Final 18-hour model, to be tested on the stand-alone set.
# The hyperparameters are hard-coded - presumably the best combination reported
# by the search above; confirm against opt.best_params_ after a re-run.
final_params = dict(
    scale_pos_weight=7.65759312321,
    max_depth=12,
    n_estimators=350,
    learning_rate=0.1,
    min_split_loss=0.0,
    max_delta_step=1,
)
XGBOOSTmodel = XGBClassifier(use_label_encoder=False, **final_params)

In [None]:
# Train on the imputed training matrix; labels are flattened to 1-D
XGBOOSTmodel.fit(X=X_train_res, y=np.ravel(y_train))

In [None]:
# Evaluate on the held-out test set. The predictions are computed once and
# reused, instead of calling predict / predict_proba again for every metric.
y_pred = XGBOOSTmodel.predict(X_test_res)
y_score = XGBOOSTmodel.predict_proba(X_test_res)[:, 1]  # column 1: score of the second class

print('Results:')
print('AUROC is:', metrics.roc_auc_score(y_test, y_score))
print('Accuracy is:', metrics.accuracy_score(y_test, y_pred))
print('Average Precision is:', metrics.average_precision_score(y_test, y_score))
print('Weighted F1 is:', metrics.f1_score(y_test, y_pred, average='weighted'))
print('Sensitivity is:', metrics.recall_score(y_test, y_pred))
print('Balanced accuracy is:', metrics.balanced_accuracy_score(y_test, y_pred))
# Specificity is computed as the recall of label 0
print('Specificity is:', metrics.recall_score(y_test, y_pred, pos_label=0))
print('#################################################')

In [None]:
# Make metric plots (ROC and precision-recall curves on the test set).
# plot_roc_curve / plot_precision_recall_curve were deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer the Display API and fall back
# to the old helpers only on installs that predate it.
if hasattr(metrics.RocCurveDisplay, 'from_estimator'):
    metrics.RocCurveDisplay.from_estimator(XGBOOSTmodel, X_test_res, y_test)
    metrics.PrecisionRecallDisplay.from_estimator(XGBOOSTmodel, X_test_res, y_test)
else:
    metrics.plot_roc_curve(XGBOOSTmodel, X_test_res, y_test)
    metrics.plot_precision_recall_curve(XGBOOSTmodel, X_test_res, y_test)
plt.show()

In [None]:
# Feature importance: SHAP values of the fitted booster on the imputed test set
explainer = shap.TreeExplainer(model=XGBOOSTmodel)
shap_values = explainer.shap_values(X=X_test_res)

In [None]:
# SHAP summary plot, recoloured with the 'Reds' colormap
my_cmap = plt.get_cmap('Reds')

f = plt.figure()
shap.summary_plot(
    shap_values,
    X_test_res,
    feature_names=features,
    show=False,
    color_bar=False,
)

# Walk the grandchildren of the current figure and recolour every artist that
# supports a colormap
figure_artists = (
    artist
    for container in plt.gcf().get_children()
    for artist in container.get_children()
)
for artist in figure_artists:
    if hasattr(artist, "set_cmap"):
        artist.set_cmap(my_cmap)
        artist.set_clim(vmin=-0.2)
plt.colorbar(fraction=0.01)

# 12-hour Prediction

In [None]:
# Load the 12-hour train/test arrays from the working directory
X_train, X_test, y_train, y_test = (
    np.load(path)
    for path in (
        'X_train_static_12.npy',
        'X_test_static_12.npy',
        'y_train_static_12.npy',
        'y_test_static_12.npy',
    )
)

In [None]:
# Remove APACHE: drop column index 2 from both feature matrices.
# NOTE: not idempotent - re-running this cell drops a further column.
X_train, X_test = (
    np.delete(matrix, obj=2, axis=1)
    for matrix in (X_train, X_test)
)

In [None]:
# Iterative imputation for the 12-hour split: fit on the training matrix only,
# then fill both splits with the fitted imputer.
# max_iter is spelled out to match the imputers of the other horizons; 10 is
# also the scikit-learn default, so the imputed values do not change.
imp_mean = IterativeImputer(random_state=0, max_iter=10)
imp_mean.fit(X_train)
X_train_res = imp_mean.transform(X_train)
X_test_res = imp_mean.transform(X_test)

In [None]:
# Class balance of the training labels: (unique label values, their counts)
np.unique(y_train, return_counts=True)

# XGBoost

In [None]:
# Estimator handed to the hyperparameter search: an imputation step ('i')
# followed by an untuned XGBoost classifier ('m'). Keeping the imputer inside
# the pipeline makes it part of what the search fits on each training fold.
imputer = IterativeImputer(max_iter=10, random_state=0)
XGBOOSTmodel = XGBClassifier(use_label_encoder=False)
pipeline = Pipeline(
    steps=[
        ('i', imputer),
        ('m', XGBOOSTmodel),
    ]
)

In [None]:
# Search space for the Bayesian hyperparameter search (12-hour model).
# Candidate values are listed by hand. Every key carries the 'm__' prefix so
# that it addresses the 'm' (XGBoost) step of the pipeline.

# Tree depth: 1, 2, ..., 12
max_depth = [round(depth) for depth in np.linspace(1, 12, 12, endpoint=True)]
# Number of boosting rounds (trees): 50, 100, ..., 350
n_estimators = [*range(50, 400, 50)]
# Learning rate
lr = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
# Maximum delta step - regularisation for imbalanced data
max_delta_step = [0, 1, 3, 5, 7, 10]
# Minimum split loss (gamma) for overfitting control
min_split_loss = [0, 0.5]
# Positive-class weight for the imbalanced classes; a single fixed candidate
scale_pos_weight = [7.710456942]

# Assemble the space in the same key order as before
param_grid = dict(
    m__max_depth=max_depth,
    m__n_estimators=n_estimators,
    m__learning_rate=lr,
    m__max_delta_step=max_delta_step,
    m__min_split_loss=min_split_loss,
    m__scale_pos_weight=scale_pos_weight,
)

In [None]:
# Bayesian optimisation over the search space: 50 sampled configurations, each
# scored by cross-validated AUROC on the folds from `cv`. refit=False keeps only
# the search results; the final model is trained separately further down.
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    refit=False,
    verbose=1,
)
opt.fit(X_train, np.ravel(y_train))

In [None]:
# Report the winning parameter combination and its cross-validated score
search_summary = (
    ("Best parameter combo:", opt.best_params_),
    ("Best AUROC:", opt.best_score_),
)
for label, value in search_summary:
    print(label, value)

In [None]:
# Final 12-hour model, to be tested on the stand-alone set.
# The hyperparameters are hard-coded - presumably the best combination reported
# by the search above; confirm against opt.best_params_ after a re-run.
final_params = dict(
    scale_pos_weight=7.710456942,
    max_depth=12,
    n_estimators=350,
    learning_rate=0.1,
    min_split_loss=0.0,
    max_delta_step=1,
)
XGBOOSTmodel = XGBClassifier(use_label_encoder=False, **final_params)

In [None]:
# Train on the imputed training matrix; labels are flattened to 1-D
XGBOOSTmodel.fit(X=X_train_res, y=np.ravel(y_train))

In [None]:
# Evaluate on the held-out test set. The predictions are computed once and
# reused, instead of calling predict / predict_proba again for every metric.
y_pred = XGBOOSTmodel.predict(X_test_res)
y_score = XGBOOSTmodel.predict_proba(X_test_res)[:, 1]  # column 1: score of the second class

print('Results:')
print('AUROC is:', metrics.roc_auc_score(y_test, y_score))
print('Accuracy is:', metrics.accuracy_score(y_test, y_pred))
print('Average Precision is:', metrics.average_precision_score(y_test, y_score))
print('Weighted F1 is:', metrics.f1_score(y_test, y_pred, average='weighted'))
print('Sensitivity is:', metrics.recall_score(y_test, y_pred))
print('Balanced accuracy is:', metrics.balanced_accuracy_score(y_test, y_pred))
# Specificity is computed as the recall of label 0
print('Specificity is:', metrics.recall_score(y_test, y_pred, pos_label=0))
print('#################################################')

In [None]:
# Feature importance: SHAP values of the fitted booster on the imputed test set
explainer = shap.TreeExplainer(model=XGBOOSTmodel)
shap_values = explainer.shap_values(X=X_test_res)

In [None]:
# SHAP summary plot, recoloured with the 'Reds' colormap
my_cmap = plt.get_cmap('Reds')

f = plt.figure()
shap.summary_plot(
    shap_values,
    X_test_res,
    feature_names=features,
    show=False,
    color_bar=False,
)

# Walk the grandchildren of the current figure and recolour every artist that
# supports a colormap
figure_artists = (
    artist
    for container in plt.gcf().get_children()
    for artist in container.get_children()
)
for artist in figure_artists:
    if hasattr(artist, "set_cmap"):
        artist.set_cmap(my_cmap)
        artist.set_clim(vmin=-0.2)
plt.colorbar(fraction=0.01)

# Try with Random Noise to see effects on Shapley values

In [None]:
# Reload the 6-hour train/test arrays for the random-noise experiment
X_train, X_test, y_train, y_test = (
    np.load(path)
    for path in (
        'X_train_static_6.npy',
        'X_test_static_6.npy',
        'y_train_static_6.npy',
        'y_test_static_6.npy',
    )
)

In [None]:
# Remove APACHE: drop column index 2 from both feature matrices.
# NOTE: not idempotent - re-running this cell drops a further column.
X_train, X_test = (
    np.delete(matrix, obj=2, axis=1)
    for matrix in (X_train, X_test)
)

In [None]:
# Shapes of the train and test feature matrices after the column drop
X_train.shape, X_test.shape

In [None]:
# Iterative (MICE-style) imputer, seeded, with at most 10 imputation rounds
imp_mean = IterativeImputer(max_iter=10, random_state=0)

In [None]:
# Fit the imputer on the training matrix only, then fill both splits with it
X_train_res = imp_mean.fit(X_train).transform(X_train)
X_test_res = imp_mean.transform(X_test)

In [None]:
# Prepend one column of standard-normal noise to each imputed matrix, to see
# how a purely random feature shows up in the Shapley values.
# The row counts are read from the arrays themselves instead of being
# hard-coded (previously 20974 and 5244), so the cell keeps working if the
# split sizes change.
# NOTE: not idempotent - re-running this cell stacks a further noise column.
mu, sigma = 0, 1
noise_train = np.random.normal(mu, sigma, [X_train_res.shape[0], 1])
noise_test = np.random.normal(mu, sigma, [X_test_res.shape[0], 1])

X_train_res = np.hstack((noise_train, X_train_res))
X_test_res = np.hstack((noise_test, X_test_res))

In [None]:
# Model for the noise experiment; it reuses the hard-coded hyperparameters of
# the final 6-hour model.
final_params = dict(
    scale_pos_weight=7.352847471127041,
    max_depth=10,
    n_estimators=200,
    learning_rate=0.1,
    min_split_loss=0.5,
    max_delta_step=0,
)
XGBOOSTmodel = XGBClassifier(use_label_encoder=False, **final_params)

In [None]:
# Train on the noise-augmented training matrix; labels are flattened to 1-D
XGBOOSTmodel.fit(X=X_train_res, y=np.ravel(y_train))

In [None]:
# Feature labels for the noise experiment. The synthetic column was stacked in
# front of the imputed matrices, so 'Noise' is prepended to the base label list
# defined in the 6-hour section instead of repeating all of its entries here
# (the duplicated literal could silently drift from the original).
# The guard keeps the cell idempotent when it is re-run.
# NOTE: `features` is rebound, so the SHAP cells of the earlier sections must
# not be re-run after this cell without re-running the base list first.
if features[0] != 'Noise':
    features = ['Noise'] + features

In [None]:
# Feature importance: SHAP values of the fitted booster on the noise-augmented
# test set
explainer = shap.TreeExplainer(model=XGBOOSTmodel)
shap_values = explainer.shap_values(X=X_test_res)

In [None]:
# SHAP summary plot, recoloured with the 'Reds' colormap
my_cmap = plt.get_cmap('Reds')

f = plt.figure()
shap.summary_plot(
    shap_values,
    X_test_res,
    feature_names=features,
    show=False,
    color_bar=False,
)

# Walk the grandchildren of the current figure and recolour every artist that
# supports a colormap
figure_artists = (
    artist
    for container in plt.gcf().get_children()
    for artist in container.get_children()
)
for artist in figure_artists:
    if hasattr(artist, "set_cmap"):
        artist.set_cmap(my_cmap)
        artist.set_clim(vmin=-0.2)
plt.colorbar(fraction=0.01)