In [None]:
##install and import necessary modules
##this code was originally designed and run in google colab
##use outside of colab may require modification
##if using colab, you may need to restart your runtime after installing modules,
##depending on the environment at the time the code is run.
##due to potential module dependencies, we will install DeepTables later

!pip install scikit-learn==1.5.2
!pip install tensorflow==2.12.1
!pip install xgboost==2.0.2
!pip install shap
import shap
import sys
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sn
from google.colab import drive
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc, precision_recall_curve, recall_score, confusion_matrix, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample
sn.set(style='whitegrid')

print("Python version:", sys.version)
print("scikit-learn version:", sklearn.__version__)
print("XGBoost version:", xgb.__version__)
print("shap version:", shap.__version__)

In [None]:
##locate the dataset

# Replace <MOUNT_POINT> with the directory where Google Drive is mounted (e.g., /content/drive) and
# <YOUR_FILE_PATH> with the path of the file inside the drive (e.g., My Drive/FileNameHere).
file_path = '<MOUNT_POINT>/<YOUR_FILE_PATH>.csv'

##Colab only: mount Google Drive at the same <MOUNT_POINT> used in file_path above
drive.mount('<MOUNT_POINT>')

In [None]:
# Import data; every string listed in na_values is read as a missing value (NaN)
data = pd.read_csv(
    file_path,
    na_values=['NA', 'N/A', 'NULL', ' ', '', '-99', '-98', '-99.0', '-99.00', '-98.0', '-98.00', 'NaN'],
)

# Filter out rows where 'TRAUMATYPE' is 26, 'Other/unspecified', or 'Burn'.
# Some input populations may not carry a TRAUMATYPE column; in that case the filter is
# skipped explicitly (and reported) instead of silently swallowing every possible error.
exclude_values = ['26', 'Other/unspecified', 'Burn']
if 'TRAUMATYPE' in data.columns:
    data = data[~data['TRAUMATYPE'].isin(exclude_values)]
else:
    print("Column 'TRAUMATYPE' not found; trauma-type filter skipped.")

In [None]:
##preview the first rows of the loaded (and filtered) dataframe to confirm it was read as expected
data.head()

In [None]:
##count the missing values in every column of the raw dataframe
data.isna().sum()

In [None]:
##create a dataframe of all complications/things not available on admission.
##All of these are removed from the X data set later, and one of them is picked as the Y data set.

complications_list= [
                    'HC_CLABSI', 'HC_DEEPSSI', 'HC_DVTHROMBOSIS', 'HC_ALCOHOLWITHDRAWAL', 'HC_CARDARREST', 'HC_CAUTI',
                    'HC_EMBOLISM', 'HC_EXTREMITYCS', 'HC_INTUBATION', 'HC_KIDNEY', 'HC_MI', 'HC_ORGANSPACESSI',
                    'HC_OSTEOMYELITIS', 'HC_RESPIRATORY', 'HC_RETURNOR', 'HC_SEPSIS', 'HC_STROKECVA', 'HC_SUPERFICIALINCISIONSSI',
                    'HC_PRESSUREULCER', 'HC_UNPLANNEDICU', 'HC_VAPNEUMONIA',
                    ##'EDDISCHARGEDISPOSITION',
                    'HOSPDISCHARGEDISPOSITION',
                    ##'EDDISCHARGEHRS',
                    'WITHDRAWALLST',
                    'VTEPROPHYLAXISTYPE',
                    'TOTALICULOS',
                    'TOTALVENTDAYS',
                    'VTEPROPHYLAXISHRS',
                    'VTEPROPHYLAXISDAYS', 'MORTALITY', 'EDDISCHARGEDAYS','FINALDISCHARGEDAYS','FINALDISCHARGEHRS', 'HMRRHGCTRLSURGDAYS',  'WITHDRAWALLSTHRS',
                    ##'AMERICANINDIAN', 'ASIAN', 'BLACK', 'PACIFICISLANDER', 'RACEOTHER', 'WHITE', 'RACE_NA', 'RACE_UK',
                    'ISS_05'
                    , 'AIS_FACE', 'AIS_NECK', 'AIS_HEAD', 'AIS_THORAX', 'AIS_ABDOMEN', 'AIS_SPINE', 'AIS_UPPEREX', 'AIS_LOWEREX', 'AIS_SKIN', 'AIS_OTHER'
                    ##, 'VTEPPXStartOver48', 'VTEPPXStartOver24', 'ICUOver48', 'ICUOver24', 'VentOver48', 'VentOver24'
                    , 'VTEPPXStartOver72', 'VTEPPXStartOver96', 'ICUOver72', 'ICUOver96', 'VentOver72', 'VentOver96'
                    , 'FacilityTotalWLST', 'factilityTotalPatients', 'FacilityWLSTRate'
                    , 'facilityWLSTNew', 'WLSTRateNew', 'WLSTRateNewCensored'
                    ]
##select all listed columns in one step (a copy, so `data` is not affected by later edits)
##rather than growing an empty dataframe one column at a time
complications_df = data[complications_list].copy()
complications_df

In [None]:
##choose the outcome variable: withdrawal of life-sustaining treatment ('WITHDRAWALLST'),
##stored under the column name 'WLST' in its own dataframe
Y_data = pd.DataFrame({'WLST': data['WITHDRAWALLST']})
Y_data

In [None]:
##recode the outcome: "Yes" becomes 1 and "No" becomes 0
Y_data = Y_data.assign(WLST=Y_data['WLST'].replace({'Yes': 1, 'No': 0}))
Y_data

In [None]:
##build the input space: everything in `data` except the columns listed in complications_list
X_data = data.drop(complications_list, axis=1)
X_data.shape

In [None]:
##count cases whose outcome is missing; these cases are removed in the following cells
Missing_Y = Y_data.isna().sum()
Missing_Y

In [None]:
##collect the index labels of every row in Y that has at least one missing value
##(vectorized null mask instead of a Python-level loop over rows)
bad_row_index_list = Y_data.index[Y_data.isnull().any(axis=1)].tolist()
bad_row_index_list

In [None]:
##drop the rows with a missing outcome from Y
Y_clean = Y_data.drop(index=bad_row_index_list)
Y_clean

In [None]:
##confirm that no missing outcome values remain after the drop
Missing_Y_clean = Y_clean.isna().sum()
Missing_Y_clean

In [None]:
##drop the same rows from X so that X and Y stay aligned
X_data = X_data.drop(index=bad_row_index_list)

In [None]:
##list the input variables that contain missing values, with their counts
Missing = X_data.isna().sum()
Missing.loc[Missing > 0]

In [None]:
##compute, for every input variable, the percentage of rows with a missing value
data_missing = (X_data.isna().sum() / len(X_data)) * 100
data_missing

In [None]:
##display the variables WITHOUT any missing data
data_missing.loc[data_missing == 0].index

In [None]:
#keep only the columns that have missing values (drop the complete ones from data_missing)
data_missing = data_missing[data_missing != 0]
data_missing

In [None]:
#sort by percentage of missing values in descending order (largest first)
data_missing = data_missing.sort_values(ascending=False)
data_missing

In [None]:
##flag the variables whose percentage of missing values is at or above the cutoff (>= 50%)
dropCutoff = 50
bad_column_names = data_missing.loc[data_missing >= dropCutoff].index
bad_column_names

In [None]:
##perform median imputation for continuous variables and mode imputation for categorical ones

##drop the sparse columns flagged in bad_column_names; a new dataframe is created so X_data is left untouched
X_data_new = X_data.drop(columns=bad_column_names)

##the columns still listed in data_missing with less than dropCutoff percent missing are the ones to impute
to_be_cleaned_column_names = data_missing[data_missing < dropCutoff].index

##NOTE(review): the imputation values are computed on the full dataset, i.e. before the
##train/test split performed later in the notebook - confirm this is acceptable for the analysis.
for c in to_be_cleaned_column_names:
    column = X_data_new[c]
    if column.dtype == np.dtype('O'): # non-numeric values
        most_frequent = column.value_counts().index[0] # value_counts ignores NaN and sorts by frequency
        X_data_new[c] = column.fillna(most_frequent).astype(object)
    else: # numeric
        X_data_new[c] = column.fillna(column.median()) # median skips NaN by default

In [None]:
##total number of missing values left in the input space (expected to be 0)
X_data_new.isna().sum().sum()

In [None]:
##preview the imputed input dataframe to verify it appears as intended
X_data_new.head()

In [None]:
# Replace 'TRAUMATYPE' with a binary 'Penetrating' column (Penetrating -> 1, Blunt -> 0):
# the new column is appended first, then the original 'TRAUMATYPE' column is dropped.
X_data_new = (
    X_data_new
    .assign(Penetrating=X_data_new['TRAUMATYPE'].map({'Penetrating': 1, 'Blunt': 0}))
    .drop(columns=['TRAUMATYPE'])
)

In [None]:
##remove any additional variables that are not needed
##"RACE" and "TRANSPORTMODE" are composite variables that have already been one-hot encoded;
##errors='ignore' lets the cell run even when a column is absent
columns_to_remove = ['RACE', 'TRANSPORTMODE']
X_data_new = X_data_new.drop(columns_to_remove, axis=1, errors='ignore')

In [None]:
##Convert Yes/No style values and ordinal text scales to numbers so they are not expanded into
##one-hot variables later.
##The code is meant to be reusable between different populations of input data, and not every
##population has all of these variables. Each column is therefore recoded only if it is present,
##instead of wrapping every step in a bare try/except that would hide any kind of error.

##global recode of binary values (male coded as 0, female coded as 1)
try:
    X_data_new = X_data_new.replace({True: 1, 'Yes': 1, "Female": 1, False: 0, 'No': 0, "Male": 0})
except (TypeError, ValueError) as exc:
    ##keep the original best-effort behaviour, but report the problem instead of hiding it
    print("Global Yes/No recode skipped:", exc)

##shared code tables for the GCS components
gcs_eye_codes = {'None': 1, 'To pressure': 2, 'To sound': 3, 'Spontaneous': 4}
##the two verbal columns were originally mapped with different spellings ('Oriented' vs
##'Orientated'); both spellings are accepted for both columns so neither keeps an unmapped string
gcs_verbal_codes = {'None': 1, 'Sounds': 2, 'Words': 3, 'Confused': 4, 'Oriented': 5, 'Orientated': 5}
gcs_motor_codes = {'None': 1, 'Extension': 2, 'Abnormal Flexion': 3,
                   'Normal Flexion': 4, 'Localising': 5, 'Obeys commands': 6}

##column name -> {text value: numeric code}
column_codes = {
    'ETHNICITY': {'Hispanic or Latino': 1, 'Not Hispanic or Latino': 0},  ##not hispanic coded as 0, hispanic coded as 1
    'EMSGCSEYE': gcs_eye_codes,
    'GCSEYE': gcs_eye_codes,
    'EMSGCSVERBAL': gcs_verbal_codes,
    'EMSGCSMOTOR': gcs_motor_codes,
    'TBIGCSMOTOR': gcs_motor_codes,
    'GCSVERBAL': gcs_verbal_codes,
    'GCSMOTOR': gcs_motor_codes,
    'RESPIRATORYASSISTANCE': {'Assisted Respiratory Rate': 1, 'Unassisted Respiratory Rate': 0},
    'SUPPLEMENTALOXYGEN': {'Supplemental Oxygen': 1, 'No Supplemental Oxygen': 0},
}

for column_name, codes in column_codes.items():
    if column_name in X_data_new.columns:  ##skip variables this population does not have
        X_data_new[column_name] = X_data_new[column_name].replace(codes)

X_data_new.head()

In [None]:
##list the non-numeric columns that need to be converted to numbers with one-hot encoding.
##select_dtypes picks up both object and pandas categorical columns; the previous check,
##np.dtype('O', 'category'), passed 'category' as numpy's `align` argument and therefore
##only ever matched object columns.
categorical_column = X_data_new.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_column

In [None]:
##count how many variables will be one-hot encoded
len(categorical_column)

In [None]:
##dataframe shape (rows, columns) before one-hot encoding
X_data_new.shape

In [None]:
##one-hot encode the categorical variables listed above (dense output, the pandas default)
X_clean = pd.get_dummies(data=X_data_new, columns=categorical_column)
X_clean.shape

In [None]:
##shape of the cleaned true-label dataframe; its row count should match X_clean
Y_clean.shape

In [None]:
##total number of missing values in the encoded input space (expected to be 0)
X_clean.isna().sum().sum()

In [None]:
##drop the patient ID column ('inc_key') so it is not used as a predictor
X_clean = X_clean.drop(columns=['inc_key'])

In [None]:
##turn boolean values (e.g. the one-hot indicator columns) into numeric 0/1
X_clean = X_clean.replace({False: 0, True: 1})

In [None]:
##preview the fully encoded input dataframe to verify it appears as intended
X_clean.head()

In [None]:
##hold out 20% of the cleaned data as the test set (fixed seed for a reproducible split)
X_train, X_test, Y_train, Y_test = train_test_split(
    X_clean,
    Y_clean,
    random_state=0,
    test_size=0.2,
)

In [None]:
##Before converting to NumPy arrays, keep copies of the four splits in their pandas form
##(these "_tensor" variables are dataframe copies) so the labelled versions stay available if needed

X_train_tensor=X_train.copy()
Y_train_tensor=Y_train.copy()
X_test_tensor=X_test.copy()
Y_test_tensor=Y_test.copy()
##the test-label copy was originally created under the misspelled name below;
##the alias keeps any code that still uses that spelling working
Y_test_tesnor=Y_test_tensor

In [None]:
##convert the splits to NumPy arrays; the label arrays are flattened to one dimension
X_train, X_test = X_train.values, X_test.values
Y_train = Y_train.values.ravel()
Y_test = Y_test.values.ravel()

In [None]:
##X_train, Y_train, X_test and Y_test are NumPy arrays at this point

##standardize the features; the scaling parameters are learned from the training set only
scaler=StandardScaler()
scaler.fit(X_train)
##apply the training-set scaling to the training and test features
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

##also scale the pandas copies created earlier. They are passed as plain arrays because the
##scaler was fitted on an array without column names; handing it a dataframe makes
##scikit-learn warn that the feature names do not match how it was fitted.
X_train_tensor_s = scaler.transform(X_train_tensor.to_numpy())
X_test_tensor_s = scaler.transform(X_test_tensor.to_numpy())

In [None]:
##split the scaled training set again: 80% for model fitting, 20% for validation/calibration
X_train_s_cal, X_val_s_cal, Y_train_cal, Y_val_cal = train_test_split(
    X_train_s,
    Y_train,
    random_state=0,
    test_size=0.2,
)

In [None]:
##hyper-parameter search spaces, one dictionary per model

##KNN: odd neighbour counts from 1 to 809
n_list = np.arange(1, 810, 2)
param_grid_knc = dict(n_neighbors=n_list)

##Random forest
param_grid_rf = dict(
    n_estimators=[100, 200, 400],       ## number of trees in the forest
    max_depth=[None, 10, 20, 30],       ## maximum depth of the trees
    min_samples_split=[2, 5, 10],       ## minimum samples required to split an internal node
    min_samples_leaf=[1, 2, 4],         ## minimum samples required at a leaf node
    max_features=['sqrt'],              ## number of features considered for the best split
)

##Logistic regression
param_grid_lr = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],   ## inverse of regularization strength
    penalty=['l1', 'l2'],               ## regularization type
    solver=['liblinear', 'saga'],       ## optimization algorithm
    max_iter=[100, 200, 300],           ## maximum number of iterations
)

##XGBoost
param_grid_gb = dict(
    learning_rate=[0.01, 0.05, 0.1],    ## learning rate
    max_depth=[3, 5, 7],                ## maximum depth of the trees
    subsample=[0.6, 0.8, 1.0],          ## subsample ratio of the training instances
    colsample_bytree=[0.6, 0.8, 1.0],   ## subsample ratio of columns when constructing each tree
    n_estimators=[50, 100, 150],        ## number of trees
)



In [None]:
##tune the XGBoost hyper-parameters
model_gb = xgb.XGBClassifier(random_state=0)  ##unfitted base model with a fixed seed
##exhaustive grid search scored by ROC AUC; cv=5 means 5-fold cross-validation
gs_gb = GridSearchCV(model_gb, param_grid_gb, scoring='roc_auc', cv=5, verbose=2)

##run the search on the fitting portion of the training data
gs_gb.fit(X_train_s_cal, Y_train_cal)

In [None]:
##display the best hyper-parameter combination found for XGBoost
gs_gb.best_params_

In [None]:
##now, optimize LR hyperparameters
##the estimator is seeded like the XGBoost and random forest models: the 'liblinear' and 'saga'
##solvers in the grid shuffle the data, so without a seed the search is not reproducible
model_lr=LogisticRegression(random_state=0) #create an empty model
##initialize gridsearch
gs_lr = GridSearchCV(estimator=model_lr,
                  param_grid=param_grid_lr,
                  scoring='roc_auc',
                  cv=5, verbose=2)
#set cv=5, then it will do 5-fold cross-validation

#actually perform hyperparameter optimization
gs_lr.fit(X_train_s_cal, Y_train_cal)

In [None]:
##display the best hyper-parameter combination found for logistic regression
gs_lr.best_params_

In [None]:
##tune the random forest hyper-parameters
model_rf = RandomForestClassifier(random_state=0)  ##unfitted base model with a fixed seed
##exhaustive grid search scored by ROC AUC; cv=5 means 5-fold cross-validation
gs_rf = GridSearchCV(model_rf, param_grid_rf, scoring='roc_auc', cv=5, verbose=2)

##run the search on the fitting portion of the training data
gs_rf.fit(X_train_s_cal, Y_train_cal)

In [None]:
##display the best hyper-parameter combination found for the random forest
gs_rf.best_params_

In [None]:
##KNN with GS_CV to optimize hyperparameter
##(KNeighborsClassifier is already imported in the first cell, so it is not re-imported here)
model_knno=KNeighborsClassifier() #create an empty model
##initialize gridsearch
gs_knno = GridSearchCV(estimator=model_knno,
                  param_grid=param_grid_knc,
                  scoring='roc_auc',
                  cv=5, verbose=2)
#set cv=5, then it will do 5-fold cross-validation

#actually perform hyperparameter optimization
gs_knno.fit(X_train_s_cal, Y_train_cal)

In [None]:
##display the best number of neighbours found for KNN
gs_knno.best_params_

In [None]:
##make independent copies of the scaled splits for use with the neural networks

##fitting portion of the training data
X_clean_nn_train, Y_clean_nn_train = X_train_s_cal.copy(), Y_train_cal.copy()

##validation/calibration portion
X_clean_nn_cal, Y_clean_nn_cal = X_val_s_cal.copy(), Y_val_cal.copy()

##held-out test set
X_clean_nn_test, Y_clean_nn_test = X_test_s.copy(), Y_test.copy()

In [None]:
##wrap the arrays in pandas objects: features as dataframes, labels as series

##features
X_train_df = pd.DataFrame(data=X_clean_nn_train)
X_val_df = pd.DataFrame(data=X_clean_nn_cal)
X_test_df = pd.DataFrame(data=X_clean_nn_test)

##labels
Y_train_s = pd.Series(data=Y_clean_nn_train)
Y_val_s = pd.Series(data=Y_clean_nn_cal)
Y_test_s = pd.Series(data=Y_clean_nn_test)

In [None]:
!pip install deeptables
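##re-pin scikit-learn to 1.5 after installing DeepTables to resolve dependency issues
##(scikit-learn is already imported above, so a runtime restart may be needed for the change to take effect)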
!pip install scikit-learn==1.5
import deeptables
print("dt version:", deeptables.__version__)
from deeptables.models.deeptable import DeepTable, ModelConfig
from deeptables.models.deepnets import DeepFM, WideDeep, DCN

In [None]:
#first neural network: DeepFM
# `auto_discrete` controls whether continuous variables are discretized automatically.
conf = ModelConfig(nets=DeepFM, metrics=['AUC', 'accuracy'], auto_discrete=True)
dt = DeepTable(config=conf)

#fit for 100 epochs, monitoring the validation/calibration split
model, history = dt.fit(
    X_train_df,
    Y_train_s,
    validation_data=(X_val_df, Y_val_s),
    epochs=100,
)

#evaluate and predict on the held-out test set
score = dt.evaluate(X_test_df, Y_test_s)
preds = dt.predict(X_test_df)

In [None]:
# ROC analysis of the network currently held in `dt`, on the held-out test set
# (column 1 of predict_proba is presumably the positive-class probability - confirm)
y_pred_prob_ANN = dt.predict_proba(X_clean_nn_test)[:, 1]
fpr_ANN, tpr_ANN, thresholds = roc_curve(Y_clean_nn_test, y_pred_prob_ANN)
roc_auc_ANN = auc(fpr_ANN, tpr_ANN)  # area under the ROC curve

# Draw the ROC curve against the diagonal reference line
plt.figure(figsize=(8, 8))
plt.plot(fpr_ANN, tpr_ANN, label=f'ROC curve (area = {roc_auc_ANN:.3f})', color='darkorange', lw=2)
plt.plot([0, 1], [0, 1], label='Random', color='navy', lw=2, linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
#second neural network: WideDeep
# `auto_discrete` controls whether continuous variables are discretized automatically.
conf = ModelConfig(nets=WideDeep, metrics=['AUC', 'accuracy'], auto_discrete=True)
dt = DeepTable(config=conf)

#fit for 100 epochs, monitoring the validation/calibration split
model, history = dt.fit(
    X_train_df,
    Y_train_s,
    validation_data=(X_val_df, Y_val_s),
    epochs=100,
)

#evaluate and predict on the held-out test set
score = dt.evaluate(X_test_df, Y_test_s)
preds = dt.predict(X_test_df)

In [None]:
# ROC analysis of the network currently held in `dt`, on the held-out test set
# (column 1 of predict_proba is presumably the positive-class probability - confirm)
y_pred_prob_ANN = dt.predict_proba(X_clean_nn_test)[:, 1]
fpr_ANN, tpr_ANN, thresholds = roc_curve(Y_clean_nn_test, y_pred_prob_ANN)
roc_auc_ANN = auc(fpr_ANN, tpr_ANN)  # area under the ROC curve

# Draw the ROC curve against the diagonal reference line
plt.figure(figsize=(8, 8))
plt.plot(fpr_ANN, tpr_ANN, label=f'ROC curve (area = {roc_auc_ANN:.3f})', color='darkorange', lw=2)
plt.plot([0, 1], [0, 1], label='Random', color='navy', lw=2, linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
#third neural network: DCN
# `auto_discrete` controls whether continuous variables are discretized automatically.
conf = ModelConfig(nets=DCN, metrics=['AUC', 'accuracy'], auto_discrete=True)
dt = DeepTable(config=conf)

#fit for 100 epochs, monitoring the validation/calibration split
model, history = dt.fit(
    X_train_df,
    Y_train_s,
    validation_data=(X_val_df, Y_val_s),
    epochs=100,
)

#evaluate and predict on the held-out test set
score = dt.evaluate(X_test_df, Y_test_s)
preds = dt.predict(X_test_df)

In [None]:
# ROC analysis of the network currently held in `dt`, on the held-out test set
# (column 1 of predict_proba is presumably the positive-class probability - confirm)
y_pred_prob_ANN = dt.predict_proba(X_clean_nn_test)[:, 1]
fpr_ANN, tpr_ANN, thresholds = roc_curve(Y_clean_nn_test, y_pred_prob_ANN)
roc_auc_ANN = auc(fpr_ANN, tpr_ANN)  # area under the ROC curve

# Draw the ROC curve against the diagonal reference line
plt.figure(figsize=(8, 8))
plt.plot(fpr_ANN, tpr_ANN, label=f'ROC curve (area = {roc_auc_ANN:.3f})', color='darkorange', lw=2)
plt.plot([0, 1], [0, 1], label='Random', color='navy', lw=2, linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()