In [None]:
##install and import necessary modules
##this code was originally designed and run in google colab
##use outside of colab may require modification
##if using colab, you may need to restart your runtime after installing modules,
##depending on the environment at the time the code is run.
##due to potential module dependencies, we will install DeepTables later

!pip install scikit-learn==1.5.2
!pip install tensorflow==2.12.1
!pip install xgboost==2.0.2
import time
import os
import sys
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sn
from google.colab import drive
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, ParameterGrid
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc, precision_recall_curve, recall_score, confusion_matrix, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Use seaborn's 'whitegrid' style for the plots drawn later in the notebook.
sn.set(style='whitegrid')

# Print the interpreter and key library versions actually loaded in this runtime.
print("Python version:", sys.version)
print("scikit-learn version:", sklearn.__version__)
print("XGBoost version:", xgb.__version__)

In [None]:
#import your dataset
##mount google drive if using in colab. Replace <MOUNT_POINT> with the directory where you want to mount the drive (e.g., /content/drive).
##the mount point is defined once so the mount call and the file path cannot drift apart.
MOUNT_POINT = '<MOUNT_POINT>'
drive.mount(MOUNT_POINT)

# Replace <YOUR_FILE_PATH> with the actual path inside your Google Drive (e.g., My Drive/FileNameHere).
file_path = f'{MOUNT_POINT}/<YOUR_FILE_PATH>.csv'

In [None]:
##columns to read from the dataset: only those needed for the score plus the
##variables needed for the ISS/TRISS comparisons. Order is preserved.

_core_columns = [
    'AGEYEARS', 'TOTALGCS', 'SBP', 'TEMPERATURE', 'PULSERATE',
    'TRISS', 'TRISS_Death', 'MORTALITY', 'TRAUMATYPE', 'WEIGHT',
    'ISS_05', 'NumberOfInjuries',
]

_injury_columns = [
    'IntracranialVascularInjury', 'BrainStemInjury', 'EDH', 'SAH', 'SDH',
    'SkullFx', 'DAI', 'NeckVascularInjury', 'ThoracicVascularInjury',
    'AeroDigestiveInjury', 'CardiacInjury', 'LungInjury', 'AbdominalVascular',
    'RibFx', 'KidneyInjury', 'StomachInjury', 'SpleenInjury',
    'UroGenInternalInjury', 'SCI', 'SpineFx', 'UEAmputation',
    'UEVascularInjury', 'UELongBoneFx', 'LEVascularInjury', 'PelvicFx',
    'LEAmputation', 'PancreasInjury', 'LELongBoneFx', 'LiverInjury',
    'ColorectalInjury', 'SmallBowelInjury', 'IPH',
]

columns_to_load = _core_columns + _injury_columns

In [None]:
# Import data and specify missing values
data = pd.read_csv(file_path, na_values=['NA', 'N/A', 'NULL', ' ', '', '-99', '-98', '-99.0', '-99.00', '-98.0', '-98.00', 'NaN'], usecols=columns_to_load)


# Filter out rows where 'TRAUMATYPE' is 26, 'Other/unspecified', or 'Burn'.
# Deliberately NOT wrapped in try/except: a swallowed error here would silently leave the
# excluded mechanisms in the cohort. Rows with a missing TRAUMATYPE are kept (isin is False for NaN).
exclude_values = ['26', 'Other/unspecified', 'Burn']
data = data[~data['TRAUMATYPE'].isin(exclude_values)]

##explicitly list variables that need to be present for inclusion and drop cases without these
##we cannot compare our score to ISS/TRISS without those metrics, and we need our target outcome mortality
required_vars = ['ISS_05', 'TRISS_Death', 'MORTALITY']
data = data.dropna(subset=required_vars)

# Create ShockIndex (PULSERATE / SBP), with SBP == 0 mapped to a fixed value of 2.0
data['ShockIndex'] = np.where(
    data['SBP'] == 0, 2.0,  # Case where SBP is 0 → set ShockIndex to 2.0
    data['PULSERATE'] / data['SBP']  # Normal calculation
)

# Set ShockIndex to NaN if PULSERATE or SBP is missing
# (needed because a missing PULSERATE with SBP == 0 would otherwise receive 2.0 above)
data.loc[data['PULSERATE'].isna() | data['SBP'].isna(), 'ShockIndex'] = np.nan

##reset indices of the df so row labels are contiguous after the filtering above
data = data.reset_index(drop=True)

In [None]:
##verify data appears as intended: display the first rows of the filtered frame (including ShockIndex)
data.head()

In [None]:
##count missing values in each column
data.isna().sum()

In [None]:
##create a dataframe of all complications/vars to remove later. All of these are removed from
##the X data set, and one of them is picked as the Y data set.

complications_list = ['MORTALITY', 'TRISS']
complications_df = data[complications_list].copy()
complications_df

In [None]:
##choose the outcome variable, mortality, and keep it in its own single-column dataframe

Y_data = data[['MORTALITY']].copy()
Y_data

In [None]:
##clean Y_data by recoding "Yes" as 1 and "No" as 0; values other than "Yes"/"No" are left as they are

Y_data['MORTALITY'] = Y_data['MORTALITY'].replace({'Yes': 1, 'No': 0})
Y_data

In [None]:
##remove the outcome from the feature space, along with TRISS: we use 1-TRISS (aka TRISS_Death),
##so the TRISS variable itself is no longer needed
outcome_and_unused = ['MORTALITY', 'TRISS']
X_data = data.drop(outcome_and_unused, axis='columns')
X_data.shape

In [None]:
##ensure no missing outcome data (count of nulls per outcome column)
Missing_Y = Y_data.isna().sum()
Missing_Y

In [None]:
##If the check above shows no missing values, the outcome data is clean; work on an independent copy
Y_clean=Y_data.copy()

In [None]:
##re-check the copied outcome frame: every count should be zero if the outcome data is clean
Missing_Y_clean = Y_clean.isna().sum()
Missing_Y_clean

In [None]:
##check which variables in the input space have missing values

Missing = X_data.isna().sum()
Missing.loc[Missing > 0]

In [None]:
##compute the percentage of missing values for every input variable

missing_counts = X_data.isnull().sum(axis=0)
n_rows = X_data.shape[0]
data_missing = (missing_counts / n_rows) * 100
data_missing

In [None]:
##display variables withOUT missing data

data_missing.loc[data_missing == 0].index

In [None]:
#remove the good columns (no missing values) from data_missing

complete_columns = data_missing.index[(data_missing == 0).to_numpy()]
data_missing = data_missing.drop(labels=complete_columns)
data_missing

In [None]:
#sort this in descending order (variables with the highest percentage of missing values first)
data_missing = data_missing.sort_values(ascending=False)
data_missing

In [None]:
##prepare to drop variables with >=50% missing values
##tried different cutoffs for this (33%, 66%), but 50% yielded best results

dropCutoff = 50
exceeds_cutoff = data_missing >= dropCutoff
bad_column_names = data_missing[exceeds_cutoff].index
bad_column_names

In [None]:
##actually drop bad variables
X_data_new = X_data.drop(columns=bad_column_names)

##check which variables still have missing data (below the drop cutoff)
Missing = X_data_new.isna().sum()
Missing.loc[Missing > 0]

In [None]:
#display columns below the drop cutoff that still have missing values and need to be cleaned
#uses dropCutoff (rather than a second hard-coded 50) so this list always complements bad_column_names

to_be_cleaned_column_names = data_missing[data_missing < dropCutoff].index
to_be_cleaned_column_names

In [None]:
# Derive a binary 'Penetrating' column from 'TRAUMATYPE' (Penetrating -> 1, Blunt -> 0;
# any other value, including missing, becomes NaN)
penetrating_flag = X_data_new['TRAUMATYPE'].map({'Penetrating': 1, 'Blunt': 0})

# Replace the old 'TRAUMATYPE' column with the new flag (appended as the last column)
X_data_new = X_data_new.drop(columns=['TRAUMATYPE']).assign(Penetrating=penetrating_flag)

print(X_data_new.head())

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# One "name: dtype" entry per column of X_data_new
columns_info = [
    f"{column_name}: {dtype}"
    for column_name, dtype in X_data_new.dtypes.items()
]
formatted_columns_info = "\n".join(columns_info)

# Print column names and data types
print("Column Names and Data Types:")
print(formatted_columns_info)

In [None]:
##convert No's and Yes's to 0's and 1's to minimize the amount of double variables (want to avoid Yes/Nos being converted to 1-hot variables)

binary_recode = {True: 1, 'Yes': 1, "Female": 1, False: 0, 'No': 0, "Male": 0}
try:
    X_data_new = X_data_new.replace(binary_recode)
except (TypeError, ValueError) as exc:
    # Still best-effort, but no longer silent: an un-recoded frame would change the
    # feature space downstream, so make the failure visible.
    print(f"WARNING: Yes/No recoding was skipped: {exc!r}")

##drop any non blunt/penetrating mechanisms
##errors='ignore' drops whichever of these columns exist; a plain drop() raises if either one
##is absent, which previously meant neither was removed.
X_data_new = X_data_new.drop(columns=['TRAUMATYPE_26', 'TRAUMATYPE_Other/unspecified'], errors='ignore')

X_data_new.head()

In [None]:
##split into train, test, calibrate sets:
##first hold out 20% as the test set, then hold out 20% of the remaining training data
##as the validation/calibration set; both splits are stratified on the outcome
_split_opts = dict(test_size=0.2, random_state=0)

X_train, X_test, Y_train, Y_test = train_test_split(
    X_data_new, Y_clean, stratify=Y_clean, **_split_opts
)
X_train_cal, X_val_cal, Y_train_cal, Y_val_cal = train_test_split(
    X_train, Y_train, stratify=Y_train, **_split_opts
)

In [None]:
##perform median/mode imputation on the input vars that are missing
##The columns to impute are taken from the split frames themselves rather than from
##to_be_cleaned_column_names: that list was built before 'TRAUMATYPE' was replaced by
##'Penetrating', so it can name a column that no longer exists (KeyError) and it never
##contains 'Penetrating' (whose NaNs would then reach the scaler/models).
##Fill values are computed from X_train only and applied to every split.
split_frames = [X_train, X_test, X_train_cal, X_val_cal]
columns_to_impute = [
    c for c in X_train.columns
    if any(frame[c].isnull().any() for frame in split_frames)
]

for c in columns_to_impute:
    observed = X_train[c].dropna()
    if observed.empty:
        # No observed value to derive a fill value from; leave the column untouched and say so.
        print(f"WARNING: column {c!r} has no observed values in X_train; not imputed")
        continue

    if X_train[c].dtype == np.dtype('O'):  # Categorical column -> most frequent value
        mode_value = observed.value_counts().index[0]
        for frame in split_frames:
            frame[c] = frame[c].fillna(mode_value).astype(object)

    else:  # Numeric column -> median
        median_value = observed.median()
        for frame in split_frames:
            frame[c] = frame[c].fillna(median_value)


In [None]:
##now for one-hot encoding

# Identify categorical (object-dtype) columns from X_train_cal, the frame that is encoded below
categorical_column = [c for c in X_train_cal.columns if X_train_cal[c].dtype == np.dtype('O')]

# Apply pd.get_dummies to training data
X_train_cal = pd.get_dummies(X_train_cal, columns=categorical_column, sparse=False)

# Display the list of columns that were one-hot encoded (empty if none were object dtype)
categorical_column

In [None]:
# One-hot encode the test and validation sets with the same categorical columns as the training set
X_test = pd.get_dummies(X_test, columns=categorical_column, sparse=False)
X_val_cal = pd.get_dummies(X_val_cal, columns=categorical_column, sparse=False)

# Ensure same columns across all datasets.
# The reference must be X_train_cal (the encoded training frame): X_train was never one-hot
# encoded, so reindexing to its columns would discard every dummy column and re-create the
# raw categorical columns filled with 0. Dummy columns absent from a split are filled with 0.
X_test = X_test.reindex(columns=X_train_cal.columns, fill_value=0)
X_val_cal = X_val_cal.reindex(columns=X_train_cal.columns, fill_value=0)

In [None]:
#verify data appears as intended: first rows of the encoded training frame
X_train_cal.head()

In [None]:
##verify no missing data in any split dataset (prints the total null count for train, test, val in that order)
for split_frame in (X_train_cal, X_test, X_val_cal):
    print(split_frame.isnull().sum().sum())

In [None]:
##final list of training columns (still includes the ISS/TRISS comparison columns at this point)
X_train_cal.columns

In [None]:
#verify data is intended size: (rows, columns) of the test set
X_test.shape

In [None]:
##now with data cleaned, copy the comparison vars (test set only) into their own dataframes
##before they are dropped from the model inputs
new_to_drop = ['TRISS_Death', 'ISS_05']

X_ISS = X_test[['ISS_05']].rename(columns={'ISS_05': 'ISS'})

X_TRISS = X_test[['TRISS_Death']].rename(columns={'TRISS_Death': 'TRISS'})

In [None]:
##now drop those comparison vars from the data that will be fed to the model
X_train_cal = X_train_cal.drop(columns=new_to_drop)
X_test = X_test.drop(columns=new_to_drop)
X_val_cal = X_val_cal.drop(columns=new_to_drop)


##store independent (unscaled) copies of each split under the *_tensor names
X_train_tensor, Y_train_tensor = X_train_cal.copy(), Y_train_cal.copy()

X_val_tensor, Y_val_tensor = X_val_cal.copy(), Y_val_cal.copy()

X_test_tensor, Y_test_tensor = X_test.copy(), Y_test.copy()

In [None]:
##verify data appears as intended: first rows of the test features after the comparison columns were dropped
X_test.head()

In [None]:
##Next step is to normalize data

#learn the scaling parameters from the training set only
scaler = StandardScaler().fit(X_train_cal)

#apply the same fitted transform to the training set
X_train_s_cal = scaler.transform(X_train_cal)

#apply it to the test set, reporting the shape before and after
print("After train/test split, X_test shape:", X_test.shape)
X_test_s = scaler.transform(X_test)
print("After scaling, X_test_s shape:", X_test_s.shape)

#apply it to the validation set
X_val_s_cal = scaler.transform(X_val_cal)

In [None]:
##hyper-parameter search spaces, one dictionary per model

##KNN: odd neighbour counts from 1 to 801
n_list = np.arange(1, 803, 2)
param_grid_knc = dict(n_neighbors=n_list)

##random forest
param_grid_rf = dict(
    n_estimators=[100, 200, 400],       ## number of trees in the forest
    max_depth=[None, 10, 20, 30],       ## maximum depth of the trees
    min_samples_split=[2, 5, 10],       ## minimum samples required to split an internal node
    min_samples_leaf=[1, 2, 4],         ## minimum samples required at a leaf node
    max_features=['sqrt'],              ## number of features considered for the best split
)

##logistic regression
param_grid_lr = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],   ## inverse of regularization strength
    penalty=['l1', 'l2'],               ## regularization type
    solver=['liblinear', 'saga'],       ## optimization algorithm
    max_iter=[100, 200, 300],           ## maximum number of iterations
)

##XGBoost
param_grid_gb = dict(
    learning_rate=[0.01, 0.05, 0.1],    ## learning rate
    max_depth=[3, 5, 7],                ## maximum depth of the trees
    subsample=[0.6, 0.8, 1.0],          ## subsample ratio of the training instances
    colsample_bytree=[0.6, 0.8, 1.0],   ## subsample ratio of columns when constructing each tree
    n_estimators=[100, 150, 200],       ## number of trees
)

In [None]:
##tune the XGBoost hyperparameters with an exhaustive grid search
model_gb = xgb.XGBClassifier(random_state=0)  #unfitted base estimator

##cv=5 -> 5-fold cross-validation; candidates are scored by AUROC
gs_gb = GridSearchCV(
    model_gb,
    param_grid_gb,
    scoring='roc_auc',
    cv=5,
    verbose=2,
)

gs_gb.fit(X_train_s_cal, Y_train_cal)

In [None]:
##display best parameters found by the XGBoost grid search
gs_gb.best_params_

In [None]:
##test-set AUROC for the best XGBoost model
model_best_gb = gs_gb.best_estimator_
gb_test_proba = model_best_gb.predict_proba(X_test_s)
y_prob_gbo_mtp = gb_test_proba[:, 1]  #second column of predict_proba
auroc_gbo = roc_auc_score(Y_test, y_prob_gbo_mtp)
print(f"AUROC on the test set: {auroc_gbo}")

In [None]:
##now, optimize LR hyperparameters
##random_state is fixed because the grid's solvers ('liblinear', 'saga') shuffle the data,
##and the XGBoost and random forest models in this notebook are seeded the same way
model_lr=LogisticRegression(random_state=0) #create an empty model
##initialize gridsearch
gs_lr = GridSearchCV(estimator=model_lr,
                  param_grid=param_grid_lr,
                  scoring='roc_auc',
                  cv=5, verbose=2)
#set cv=5, then it will do 5-fold cross-validation

#actually perform hyperparameter optimization
gs_lr.fit(X_train_s_cal, Y_train_cal)

In [None]:
##display best parameters found by the logistic regression grid search
gs_lr.best_params_

In [None]:
##test-set AUROC for the best logistic regression model
model_best_lr = gs_lr.best_estimator_
lr_test_proba = model_best_lr.predict_proba(X_test_s)
y_prob_lr_mtp = lr_test_proba[:, 1]  #second column of predict_proba
auroc_lr = roc_auc_score(Y_test, y_prob_lr_mtp)
print(f"AUROC on the test set: {auroc_lr}")

In [None]:
##tune the random forest hyperparameters with an exhaustive grid search
model_rf = RandomForestClassifier(random_state=0)  #unfitted base estimator

##cv=5 -> 5-fold cross-validation; candidates are scored by AUROC
gs_rf = GridSearchCV(
    model_rf,
    param_grid_rf,
    scoring='roc_auc',
    cv=5,
    verbose=2,
)

#run the hyperparameter search
gs_rf.fit(X_train_s_cal, Y_train_cal)

In [None]:
##test-set AUROC for the best random forest model
model_best_rf = gs_rf.best_estimator_
rf_test_proba = model_best_rf.predict_proba(X_test_s)
y_prob_rf_mtp = rf_test_proba[:, 1]  #second column of predict_proba
auroc_rf = roc_auc_score(Y_test, y_prob_rf_mtp)
print(f"AUROC on the test set: {auroc_rf}")

In [None]:
##KNN with GS_CV to optimize hyperparameter
##(KNeighborsClassifier comes from the import cell at the top of the notebook; it is not re-imported here)
model_knno=KNeighborsClassifier() #create an empty model
##initialize gridsearch
gs_knno = GridSearchCV(estimator=model_knno,
                  param_grid=param_grid_knc,
                  scoring='roc_auc',
                  cv=5, verbose=2)
#set cv=5, then it will do 5-fold cross-validation

#actually perform hyperparameter optimization
gs_knno.fit(X_train_s_cal, Y_train_cal)

In [None]:
##display best parameters (n_neighbors) found by the KNN grid search
gs_knno.best_params_

In [None]:
##test-set AUROC for the best KNN model
model_best_knno = gs_knno.best_estimator_
knn_test_proba = model_best_knno.predict_proba(X_test_s)
y_prob_knno_mtp = knn_test_proba[:, 1]  #second column of predict_proba
auroc_knno = roc_auc_score(Y_test, y_prob_knno_mtp)
print(f"AUROC on the test set: {auroc_knno}")

In [None]:
##independent copies of the scaled features and the outcomes for use in the neural networks

X_clean_nn_test, Y_clean_nn_test = X_test_s.copy(), Y_test.copy()

X_clean_nn_train, Y_clean_nn_train = X_train_s_cal.copy(), Y_train_cal.copy()

X_clean_nn_cal, Y_clean_nn_cal = X_val_s_cal.copy(), Y_val_cal.copy()

In [None]:
##ensure data is in pandas dataframe / series form
##The feature frames are built from scaler output and therefore get a fresh 0..n-1 index, while
##the outcome frames still carry the shuffled row labels from the original data. Resetting the
##outcome index keeps X and y aligned by label as well as by position, so any index-aligned
##operation downstream cannot mispair rows.
X_train_df = pd.DataFrame(X_clean_nn_train)
Y_train_s = pd.Series(Y_clean_nn_train.squeeze()).reset_index(drop=True)

X_val_df = pd.DataFrame(X_clean_nn_cal)
Y_val_s = pd.Series(Y_clean_nn_cal.squeeze()).reset_index(drop=True)

X_test_df = pd.DataFrame(X_clean_nn_test)
Y_test_s = pd.Series(Y_clean_nn_test.squeeze()).reset_index(drop=True)

In [None]:
!pip install deeptables
##revert to sklearn 1.5 to resolve dependency issues
!pip install scikit-learn==1.5
import deeptables
print("dt version:", deeptables.__version__)
from deeptables.models.deeptable import DeepTable, ModelConfig
from deeptables.models.deepnets import DeepFM, WideDeep, DCN

In [None]:
#first neural network architecture: DeepFM
# `auto_discrete` decides whether continuous variables are discretized automatically.
conf = ModelConfig(nets=DeepFM, metrics=['AUC', 'accuracy'], auto_discrete=True)
dt = DeepTable(config=conf)

# fit on the training split, monitoring the validation split, then score the test split
validation_pair = (X_val_df, Y_val_s)
model, history = dt.fit(X_train_df, Y_train_s, epochs=100, validation_data=validation_pair)
score = dt.evaluate(X_test_df, Y_test_s)
preds = dt.predict(X_test_df)

In [None]:
# ROC curve for the most recently fitted DeepTable model (second column of predict_proba)
test_proba = dt.predict_proba(X_clean_nn_test)
y_pred_prob_ANN = test_proba[:, 1]
fpr_ANN, tpr_ANN, thresholds = roc_curve(Y_clean_nn_test, y_pred_prob_ANN)

# Area under the ROC curve (AUC)
roc_auc_ANN = auc(fpr_ANN, tpr_ANN)

# Plot the ROC curve against the diagonal reference line
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr_ANN, tpr_ANN, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_ANN:.3f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()

In [None]:
#second neural network architecture: WideDeep
# `auto_discrete` decides whether continuous variables are discretized automatically.
conf = ModelConfig(nets=WideDeep, metrics=['AUC', 'accuracy'], auto_discrete=True)
dt = DeepTable(config=conf)

# fit on the training split, monitoring the validation split, then score the test split
validation_pair = (X_val_df, Y_val_s)
model, history = dt.fit(X_train_df, Y_train_s, epochs=100, validation_data=validation_pair)
score = dt.evaluate(X_test_df, Y_test_s)
preds = dt.predict(X_test_df)

In [None]:
# ROC curve for the most recently fitted DeepTable model (second column of predict_proba)
test_proba = dt.predict_proba(X_clean_nn_test)
y_pred_prob_ANN = test_proba[:, 1]
fpr_ANN, tpr_ANN, thresholds = roc_curve(Y_clean_nn_test, y_pred_prob_ANN)

# Area under the ROC curve (AUC)
roc_auc_ANN = auc(fpr_ANN, tpr_ANN)

# Plot the ROC curve against the diagonal reference line
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr_ANN, tpr_ANN, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_ANN:.3f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()

In [None]:
#third neural network architecture: DCN
# `auto_discrete` decides whether continuous variables are discretized automatically.
conf = ModelConfig(nets=DCN, metrics=['AUC', 'accuracy'], auto_discrete=True)
dt = DeepTable(config=conf)

# fit on the training split, monitoring the validation split, then score the test split
validation_pair = (X_val_df, Y_val_s)
model, history = dt.fit(X_train_df, Y_train_s, epochs=100, validation_data=validation_pair)
score = dt.evaluate(X_test_df, Y_test_s)
preds = dt.predict(X_test_df)

In [None]:
# ROC curve for the most recently fitted DeepTable model (second column of predict_proba)
test_proba = dt.predict_proba(X_clean_nn_test)
y_pred_prob_ANN = test_proba[:, 1]
fpr_ANN, tpr_ANN, thresholds = roc_curve(Y_clean_nn_test, y_pred_prob_ANN)

# Area under the ROC curve (AUC)
roc_auc_ANN = auc(fpr_ANN, tpr_ANN)

# Plot the ROC curve against the diagonal reference line
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr_ANN, tpr_ANN, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_ANN:.3f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()