In [None]:
##install and import necessary modules
##this code was originally designed and run in google colab
##use outside of colab may require modification
##if using colab, you may need to restart your runtime after installing the modules,
##depending on the state of the environment at the time the code is run.

!pip install scikit-learn==1.5.2
!pip install tensorflow==2.12.1
!pip install xgboost==2.0.2
!pip install shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import shap
import seaborn as sn
import sys
import sklearn
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from IPython import display
from sklearn.metrics import roc_curve, auc, roc_auc_score

##notebook-wide display settings: never truncate DataFrame columns, use the seaborn whitegrid style
pd.set_option('display.max_columns', None)
sn.set(style='whitegrid')

##report the interpreter and library versions this run is using
print(f"Python version: {sys.version}")
print(f"scikit-learn version: {sklearn.__version__}")
print(f"XGBoost version: {xgb.__version__}")
print(f"shap version: {shap.__version__}")

In [None]:
##import your data
##mount google drive if using in colab. Replace <MOUNT_POINT> with the directory where
##you want to mount the drive (e.g., /content/drive); it is declared once and reused below.
mount_point = '<MOUNT_POINT>'
drive.mount(mount_point)

# Replace <YOUR_FILE_PATH> with the actual path inside your Google Drive (e.g., My Drive/FileNameHere).
# The CSV path is built as: mount point + '/' + path inside the drive + '.csv'
file_path = mount_point + '/<YOUR_FILE_PATH>.csv'

In [None]:
# Columns that must not be loaded (the TRISS* and RTS* columns)
columns_to_exclude = ['TRISS_Death', 'TRISS', 'TRISS_b_neg', 'TRISS_b', 'TRISS_AGE', 'RTS', 'RTS_GCS', 'RTS_SBP', 'RTS_RR']

# Read only the header row (nrows=0) to learn the column names without loading the data
all_columns = pd.read_csv(file_path, nrows=0).columns.tolist()

# Every column of the file that is not excluded, in file order
columns_to_include = list(filter(lambda col: col not in columns_to_exclude, all_columns))

# Cell contents that are read as missing values (NaN)
missing_value_markers = ['NA', 'N/A', 'NULL', ' ', '', '-99', '-98', '-99.0', '-99.00', '-98.0', '-98.00', 'NaN']

# Load the data, restricted to the included columns
data = pd.read_csv(file_path, usecols=columns_to_include, na_values=missing_value_markers)

In [None]:
# Filter out rows where 'TRAUMATYPE' is 26 (a type of missing), 'Other/unspecified', or 'Burn'.
# The filter stays best-effort: when the file has no 'TRAUMATYPE' column it is skipped,
# but the skip is now reported instead of being silently swallowed by a bare except.
# NOTE(review): '26' is matched as a string; if TRAUMATYPE is parsed as a numeric column
# the value 26 will not match -- confirm the column's dtype.
exclude_values = ['26', 'Other/unspecified', 'Burn']
if 'TRAUMATYPE' in data.columns:
    # .copy() makes the filtered frame independent of the original, so the column
    # assignments below do not write into a slice (avoids SettingWithCopyWarning)
    data = data[~data['TRAUMATYPE'].isin(exclude_values)].copy()
else:
    print("Column 'TRAUMATYPE' not found; no rows were filtered by trauma type.")

# Create ShockIndex with the required logic
data['ShockIndex'] = np.where(
    data['SBP'] == 0, 2.0,  # Case where SBP is 0 → set ShockIndex to 2.0
    data['PULSERATE'] / data['SBP']  # Normal calculation
)

# Set ShockIndex to NaN if PULSERATE or SBP is missing
# (this also overrides the 2.0 assigned above when SBP is 0 but PULSERATE is missing)
data.loc[data['PULSERATE'].isna() | data['SBP'].isna(), 'ShockIndex'] = np.nan

##reset indices of the df so they run 0..n-1 after the row filter
data = data.reset_index(drop=True)

In [None]:
##verify data appears as intended
data.head()

In [None]:
##check for missing values
data.isnull().sum(axis=0)

In [None]:
##create a dataframe of all other variables we want to remove from training the model.
##Some are available too late, others are essentially duplicates.
complications_list = [
    'EDDISCHARGEDISPOSITION',
    'HOSPDISCHARGEDISPOSITION',
    'EDDISCHARGEHRS',
    'WITHDRAWALLST',
    'VTEPROPHYLAXISTYPE',
    'TOTALICULOS',
    'TOTALVENTDAYS',
    'VTEPROPHYLAXISHRS',
    'VTEPROPHYLAXISDAYS', 'MORTALITY', 'EDDISCHARGEDAYS', 'FINALDISCHARGEDAYS', 'FINALDISCHARGEHRS', 'HMRRHGCTRLSURGDAYS', 'WITHDRAWALLSTHRS',
    'AMERICANINDIAN', 'ASIAN', 'BLACK', 'PACIFICISLANDER', 'RACEOTHER', 'WHITE', 'RACE_NA', 'RACE_UK',
    "IntracranialVascularInjury",
    "BrainStemInjury",
    "EDH",
    "SAH",
    "SDH",
    "SkullFx",
    "DAI",
    "NeckVascularInjury",
    "ThoracicVascularInjury",
    "AeroDigestiveInjury",
    "CardiacInjury",
    "LungInjury",
    "AbdominalVascular",
    "RibFx",
    "KidneyInjury",
    "StomachInjury",
    "SpleenInjury",
    "UroGenInternalInjury",
    "SCI",
    "SpineFx",
    "UEAmputation",
    "UEVascularInjury",
    "UELongBoneFx",
    "LEVascularInjury",
    "PelvicFx",
    "LEAmputation",
    "PancreasInjury",
    "LELongBoneFx",
    "LiverInjury",
    "ColorectalInjury",
    "SmallBowelInjury",
    "NumberOfInjuries"
]

##take all listed columns in one selection instead of growing an empty dataframe one
##column at a time; .copy() keeps complications_df independent of data.
##A KeyError here names any listed column that is absent from data.
complications_df = data[complications_list].copy()
complications_df

In [None]:
##this is where we choose our outcome variable, mortality, and give it its own dataframe

Y_data = pd.DataFrame()
Y_data['MORTALITY'] = data['MORTALITY']
Y_data

In [None]:
##clean Y_data by replacing "Yes" and "No" vcalues with 0's and 1's

Y_data['MORTALITY'] = Y_data['MORTALITY'].replace({'Yes': 1, 'No': 0})
Y_data

In [None]:
##the input space is every loaded column except the ones named in complications_list
X_data = data.drop(complications_list, axis=1)
X_data.shape

In [None]:
##ensure no missing outcome data
Missing_Y = Y_data.isnull().sum(axis=0)
Missing_Y

In [None]:
##If we have no missing values here, our data is clean
Y_clean=Y_data.copy()

In [None]:
##if above check passes, outcome data is now clean
Missing_Y_clean = Y_clean.isnull().sum(axis=0)
Missing_Y_clean

In [None]:
##check which variables in the input space have missing variables

Missing = X_data.isnull().sum(axis=0)
Missing[Missing>0]

In [None]:
##order variables with missing data by percentage

data_missing = (X_data.isnull().sum(axis=0)/X_data.shape[0]) * 100
data_missing

In [None]:
##display variables withOUT mising data

data_missing[data_missing == 0].index

In [None]:
#remove the good columns (no missing values) from data_missing

data_missing = data_missing.drop(data_missing[data_missing == 0].index)
data_missing

In [None]:
#sort this in ascending order
data_missing = data_missing.sort_values(ascending=False)
data_missing

In [None]:
##prepare to drop variables with >50% missing values

dropCutoff=50
bad_column_names = data_missing[data_missing >=dropCutoff].index
bad_column_names

In [None]:
##actually drop bad variables
X_data_new=X_data.drop(columns=bad_column_names, axis=1)

##check for which variables still have missing data (<50% missing values)
Missing = X_data_new.isnull().sum(axis=0)
Missing[Missing>0]

In [None]:
#display columns with less than 50% missing that need to be cleaned

to_be_cleaned_column_names = data_missing[data_missing <50].index
to_be_cleaned_column_names

In [None]:
# Show every column when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)

# Build one "name: dtype" entry per column of X_data_new, then join them one per line
columns_info = [f"{column_name}: {dtype}" for column_name, dtype in X_data_new.dtypes.items()]
formatted_columns_info = "\n".join(columns_info)

# Print the listing under a heading
print("Column Names and Data Types:")
print(formatted_columns_info)

In [None]:
##convert No's and Yes's to 0's and 1's to minimize the amount of double variables (want to avoid Yes/Nos being converted to 1-hot variables)

try:
    X_data_new = X_data_new.replace({True: 1, 'Yes': 1, "Female": 1, False: 0, 'No': 0, "Male": 0})
except Exception as err:
    ##still best-effort (X_data_new is left unchanged), but the failure is reported
    ##instead of being silently swallowed
    print(f"Yes/No recoding was skipped: {err!r}")

##drop any non blunt/penetrating mechanisms
##errors='ignore' drops whichever of these columns exist and does nothing for the others,
##so one absent column no longer prevents the other from being dropped
X_data_new = X_data_new.drop(columns=['TRAUMATYPE_26', 'TRAUMATYPE_Other/unspecified'], errors='ignore')

X_data_new.head()

In [None]:
##split into train, test, calibrate sets
##first split: hold out 20% of all rows as the test set, stratified on the outcome
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data_new,
    Y_clean,
    test_size=0.2,
    random_state=0,
    stratify=Y_clean,
)
##second split: hold out 20% of the training rows as the *_val_cal set, stratified on the outcome
X_train_cal, X_val_cal, Y_train_cal, Y_val_cal = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=0,
    stratify=Y_train,
)

In [None]:
##perform median/mode imputation on the input vars that are missing.
##The fill value of every column is computed from X_train and then applied to all four splits.
##NOTE(review): X_train contains the rows of X_val_cal, so the fill values are partly
##derived from the validation rows -- confirm this is intended.
for c in to_be_cleaned_column_names:
    if c not in X_train.columns:
        ##e.g. a column removed after the missingness profile was computed
        print(f"Skipping imputation of '{c}': column is not present in X_train.")
        continue

    v = X_train[c]
    v_valid = v.dropna()

    if v_valid.empty:
        ##no observed training value: there is no mode or median to fill with
        print(f"Skipping imputation of '{c}': no non-missing values in X_train.")
        continue

    if v.dtype == np.dtype('O'):  # Categorical column: fill with the most frequent value
        mode_value = v_valid.value_counts().index[0]
        for df in [X_train, X_test, X_train_cal, X_val_cal]:
            df[c] = df[c].fillna(mode_value).astype(object)

    else:  # Numeric column: fill with the median
        median_value = v_valid.median()
        for df in [X_train, X_test, X_train_cal, X_val_cal]:
            df[c] = df[c].fillna(median_value)

In [None]:
##now for one-hot encoding

# Categorical columns are the object-dtype columns of X_train_cal
object_dtype_mask = X_train_cal.dtypes == np.dtype('O')
categorical_column = X_train_cal.columns[object_dtype_mask.to_numpy()].tolist()

# One-hot encode those columns in X_train_cal
X_train_cal = pd.get_dummies(X_train_cal, columns=categorical_column, sparse=False)

categorical_column

In [None]:
# One-hot encode the remaining splits with the same categorical columns
X_test = pd.get_dummies(X_test, columns=categorical_column, sparse=False)
X_train = pd.get_dummies(X_train, columns=categorical_column, sparse=False)
X_val_cal = pd.get_dummies(X_val_cal, columns=categorical_column, sparse=False)

# Ensure same columns, in the same order, across all datasets.
# The reference is X_train_cal: it is the frame the scaler and the model are fitted on,
# and it can lack a dummy column that X_train has (a category present only in the
# X_val_cal rows). Aligning to X_train.columns would then give X_test / X_val_cal a
# different column set than the fitted scaler expects.
# Categories absent from X_train_cal are dropped; dummies absent from a split are filled with 0.
model_columns = X_train_cal.columns
X_test = X_test.reindex(columns=model_columns, fill_value=0)
X_train = X_train.reindex(columns=model_columns, fill_value=0)
X_val_cal = X_val_cal.reindex(columns=model_columns, fill_value=0)

In [None]:
#verify data appears as intended
X_train_cal.head()

In [None]:
##verify no missing data in any split dataset
print(X_train_cal.isnull().sum().sum())
print(X_test.isnull().sum().sum())
print(X_val_cal.isnull().sum().sum())

In [None]:
##final list of training columns
X_train_cal.columns

In [None]:
#verify data is intended size
X_test.shape

In [None]:
##store copies of data as tensors
X_train_tensor=X_train_cal.copy()
Y_train_tensor=Y_train_cal.copy()

X_val_tensor=X_val_cal.copy()
Y_val_tensor=Y_val_cal.copy()

X_test_tensor=X_test.copy()
Y_test_tensor=Y_test.copy()

In [None]:
##verify data appears as intended
X_test.head()

In [None]:
##Next step is to normalize (standardize) the data

#fit the scaler on X_train_cal only; the same fitted transform is applied to every split
scaler = StandardScaler().fit(X_train_cal)

#training features
X_train_s_cal = scaler.transform(X_train_cal)

#test features, with the shape reported before and after scaling
print("After train/test split, X_test shape:", X_test.shape)
X_test_s = scaler.transform(X_test)
print("After scaling, X_test_s shape:", X_test_s.shape)

#validation features
X_val_s_cal = scaler.transform(X_val_cal)

In [None]:
##fit the gradient-boosting model with hyperparameters based on the optimization done in another Jupyter notebook
model_best_gb = xgb.XGBClassifier(
    random_state=0,
    colsample_bytree=0.6,
    learning_rate=0.1,
    max_depth=7,
    n_estimators=200,
    subsample=1.0,
)
##train on the scaled X_train_s_cal features and the Y_train_cal outcome
model_best_gb.fit(X_train_s_cal, Y_train_cal)

In [None]:
# Evaluate the model on the test set.
# predict_proba returns one column per class; column 1 is kept as the score.

test_probabilities = model_best_gb.predict_proba(X_test_s)
y_prob_gbo_mtp = test_probabilities[:, 1]

# Area under the ROC curve of those scores against the test outcomes
auroc_gbo = roc_auc_score(Y_test, y_prob_gbo_mtp)
print(f"AUROC on the test set: {auroc_gbo}")

In [None]:
##now, use Shapley Additive Explanations for better assessment and visualization of feature importance

your_dataframe = X_train_tensor  # labelled copy of the training features; used only for its column names
model=model_best_gb

# Calculate SHAP values on the scaled training features (X_train_s_cal).
# NOTE(review): the result is named shap_values_test although the input is the training
# matrix, not X_test_s -- confirm which dataset is intended. The name is kept as is.
explainer = shap.TreeExplainer(model)
shap_values_test = explainer.shap_values(X_train_s_cal)

# Calculate mean absolute SHAP values (one value per feature)
mean_abs_shap = np.abs(shap_values_test).mean(axis=0)

# Sort feature indices based on mean absolute SHAP values (ascending)
sorted_indices = np.argsort(mean_abs_shap)

# Identify the top 20 most important features (fewer if the model has fewer features).
# The top_5_percent_* names are historical: the selection is a fixed count, not a percentage.
top_n = min(20, len(sorted_indices))
top_5_percent_indices = sorted_indices[-top_n:]

# Extract the SHAP values and names of those features
top_5_percent_shap_values = shap_values_test[:, top_5_percent_indices]
top_5_percent_feature_names = your_dataframe.columns[top_5_percent_indices]

# Create horizontal bar chart of the selected features; the title states the actual
# number of features shown instead of a hard-coded "Top 5%"
fig1, ax1 = plt.subplots(figsize=(12, 6))
bars = ax1.barh(top_5_percent_feature_names, mean_abs_shap[top_5_percent_indices], color='lightblue')
ax1.set_xlabel('Mean Absolute SHAP Value')
ax1.set_title(f'Top {top_n} Most Important Features - Mean Absolute SHAP Values')
plt.tight_layout()
plt.show()