In [None]:
##install and import necessary modules
##this code was originally designed and run in google colab
##use outside of colab may require modification
##if using colab, you may need to restart your runtime after installing modules,
##depending on the environment at the time the code is run.

!pip install scikit-learn==1.5.2
!pip install tensorflow==2.12.1
!pip install xgboost==2.0.2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sn
import sys
import sklearn
from google.colab import drive
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.utils import resample
from scipy.stats import mannwhitneyu
from IPython import display
from sklearn.metrics import roc_curve, auc, roc_auc_score, precision_recall_curve, recall_score, confusion_matrix, brier_score_loss, f1_score

# Global display defaults: seaborn white-grid theme and un-truncated DataFrame columns.
sn.set(style='whitegrid')
pd.set_option('display.max_columns', None)

# Record interpreter / library versions so results can be reproduced.
print("Python version:", sys.version)
print("scikit-learn version:", sklearn.__version__)
print("XGBoost version:", xgb.__version__)

# `shap` is never imported by this notebook, so `shap.__version__` raised NameError.
# Read the version from the installed-package metadata instead; this works without
# importing the package and degrades gracefully when it is not installed.
from importlib.metadata import PackageNotFoundError, version as _installed_version

try:
    print("shap version:", _installed_version("shap"))
except PackageNotFoundError:
    print("shap version: not installed")

In [None]:
# Import your dataset.
from google.colab import drive

# Mount Google Drive if running in Colab. Replace <MOUNT_POINT> with the directory where
# you want to mount the drive (e.g., /content/drive). The value is defined once so that
# the mount call and the file path below can never point at different locations.
MOUNT_POINT = '<MOUNT_POINT>'
drive.mount(MOUNT_POINT)

# Replace <YOUR_FILE_PATH> with the actual path inside your Google Drive (e.g., My Drive/FileNameHere).
file_path = f'{MOUNT_POINT}/<YOUR_FILE_PATH>.csv'

In [None]:
##specify columns to load from your dataset.  We will only load the columns necessary for the score and the variables necessary for the ISS/TRISS comparisons

# The list is assembled from role-based groups; concatenation order is the load order.
columns_to_load = (
    # vitals, demographics used by the score, reference scores, outcome and mechanism
    ['AGEYEARS', 'TOTALGCS', 'SBP', 'TEMPERATURE', 'PULSERATE']
    + ['TRISS', 'TRISS_Death', 'MORTALITY', 'TRAUMATYPE', 'WEIGHT']
    + ['ISS_05', 'NumberOfInjuries']
    # head / neck / chest injury indicators
    + ['IntracranialVascularInjury', 'BrainStemInjury', 'EDH', 'SAH', 'SDH',
       'SkullFx', 'DAI', 'NeckVascularInjury', 'ThoracicVascularInjury',
       'AeroDigestiveInjury']
    # torso / spine injury indicators
    + ['CardiacInjury', 'LungInjury', 'AbdominalVascular', 'RibFx', 'KidneyInjury',
       'StomachInjury', 'SpleenInjury', 'UroGenInternalInjury', 'SCI', 'SpineFx']
    # extremity / pelvis / remaining abdominal injury indicators
    + ['UEAmputation', 'UEVascularInjury', 'UELongBoneFx', 'LEVascularInjury',
       'PelvicFx', 'LEAmputation', 'PancreasInjury', 'LELongBoneFx', 'LiverInjury']
    + ['ColorectalInjury', 'SmallBowelInjury', 'IPH']
    # variables kept only for the fairness assessment
    + ['SEX', 'ETHNICITY', 'PRIMARYMETHODPAYMENT', 'RACE']
)

In [None]:
# Import data and specify missing values.
# Blank strings and the -98/-99 sentinel codes (in several spellings) are read as NaN.
data = pd.read_csv(file_path, na_values=['NA', 'N/A', 'NULL', ' ', '', '-99', '-98', '-99.0', '-99.00', '-98.0', '-98.00', 'NaN'], usecols=columns_to_load)


# Filter out rows where 'TRAUMATYPE' is 26, 'Other/unspecified', or 'Burn'.
# The previous bare `except: pass` silently hid any failure here; the only benign case
# (column not loaded) is now checked explicitly and every other error surfaces.
# Rows with a missing TRAUMATYPE are not excluded by this filter.
exclude_values = ['26', 'Other/unspecified', 'Burn']
if 'TRAUMATYPE' in data.columns:
    data = data[~data['TRAUMATYPE'].isin(exclude_values)]
else:
    print("Warning: 'TRAUMATYPE' was not loaded; mechanism-based exclusion skipped.")

##explicitly list variables that need to be present for inclusion and drop cases without these
##we cannot compare our score to ISS/TRISS without those metrics, and we need our target outcome mortality
required_vars = ['ISS_05', 'TRISS_Death', 'MORTALITY']
# .copy() makes the result an independent frame so the column assignments below
# never operate on a slice of the filtered data.
data = data.dropna(subset=required_vars).copy()

# Create ShockIndex = PULSERATE / SBP, with SBP == 0 mapped to the fixed value 2.0
# instead of a division by zero.
data['ShockIndex'] = np.where(
    data['SBP'] == 0, 2.0,  # Case where SBP is 0 → set ShockIndex to 2.0
    data['PULSERATE'] / data['SBP']  # Normal calculation
)

# Set ShockIndex to NaN if PULSERATE or SBP is missing (this also undoes the 2.0
# assignment for rows with SBP == 0 but no recorded pulse).
data.loc[data['PULSERATE'].isna() | data['SBP'].isna(), 'ShockIndex'] = np.nan

##reset indices of the df so positions and labels coincide after row filtering
data = data.reset_index(drop=True)

In [None]:
##verify data appears as intended
data.head()

In [None]:
##check for missing values
data.isnull().sum(axis=0)

In [None]:
##create a datafram of all complications/vars to remove later.  We can remove all of these from the X data set and pick one to be
#our Y dataset

complications_df=pd.DataFrame()
complications_list= [
                    'MORTALITY', 'TRISS', 'SEX', 'PRIMARYMETHODPAYMENT', 'RACE', 'ETHNICITY', 'ISS_05'
                    ]
for c in complications_list:
    complications_df[c] = data[c]
complications_df

In [None]:
##this is where we choose our outcome variable, mortality, and give it its own dataframe

Y_data = pd.DataFrame()
Y_data['MORTALITY'] = data['MORTALITY']
Y_data

In [None]:
##clean Y_data by replacing "Yes" and "No" vcalues with 0's and 1's

Y_data['MORTALITY'] = Y_data['MORTALITY'].replace({'Yes': 1, 'No': 0})
Y_data

In [None]:
##now drop the outcome from our feature space as well as TRISS, since were using 1-TRISS (aka TRISS_Death) and this varibale is now useless
##also drop these vars that will be used for fairness assessment but will not be used in the model
X_data = data.drop(columns=['MORTALITY', 'TRISS', 'ETHNICITY', 'PRIMARYMETHODPAYMENT', 'RACE', 'SEX'])
X_data.shape

In [None]:
##ensure no missing outcome data
Missing_Y = Y_data.isnull().sum(axis=0)
Missing_Y

In [None]:
##If we have no missing values here, our data is clean
Y_clean=Y_data.copy()

In [None]:
##if above check passes, outcome data is now clean
Missing_Y_clean = Y_clean.isnull().sum(axis=0)
Missing_Y_clean

In [None]:
##check which variables in the input space have missing variables

Missing = X_data.isnull().sum(axis=0)
Missing[Missing>0]

In [None]:
##order variables with missing data by percentage

data_missing = (X_data.isnull().sum(axis=0)/X_data.shape[0]) * 100
data_missing

In [None]:
##display variables withOUT mising data

data_missing[data_missing == 0].index

In [None]:
#remove the good columns (no missing values) from data_missing

data_missing = data_missing.drop(data_missing[data_missing == 0].index)
data_missing

In [None]:
#sort this in ascending order
data_missing = data_missing.sort_values(ascending=False)
data_missing

In [None]:
##prepare to drop variables with >50% missing values
##tried different cutoffs for this (33%, 66%), but 50% yielded best results

dropCutoff=50
bad_column_names = data_missing[data_missing >=dropCutoff].index
bad_column_names

In [None]:
##actually drop bad variables
X_data_new=X_data.drop(columns=bad_column_names, axis=1)

##check for which variables still have missing data (<50% missing values)
Missing = X_data_new.isnull().sum(axis=0)
Missing[Missing>0]

In [None]:
#display columns with less than 50% missing that need to be cleaned

to_be_cleaned_column_names = data_missing[data_missing <50].index
to_be_cleaned_column_names

In [None]:
# Rename the 'TRAUMATYPE' column to 'Penetrating' and map the values to 0 and 1
X_data_new['Penetrating'] = X_data_new['TRAUMATYPE'].map({'Penetrating': 1, 'Blunt': 0})

# Drop the old 'TRAUMATYPE' column
X_data_new.drop(columns=['TRAUMATYPE'], inplace=True)

print(X_data_new.head())

In [None]:
# Display the entire DataFrame without truncation
pd.set_option('display.max_columns', None)

# Get column names and data types
columns_info = []
for column_name, dtype in zip(X_data_new.columns, X_data_new.dtypes):
    columns_info.append(f"{column_name}: {dtype}")

formatted_columns_info = "\n".join(columns_info)

# Print column names and data types
print("Column Names and Data Types:")
print(formatted_columns_info)

In [None]:
##convert No's and Yes's to 0's and 1's to minimize the amount of double variables (want to avoid Yes/Nos being converted to 1-hot variables)

# Narrow, reported exception handling replaces the previous bare `except: pass`, which
# would have hidden any failure (including typos) and left string columns un-encoded
# without a trace.
try:
    X_data_new = X_data_new.replace({True: 1, 'Yes': 1, "Female": 1, False: 0, 'No': 0, "Male": 0})
except (TypeError, ValueError) as recode_error:
    print(f"Warning: Yes/No recoding was skipped: {recode_error}")

##drop any non blunt/penetrating mechanisms
# errors='ignore' drops whichever of these dummy columns exist and is a no-op otherwise,
# so no try/except is needed.
X_data_new = X_data_new.drop(columns=['TRAUMATYPE_26', 'TRAUMATYPE_Other/unspecified'], errors='ignore')

X_data_new.head()

In [None]:
##split into train, test, calibrate sets
X_train, X_test, Y_train, Y_test = train_test_split(X_data_new, Y_clean, test_size=0.2, random_state=0, stratify=Y_clean)
X_train_cal, X_val_cal, Y_train_cal, Y_val_cal = train_test_split(X_train, Y_train, test_size=0.2, random_state=0, stratify=Y_train)

In [None]:
##perform median/mode imputation on the inputs vars that are missing
for c in to_be_cleaned_column_names:
    v = X_train[c]
    v_valid = v[~v.isnull()]

    if v.dtype == np.dtype('O'):  # Categorical column
        mode_value = v_valid.value_counts().index[0]
        for df in [X_train, X_test, X_train_cal, X_val_cal]:
            df[c] = df[c].fillna(mode_value).astype(object)

    else:  # Numeric column
        median_value = v_valid.median()
        for df in [X_train, X_test, X_train_cal, X_val_cal]:
            df[c] = df[c].fillna(median_value)


In [None]:
##now for one-hot encoding

# Identify categorical columns from X_train only
categorical_column = [c for c in X_train_cal.columns if X_train_cal[c].dtype == np.dtype('O')]

# Apply pd.get_dummies to training data
X_train_cal = pd.get_dummies(X_train_cal, columns=categorical_column, sparse=False)

categorical_column

In [None]:
# Align test and validation sets to match training set columns
X_test = pd.get_dummies(X_test, columns=categorical_column, sparse=False)
X_val_cal = pd.get_dummies(X_val_cal, columns=categorical_column, sparse=False)

# Ensure same columns across all datasets
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_val_cal = X_val_cal.reindex(columns=X_train.columns, fill_value=0)

In [None]:
#verify data appears as intended
X_train_cal.head()

In [None]:
##verify no missing data in any split dataset
print(X_train_cal.isnull().sum().sum())
print(X_test.isnull().sum().sum())
print(X_val_cal.isnull().sum().sum())

In [None]:
##final list of training columns
X_train_cal.columns

In [None]:
#verify data is intended size
X_test.shape

In [None]:
##now with data cleaned, take comparison vars and move them to their own dataframe prior to dropping
new_to_drop = ['TRISS_Death', 'ISS_05']

X_ISS=pd.DataFrame()
X_ISS['ISS']=X_test['ISS_05']

X_TRISS=pd.DataFrame()
X_TRISS['TRISS']=X_test['TRISS_Death']

In [None]:
##now drop those comparison vars from the data that will be fed to the model
X_train_cal.drop(columns=new_to_drop, inplace=True)
X_test.drop(columns=new_to_drop, inplace=True)
X_val_cal.drop(columns=new_to_drop, inplace=True)


##store copies of data as tensors
X_train_tensor=X_train_cal.copy()
Y_train_tensor=Y_train_cal.copy()

X_val_tensor=X_val_cal.copy()
Y_val_tensor=Y_val_cal.copy()

X_test_tensor=X_test.copy()
Y_test_tensor=Y_test.copy()

In [None]:
##verify data appears as intended
X_test.head()

In [None]:
##Next step is to normalize data

# Learn the scaling parameters from the model-training split only
scaler = StandardScaler().fit(X_train_cal)

# Apply the same fitted transform to every split
X_train_s_cal = scaler.transform(X_train_cal)

print("After train/test split, X_test shape:", X_test.shape)
X_test_s = scaler.transform(X_test)
print("After scaling, X_test_s shape:", X_test_s.shape)

X_val_s_cal = scaler.transform(X_val_cal)

In [None]:
##now, fit model with hyperparameters based on other Jupyternotebook optimization
# The classifier is trained on the scaled model-training split only; the validation
# split is held back for calibration further below.
model_best_gb = xgb.XGBClassifier(random_state=0, colsample_bytree=0.6, learning_rate=0.1, max_depth=7, n_estimators=200, subsample=1.0)
model_best_gb.fit(X_train_s_cal, Y_train_cal)

In [None]:
# Get predicted probabilities for test set (evaluate the UNCALIBRATED model)
from sklearn.metrics import roc_curve, auc, precision_recall_curve, recall_score, confusion_matrix

# column 1 of predict_proba is the probability of the positive class (mortality)
y_prob_gbo_mtp = model_best_gb.predict_proba(X_test_s)[:, 1]

# Compute AUROC on test set
auroc_gbo = roc_auc_score(Y_test, y_prob_gbo_mtp)
print(f"AUROC on the test set: {auroc_gbo}")

In [None]:
# Calibrate the model on the validation set
# cv='prefit' keeps the already-trained booster untouched; only the isotonic mapping is
# learned here, from the scaled validation split.
calibrated_model = CalibratedClassifierCV(estimator=model_best_gb, method='isotonic', cv='prefit')
calibrated_model.fit(X_val_s_cal, Y_val_cal)

In [None]:
# Get predicted probabilities for test set (evaluate the CALIBRATED model)
# NOTE: y_prob_gbo_mtp and auroc_gbo are overwritten here; from this cell on they hold
# the calibrated values, not the uncalibrated ones computed above.
y_prob_gbo_mtp = calibrated_model.predict_proba(X_test_s)[:, 1]

# Compute AUROC on test set (calibrated)
auroc_gbo = roc_auc_score(Y_test, y_prob_gbo_mtp)
print(f"AUROC on the test set: {auroc_gbo}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression: Outcome ~ ISS
# Converts the raw ISS into a mortality probability so it can be scored like the other models.
# NOTE(review): the regression is fitted and evaluated on the same test rows (in-sample).
iss_feature = X_ISS.values.reshape(-1, 1)   # single feature as an (n, 1) matrix
lr_iss = LogisticRegression()
# Y_test is a one-column DataFrame; ravel it to the 1-D target scikit-learn expects
# instead of passing a column vector.
lr_iss.fit(iss_feature, np.ravel(Y_test))

# Predict probabilities (column 1 = positive class)
iss_probs = lr_iss.predict_proba(iss_feature)[:, 1]

In [None]:
# Define a function to help evaluate model performance for a specified subgroup
def evaluate_subgroup(mask, X_test_tensor, complications_df, calibrated_model, threshold=0.5):
    """
    Evaluate model performance for a specified subgroup (e.g., males or females).

    Parameters:
    - mask: boolean mask selecting the subgroup rows; it is applied both to
      X_test_tensor and to complications_df, so both must be row-aligned with it
    - X_test_tensor: full test feature matrix (pandas DataFrame or array); it is passed
      to calibrated_model.predict_proba as-is
    - complications_df: dataframe with a 'MORTALITY' column holding 0/1 labels
    - calibrated_model: fitted model exposing predict_proba
    - threshold: classification threshold for computing F1, confusion matrix, etc.

    Returns:
    - Dictionary of performance metrics (AUROC, Brier score, F1, accuracy,
      sensitivity, specificity, precision, NPV)
    """
    # Subset the test features and labels
    X_sub = X_test_tensor[mask]
    y_true = complications_df.loc[mask, 'MORTALITY'].values
    y_prob = calibrated_model.predict_proba(X_sub)[:, 1]

    # Primary (threshold-free) metrics
    auroc = roc_auc_score(y_true, y_prob)
    brier = brier_score_loss(y_true, y_prob)

    # Threshold-dependent metrics
    y_pred = (y_prob >= threshold).astype(int)
    f1 = f1_score(y_true, y_pred)

    # labels=[0, 1] forces a 2x2 matrix. Without it, a subgroup in which only one class
    # appears in y_true and y_pred yields a 1x1 matrix and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else 0
    # Each ratio falls back to 0 when its denominator is empty
    sensitivity = tp / (tp + fn) if (tp + fn) else 0
    specificity = tn / (tn + fp) if (tn + fp) else 0
    precision = tp / (tp + fp) if (tp + fp) else 0
    npv = tn / (tn + fn) if (tn + fn) else 0

    return {
        "AUROC": auroc,
        "Brier Score": brier,
        "F1 Score": f1,
        "Accuracy": accuracy,
        "Sensitivity (TPR)": sensitivity,
        "Specificity (TNR)": specificity,
        "Precision (PPV)": precision,
        "Negative Predictive Value (NPV)": npv
    }

In [None]:
# === Define Paired Bootstrap Function to compare models in subgroups ===
def paired_bootstrap_auc_test(
    y_true, predA, predB, n_boot=2000, alpha=0.025, random_state=None
):

    """
    Perform a paired bootstrap hypothesis test to compare AUROC between two models.

    The same resampled row indices are used for both models in every iteration, so the
    AUROC difference is computed on identical cases.

    Parameters:
    - y_true: array-like of true binary labels
    - predA: predicted probabilities from model A
    - predB: predicted probabilities from model B
    - n_boot: number of bootstrap iterations (default: 2000)
    - alpha: tail probability cut off on EACH side of the percentile intervals
      (default: 0.025, i.e. a 95% CI)
    - random_state: seed for reproducibility

    Returns:
    - Dictionary with the point AUROCs, their percentile CIs, the AUROC difference
      (A - B) with its CI and bootstrap mean, a two-sided bootstrap p-value, and
      'coverage', the nominal CI coverage in percent
    """

    # Ensure inputs are numpy arrays (positional indexing below must not be label-based)
    y_true = np.asarray(y_true)
    predA = np.asarray(predA)
    predB = np.asarray(predB)

    # Validate equal lengths of input arrays
    assert len(y_true) == len(predA) == len(predB), "Arrays must be the same length."
    n = len(y_true)

    # Compute baseline AUROC scores for both models
    aucA = roc_auc_score(y_true, predA)
    aucB = roc_auc_score(y_true, predB)
    baseline_diff = aucA - aucB

    # Initialize RNG and result containers
    rng = np.random.default_rng(random_state)
    aucAs = np.zeros(n_boot)
    aucBs = np.zeros(n_boot)
    diffs = np.zeros(n_boot)

    # Perform paired bootstrap resampling
    for i in range(n_boot):
        idx = rng.integers(0, n, size=n) # Sample with replacement
        try:
            # Compute AUROC for resampled subset
            aucAs[i] = roc_auc_score(y_true[idx], predA[idx])
            aucBs[i] = roc_auc_score(y_true[idx], predB[idx])
            diffs[i] = aucAs[i] - aucBs[i]
        except ValueError:
            # AUROC is undefined when the resample contains a single class; only that
            # error is tolerated (a bare except would also hide real bugs and Ctrl-C).
            aucAs[i] = aucBs[i] = diffs[i] = np.nan

    # Remove any invalid (NaN) results
    valid = ~np.isnan(diffs)
    aucAs = aucAs[valid]
    aucBs = aucBs[valid]
    diffs = diffs[valid]

    # Return summary statistics and bootstrap confidence intervals
    return {
        "aucA": aucA,
        "aucB": aucB,
        "aucA_ci_lower": np.percentile(aucAs, 100 * alpha),
        "aucA_ci_upper": np.percentile(aucAs, 100 * (1 - alpha)),
        "aucB_ci_lower": np.percentile(aucBs, 100 * alpha),
        "aucB_ci_upper": np.percentile(aucBs, 100 * (1 - alpha)),
        "baseline_diff": baseline_diff,
        "mean_diff": np.mean(diffs),
        "diff_ci_lower": np.percentile(diffs, 100 * alpha),
        "diff_ci_upper": np.percentile(diffs, 100 * (1 - alpha)),
        # two-sided p-value: twice the smaller tail of the bootstrap differences around 0
        "p_value": min(1.0, 2 * min(np.mean(diffs < 0), np.mean(diffs > 0))),
        # alpha is removed from BOTH tails, so the interval covers 1 - 2*alpha
        # (95% for alpha=0.025); the previous (1 - alpha) mislabeled it as 97.5%.
        "coverage": (1 - 2 * alpha) * 100
    }


In [None]:
##Now we're going to evaluate each subgroup against its counterpart

# Step 1: Prepare complications_test_df with mortality mapped to 0/1
# (test-set rows only; rows whose MORTALITY does not map to 0/1 are removed)
complications_test_df = (
    complications_df.loc[X_test.index]
    .assign(MORTALITY=lambda frame: frame['MORTALITY'].map({'No': 0, 'Yes': 1}))
    .dropna(subset=['MORTALITY'])
)
y_true_all = complications_test_df['MORTALITY'].astype(int).to_numpy()

# Step 2: Generate model predictions (already trained + scaled)
X_test_scaled = scaler.transform(X_test_tensor)
y_prob_model = calibrated_model.predict_proba(X_test_scaled)[:, 1]

# Step 3: Add predictions to dataframe (arrays are attached positionally)
complications_test_df = complications_test_df.assign(
    y_true=y_true_all,
    y_prob_model=y_prob_model,
    y_prob_iss=iss_probs.flatten(),
    y_prob_triss=X_TRISS.values.flatten(),
)

# Step 4: Evaluation function for any group
def evaluate_all_models(df, group_name, group_col='SEX'):
    """
    Score the ML model, ISS and TRISS within one subgroup.

    Parameters:
    - df: dataframe with columns 'y_true', 'y_prob_model', 'y_prob_iss',
      'y_prob_triss' and the grouping column
    - group_name: value of the grouping column that selects the subgroup
    - group_col: column to group on (default 'SEX', the previously hard-coded column,
      so existing two-argument calls behave exactly as before)

    Returns:
    - Dictionary with the group name, AUROC and Brier score of each of the three
      predictors, and the subgroup size with its positive / negative counts
    """
    group_df = df[df[group_col] == group_name]
    y_true = group_df['y_true']

    return {
        'Group': group_name,
        'AUROC_ML': roc_auc_score(y_true, group_df['y_prob_model']),
        'AUROC_ISS': roc_auc_score(y_true, group_df['y_prob_iss']),
        'AUROC_TRISS': roc_auc_score(y_true, group_df['y_prob_triss']),
        'Brier_ML': brier_score_loss(y_true, group_df['y_prob_model']),
        'Brier_ISS': brier_score_loss(y_true, group_df['y_prob_iss']),
        'Brier_TRISS': brier_score_loss(y_true, group_df['y_prob_triss']),
        'N': len(group_df),
        'Positives': int((y_true == 1).sum()),
        'Negatives': int((y_true == 0).sum())
    }

# Step 5: Run evaluation for Male and Female
results_male = evaluate_all_models(complications_test_df, 'Male')
results_female = evaluate_all_models(complications_test_df, 'Female')

# Step 6: Display results (one report per subgroup, each followed by a blank line)
for result in (results_male, results_female):
    report_lines = [
        f"=== {result['Group']} ===",
        f"AUROC (ML):    {result['AUROC_ML']:.3f}",
        f"AUROC (ISS):   {result['AUROC_ISS']:.3f}",
        f"AUROC (TRISS): {result['AUROC_TRISS']:.3f}",
        f"Brier (ML):    {result['Brier_ML']:.3f}",
        f"Brier (ISS):   {result['Brier_ISS']:.3f}",
        f"Brier (TRISS): {result['Brier_TRISS']:.3f}",
        f"N:             {result['N']}",
        f"Positives:     {result['Positives']}",
        f"Negatives:     {result['Negatives']}\n",
    ]
    print("\n".join(report_lines))

# Calculate AUROC delta between Male and Female for each method
delta_auroc_model, delta_auroc_iss, delta_auroc_triss = (
    results_male[metric] - results_female[metric]
    for metric in ('AUROC_ML', 'AUROC_ISS', 'AUROC_TRISS')
)

# Print deltas
print("=== AUROC Deltas (Male - Female) ===")
print(f"Model: {delta_auroc_model:.4f}")
print(f"ISS:   {delta_auroc_iss:.4f}")
print(f"TRISS: {delta_auroc_triss:.4f}")
##

In [None]:
##shorthand to subset the dataset by sex and check the size

female_df= complications_test_df[complications_test_df['SEX'] == 'Female']
male_df= complications_test_df[complications_test_df['SEX'] == 'Male']
female_df.shape

In [None]:
##evaluate AUROCs in female subgroup

##get predicted probs/true labels for each model
predicted_prob_iss_female=female_df['y_prob_iss']
predicted_prob_triss_female=female_df['y_prob_triss']
predicted_prob_gbo_female=female_df['y_prob_model']
true_label_female=female_df['y_true']

# Calculate the FPR, TPR, and thresholds
fpr_iss_female, tpr_iss_female, thresholds_iss_female = roc_curve(true_label_female, predicted_prob_iss_female)

##now TRISS
fpr_triss_female, tpr_triss_female, thresholds_triss_female = roc_curve(true_label_female, predicted_prob_triss_female)

##and MLISS
fpr_gbo_female, tpr_gbo_female, thresholds_gbo_female = roc_curve(true_label_female, predicted_prob_gbo_female)

# Calculate the area under the ROC curve (AUROC)
roc_auc_iss_female = auc(fpr_iss_female, tpr_iss_female)

##now TRISS
roc_auc_triss_female = auc(fpr_triss_female, tpr_triss_female)

##and MLISS
roc_auc_gbo_female = auc(fpr_gbo_female, tpr_gbo_female)

# Plot ROC curve
plt.figure(figsize=(8, 8))
plt.plot(fpr_gbo_female, tpr_gbo_female, color='b', lw=2, label=f'ROC curve (area = {roc_auc_gbo_female:.3f})')
plt.plot(fpr_triss_female, tpr_triss_female, color='green', lw=2, label=f'ROC AUC TRISS = {roc_auc_triss_female:.3f}')
plt.plot(fpr_iss_female, tpr_iss_female, color='darkorange', lw=2, label=f'ROC AUC ISS = {roc_auc_iss_female:.3f}')
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve-Female')
plt.legend(loc='lower right')
plt.show()

In [None]:
##now let's define a function to generate reliability diagrams for each model

def bootstrap_calibration_curve(y_true, y_prob, n_bins=10, n_boot=1000, random_state=None):
    """
    Bin-based calibration curve with bootstrap 95% confidence intervals.

    1) Compute the original calibration curve on n_bins equal-width bins over [0, 1].
    2) Bootstrap the dataset n_boot times, each time recomputing the fraction of
       positives per bin with the SAME bin edges.
    3) Return the original curve plus the 2.5th / 97.5th percentile per bin.

    Parameters:
    - y_true: array-like of 0/1 labels
    - y_prob: array-like of predicted probabilities
    - n_bins: number of equal-width bins
    - n_boot: number of bootstrap resamples
    - random_state: integer seed (0 is a valid seed); None uses NumPy's global RNG

    Returns:
    - (prob_pred, prob_true, lower_ci, upper_ci), each with one entry per bin that is
      non-empty in the original data. A bin that is empty in every resample gets NaN bounds.
    """
    # Work on plain arrays so the positional indexing below is never label-based
    # (a pandas Series with a non-default index would otherwise be mis-indexed).
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    n_data = len(y_true)

    if n_data == 0:
        # nothing to bin or resample
        empty = np.array([])
        return empty, empty.copy(), empty.copy(), empty.copy()

    # Fixed, equally spaced bin edges shared by the original data and every resample
    bin_edges = np.linspace(0, 1, n_bins + 1)

    def _assign_bins(probabilities):
        """Map each probability to its bin index; a value of exactly 1.0 goes to the last bin."""
        indices = np.digitize(probabilities, bin_edges) - 1
        indices[indices == n_bins] = n_bins - 1
        return indices

    def _mean_per_bin(values, indices):
        """Mean of `values` inside each bin; NaN for empty bins."""
        means = np.full(n_bins, np.nan)
        for i in range(n_bins):
            in_bin = (indices == i)
            if np.sum(in_bin) > 0:
                means[i] = np.mean(values[in_bin])
        return means

    # -----------------------------
    # Original calibration curve
    # -----------------------------
    bin_indices = _assign_bins(y_prob)
    prob_pred_orig = _mean_per_bin(y_prob, bin_indices)   # mean predicted prob per bin
    prob_true_orig = _mean_per_bin(y_true, bin_indices)   # observed fraction of positives

    # Keep only the bins that contain data
    valid_mask = ~np.isnan(prob_pred_orig)
    prob_pred_orig = prob_pred_orig[valid_mask]
    prob_true_orig = prob_true_orig[valid_mask]

    # -----------------------------
    # Bootstrap to get CIs
    # -----------------------------
    # `is None` rather than truthiness: random_state=0 must give a seeded generator.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    boot_prob_true = np.zeros((n_boot, int(valid_mask.sum())))

    for b in range(n_boot):
        # Sample rows with replacement
        sample_indices = rng.randint(0, n_data, size=n_data)
        y_true_b = y_true[sample_indices]
        y_prob_b = y_prob[sample_indices]

        prob_true_b = _mean_per_bin(y_true_b, _assign_bins(y_prob_b))
        boot_prob_true[b, :] = prob_true_b[valid_mask]

    # Percentiles per bin (column-wise); resamples in which a bin was empty are ignored
    lower_ci = np.nanpercentile(boot_prob_true, 2.5, axis=0)
    upper_ci = np.nanpercentile(boot_prob_true, 97.5, axis=0)

    return prob_pred_orig, prob_true_orig, lower_ci, upper_ci

# Pull the female-subgroup labels and the three predictors out as independent NumPy arrays
y_true = female_df['y_true'].to_numpy(copy=True)

y_prob_gbo, y_prob_triss, y_prob_iss = (
    female_df[score_column].to_numpy(copy=True)
    for score_column in ('y_prob_model', 'y_prob_triss', 'y_prob_iss')
)

# Helper function to get sorted calibration data
def get_bootstrap_calibration_data(y_true, y_prob, label, color, n_bins=10, n_boot=1000, random_state=42):
    """
    Run bootstrap_calibration_curve and package its output for plotting.

    All four returned arrays are reordered by increasing mean predicted probability.
    The result is a dict with keys "x" (mean predicted), "y" (observed fraction),
    "lower" / "upper" (CI bounds), plus the "label" and "color" passed in.
    """
    curve = bootstrap_calibration_curve(
        y_true, y_prob, n_bins=n_bins, n_boot=n_boot, random_state=random_state
    )
    # curve = (prob_pred, prob_true, lower_ci, upper_ci); order everything by prob_pred
    order = np.argsort(curve[0])
    x_sorted, y_sorted, lower_sorted, upper_sorted = (series[order] for series in curve)

    packaged = {"x": x_sorted, "y": y_sorted, "lower": lower_sorted, "upper": upper_sorted}
    packaged["label"] = label
    packaged["color"] = color
    return packaged

# Get calibration data for each method
calib_gbo = get_bootstrap_calibration_data(y_true, y_prob_gbo, label="ML Model", color='b')
calib_triss = get_bootstrap_calibration_data(y_true, y_prob_triss, label="TRISS", color='green')
calib_iss = get_bootstrap_calibration_data(y_true, y_prob_iss, label="ISS", color='darkorange')

# Compute Brier Scores
brier_gbo, brier_triss, brier_iss = (
    brier_score_loss(y_true, scores) for scores in (y_prob_gbo, y_prob_triss, y_prob_iss)
)

# Print Brier scores
print(
    f"Brier Score - ML Model: {brier_gbo:.4f}",
    f"Brier Score - TRISS:     {brier_triss:.4f}",
    f"Brier Score - ISS:       {brier_iss:.4f}",
    sep="\n",
)

# Plot the reliability diagram: diagonal reference first, then one curve + CI band per model
plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

for calib in (calib_gbo, calib_triss, calib_iss):
    curve_color = calib["color"]
    plt.plot(calib["x"], calib["y"], marker='o', label=f'{calib["label"]}', color=curve_color)
    plt.fill_between(calib["x"], calib["lower"], calib["upper"], color=curve_color, alpha=0.2)

plt.xlabel('Mean Predicted Probability')
plt.ylabel('Observed Mortality Rate')
plt.title('Reliability Diagram with 95% CI - Female')
plt.legend(loc='best')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.grid(True)
plt.show()

In [None]:
##let's define a function to generate a decision curve for each model

def net_benefit(y_true, y_prob, thresholds):
    """
    Net benefit of acting on predictions at each probability threshold.

    NetBenefit = (TP/N) - (FP/N)*(threshold/(1-threshold))

    A case is flagged positive when y_prob >= threshold. At threshold == 1.0 the odds
    term is undefined, so the net benefit is reported as 0. Returns a list with one
    value per threshold.
    """
    n_cases = len(y_true)
    curve = []
    for cutoff in thresholds:
        flagged = (y_prob >= cutoff).astype(int)
        true_pos = np.sum((y_true == 1) & (flagged == 1))
        false_pos = np.sum((y_true == 0) & (flagged == 1))
        value = 0 if cutoff == 1.0 else (true_pos / n_cases) - (false_pos / n_cases) * (cutoff / (1 - cutoff))
        curve.append(value)
    return curve

# Thresholds for decision curve
decision_thresholds = np.linspace(0.0, 1.0, 101)

# Make sure y_true and predicted probs are arrays (female subgroup only)
y_true_array_dc = np.array(female_df['y_true']).flatten()
y_prob_gbo_array_dc = np.array(female_df['y_prob_model'])
iss_probs_dc = np.array(female_df['y_prob_iss'])
X_TRISS_dc = female_df['y_prob_triss']

# Net benefit for your new model
NB_model = net_benefit(y_true_array_dc, y_prob_gbo_array_dc, decision_thresholds)

# Net benefit for ISS
NB_ISS = net_benefit(y_true_array_dc, iss_probs_dc, decision_thresholds)

# Net benefit for TRISS
NB_TRISS = net_benefit(y_true_array_dc, X_TRISS_dc.values.flatten(), decision_thresholds)

# Net benefit for treat all and treat none.
# The reference curve must use the prevalence of the SAME population as the model
# curves. The previous code took it from the whole test set (Y_test), which put a
# whole-cohort "Treat All" line on a female-only plot.
N = len(y_true_array_dc)
prevalence = np.mean(y_true_array_dc)  # fraction of positives among female patients
# Treating everyone: TP/N = prevalence, FP/N = 1 - prevalence; defined as 0 at t == 1.0
treat_all_nb = [
    0 if t == 1.0 else prevalence - (1 - prevalence) * (t / (1 - t))
    for t in decision_thresholds
]

treat_none_nb = np.zeros_like(decision_thresholds)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(decision_thresholds, NB_model, label='New ML Model', color='b')
plt.plot(decision_thresholds, NB_ISS, label='ISS', color='darkorange')
plt.plot(decision_thresholds, NB_TRISS, label='TRISS', color='g')
plt.plot(decision_thresholds, treat_all_nb, label='Treat All', color='red', linestyle='--')
plt.plot(decision_thresholds, treat_none_nb, label='Treat None', color='grey', linestyle=':')

plt.xlabel('Threshold Probability')
plt.ylabel('Net Benefit')
plt.title('Decision Curve -Female')
plt.legend(loc='best')
plt.ylim([-0.04, 0.04])  # zoomed to the low net-benefit range
plt.xlim([0, 1.0])
plt.grid(True)
plt.show()

In [None]:
##now do paired bootstrap testing to compare MLISS predictions to ISS/TRISS in females

# Comparator name -> predicted probabilities of that comparator in the female subgroup
pairs = {
    comparator: female_df[score_column]
    for comparator, score_column in (("ISS", 'y_prob_iss'), ("TRISS", 'y_prob_triss'))
}

# Compare the ML model against each comparator in turn
for var_name, var_array in pairs.items():
    results = paired_bootstrap_auc_test(
        y_true=female_df['y_true'],
        predA=female_df['y_prob_model'],
        predB=var_array,
        n_boot=2000,      # or more for higher precision
        alpha=(0.05/2),
        random_state=42
    )
    coverage_str = f"{results['coverage']:.1f}%"

    # One report per comparator, followed by a blank line
    report_lines = [
        f"--- ML Model vs. {var_name} --- in female patients",
        f"AUC(ML) = {results['aucA']:.3f}, {coverage_str} CI: "
        f"[{results['aucA_ci_lower']:.3f}, {results['aucA_ci_upper']:.3f}]",
        f"AUC({var_name}) = {results['aucB']:.3f}, {coverage_str} CI: "
        f"[{results['aucB_ci_lower']:.3f}, {results['aucB_ci_upper']:.3f}]",
        f"AUC diff (ML - {var_name}) = {results['baseline_diff']:.4f}, {coverage_str} CI: "
        f"[{results['diff_ci_lower']:.4f}, {results['diff_ci_upper']:.4f}]",
        f"p-value = {results['p_value']:.4f}\n",
    ]
    print("\n".join(report_lines))


In [None]:
##now do paired bootstrap testing to compare MLISS predictions to ISS/TRISS in males

# Create a dictionary mapping each comparator model (ISS, TRISS) to its predicted probabilities
pairs = {
    "ISS": male_df['y_prob_iss'],
    "TRISS": male_df['y_prob_triss']
}
# Loop through each comparator (ISS and TRISS)
for var_name, var_array in pairs.items():
    results = paired_bootstrap_auc_test(
        y_true=male_df['y_true'],
        predA=male_df['y_prob_model'],
        predB=var_array,
        n_boot=2000,      # or more for higher precision
        alpha=(0.05/2),
        random_state=42
    )
    coverage_str = f"{results['coverage']:.1f}%"
    # Print results comparing ML model to the current comparator (e.g., ISS or TRISS).
    # The header names the subgroup, matching the female report; without it the two
    # outputs cannot be told apart.
    print(f"--- ML Model vs. {var_name} --- in male patients")
    print(f"AUC(ML) = {results['aucA']:.3f}, {coverage_str} CI: "
          f"[{results['aucA_ci_lower']:.3f}, {results['aucA_ci_upper']:.3f}]")
    print(f"AUC({var_name}) = {results['aucB']:.3f}, {coverage_str} CI: "
          f"[{results['aucB_ci_lower']:.3f}, {results['aucB_ci_upper']:.3f}]")
    print(f"AUC diff (ML - {var_name}) = {results['baseline_diff']:.4f}, {coverage_str} CI: "
          f"[{results['diff_ci_lower']:.4f}, {results['diff_ci_upper']:.4f}]")
    print(f"p-value = {results['p_value']:.4f}\n")


In [None]:
##Now we're going to evaluate each subgroup against its counterpart

# Steps 1-2: test-set rows of complications_df with mortality mapped to 0/1;
# rows whose MORTALITY does not map are removed, then labels are taken as integers
complications_test_df = (
    complications_df.loc[X_test.index]
    .assign(MORTALITY=lambda frame: frame['MORTALITY'].map({'No': 0, 'Yes': 1}))
    .dropna(subset=['MORTALITY'])
)
y_true_all = complications_test_df['MORTALITY'].astype(int).to_numpy()

# Step 3: Scale the test data with the scaler fitted on the training split
X_test_scaled = scaler.transform(X_test_tensor)

# Step 4: Predict for entire test set once
y_prob_model = calibrated_model.predict_proba(X_test_scaled)[:, 1]

# Step 5: Attach probabilities and true labels (positionally) for later slicing
complications_test_df = complications_test_df.assign(
    y_true=y_true_all,
    y_prob_model=y_prob_model,
    y_prob_iss=iss_probs.flatten(),
    y_prob_triss=X_TRISS.values.flatten(),
)

# Step 6: Define evaluation function
def evaluate_group(df, race_group_name, is_black=True):
    """Score the ML model, ISS and TRISS on one race stratum of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'RACE', 'y_true', 'y_prob_model',
        'y_prob_iss' and 'y_prob_triss'.
    race_group_name : str
        Value of 'RACE' that defines the stratum.
    is_black : bool, default True
        True  -> evaluate rows where RACE == race_group_name.
        False -> evaluate the complement (RACE != race_group_name); the
                 result is labelled "Non-<race_group_name>".

    Returns
    -------
    dict
        'Group', AUROC and Brier score for each of the three methods, the
        stratum size 'N', and the counts of 'Positives' / 'Negatives'.

    Notes
    -----
    The metrics are computed directly on the stratum; an empty stratum or one
    with a single outcome class is not handled here.
    """
    if not is_black:
        group_label = f"Non-{race_group_name}"
        group_df = df[df['RACE'] != race_group_name]
    else:
        group_label = race_group_name
        group_df = df[df['RACE'] == race_group_name]

    outcomes = group_df['y_true']

    # (result key, prediction column) pairs, in the order they are reported
    auroc_targets = (
        ('AUROC_ML', 'y_prob_model'),
        ('AUROC_ISS', 'y_prob_iss'),
        ('AUROC_TRISS', 'y_prob_triss'),
    )
    brier_targets = (
        ('Brier_ML', 'y_prob_model'),
        ('Brier_ISS', 'y_prob_iss'),
        ('Brier_TRISS', 'y_prob_triss'),
    )

    summary = {'Group': group_label}
    for result_key, prob_column in auroc_targets:
        summary[result_key] = roc_auc_score(outcomes, group_df[prob_column])
    for result_key, prob_column in brier_targets:
        summary[result_key] = brier_score_loss(outcomes, group_df[prob_column])

    summary['N'] = len(group_df)
    summary['Positives'] = int((outcomes == 1).sum())
    summary['Negatives'] = int((outcomes == 0).sum())
    return summary

# Step 7: Score the Black stratum and its complement
results_black = evaluate_group(complications_test_df, 'Black', is_black=True)
results_nonblack = evaluate_group(complications_test_df, 'Black', is_black=False)


# Step 8: Print one aligned report per stratum
# (line prefix, result key) for every metric shown with three decimals
metric_rows = (
    ("AUROC (ML):    ", 'AUROC_ML'),
    ("AUROC (ISS):   ", 'AUROC_ISS'),
    ("AUROC (TRISS): ", 'AUROC_TRISS'),
    ("Brier (ML):    ", 'Brier_ML'),
    ("Brier (ISS):   ", 'Brier_ISS'),
    ("Brier (TRISS): ", 'Brier_TRISS'),
)
for result in (results_black, results_nonblack):
    report_lines = [f"=== {result['Group']} ==="]
    report_lines.extend(f"{prefix}{result[key]:.3f}" for prefix, key in metric_rows)
    report_lines.append(f"N:             {result['N']}")
    report_lines.append(f"Positives:     {result['Positives']}")
    report_lines.append(f"Negatives:     {result['Negatives']}\n")
    print("\n".join(report_lines))

# AUROC gap (Black minus non-Black) for each scoring method
delta_auroc_model, delta_auroc_iss, delta_auroc_triss = (
    results_black[key] - results_nonblack[key]
    for key in ('AUROC_ML', 'AUROC_ISS', 'AUROC_TRISS')
)

# Print deltas
print(
    "=== AUROC Deltas (Black - Nonblack) ===",
    f"Model: {delta_auroc_model:.4f}",
    f"ISS:   {delta_auroc_iss:.4f}",
    f"TRISS: {delta_auroc_triss:.4f}",
    sep="\n",
)

In [None]:
##shorthand to subset the dataset by race and check the size

# Split the test frame into the Black cohort and everyone else.
# Rows with a missing RACE compare unequal to 'Black', so they fall into nonblack_df.
black_df = complications_test_df.loc[complications_test_df['RACE'] == 'Black']
nonblack_df = complications_test_df.loc[complications_test_df['RACE'] != 'Black']
black_df.shape

In [None]:
##evaluate AUROCs in Black subgroup

# True labels and each method's predicted probabilities for the Black cohort
true_label_black = black_df['y_true']
predicted_prob_gbo_black = black_df['y_prob_model']
predicted_prob_triss_black = black_df['y_prob_triss']
predicted_prob_iss_black = black_df['y_prob_iss']

# ROC operating points (FPR, TPR, thresholds): ML model, TRISS, then ISS
fpr_gbo_black, tpr_gbo_black, thresholds_gbo_black = roc_curve(true_label_black, predicted_prob_gbo_black)
fpr_triss_black, tpr_triss_black, thresholds_triss_black = roc_curve(true_label_black, predicted_prob_triss_black)
fpr_iss_black, tpr_iss_black, thresholds_iss_black = roc_curve(true_label_black, predicted_prob_iss_black)

# Area under each ROC curve
roc_auc_gbo_black = auc(fpr_gbo_black, tpr_gbo_black)
roc_auc_triss_black = auc(fpr_triss_black, tpr_triss_black)
roc_auc_iss_black = auc(fpr_iss_black, tpr_iss_black)

# One (fpr, tpr, colour, legend text) entry per curve, in drawing order
roc_traces = (
    (fpr_gbo_black, tpr_gbo_black, 'b', f'ROC curve (area = {roc_auc_gbo_black:.3f})'),
    (fpr_triss_black, tpr_triss_black, 'green', f'ROC AUC TRISS = {roc_auc_triss_black:.3f}'),
    (fpr_iss_black, tpr_iss_black, 'darkorange', f'ROC AUC ISS = {roc_auc_iss_black:.3f}'),
)

# Plot ROC curves plus the chance diagonal
plt.figure(figsize=(8, 8))
for trace_fpr, trace_tpr, trace_color, trace_label in roc_traces:
    plt.plot(trace_fpr, trace_tpr, color=trace_color, lw=2, label=trace_label)
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve-Black')
plt.legend(loc='lower right')
plt.show()

In [None]:
##now reliability diagrams for all three methods in the Black subgroup

# Outcomes and predicted probabilities as plain arrays
y_true = np.array(black_df['y_true'])
y_prob_gbo = np.array(black_df['y_prob_model'])
y_prob_triss = np.array(black_df['y_prob_triss'])
y_prob_iss = np.array(black_df['y_prob_iss'])

# Bootstrapped calibration curve data per method (call order kept: ML, TRISS, ISS)
calib_gbo = get_bootstrap_calibration_data(y_true, y_prob_gbo, label="ML Model", color='b')
calib_triss = get_bootstrap_calibration_data(y_true, y_prob_triss, label="TRISS", color='green')
calib_iss = get_bootstrap_calibration_data(y_true, y_prob_iss, label="ISS", color='darkorange')

# Brier score per method
brier_gbo = brier_score_loss(y_true, y_prob_gbo)
brier_triss = brier_score_loss(y_true, y_prob_triss)
brier_iss = brier_score_loss(y_true, y_prob_iss)

# Report the Brier scores, one per line
print(
    f"Brier Score - ML Model: {brier_gbo:.4f}",
    f"Brier Score - TRISS:     {brier_triss:.4f}",
    f"Brier Score - ISS:       {brier_iss:.4f}",
    sep="\n",
)

# Reliability diagram: identity line first, then each curve with its shaded band
plt.figure(figsize=(8,6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

for calib in (calib_gbo, calib_triss, calib_iss):
    curve_color = calib["color"]
    plt.plot(calib["x"], calib["y"], marker='o', label=calib["label"], color=curve_color)
    plt.fill_between(calib["x"], calib["lower"], calib["upper"], color=curve_color, alpha=0.2)

plt.xlabel('Mean Predicted Probability')
plt.ylabel('Observed Mortality Rate')
plt.title('Reliability Diagram with 95% CI - Black Patients')
plt.legend(loc='best')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.grid(True)
plt.show()

In [None]:
##now decision curve analysis in the Black cohort
# ========================================


# Thresholds for decision curve
decision_thresholds = np.linspace(0.0, 1.0, 101)


# Make sure y_true and predicted probs are arrays
y_true_array_dc = np.array(black_df['y_true']).flatten()
y_prob_gbo_array_dc = np.array(black_df['y_prob_model'])
iss_probs_dc = np.array(black_df['y_prob_iss'])
X_TRISS_dc = black_df['y_prob_triss']

# Net benefit for your new model
NB_model = net_benefit(y_true_array_dc, y_prob_gbo_array_dc, decision_thresholds)

# Net benefit for ISS (subgroup ISS-derived probabilities)
NB_ISS = net_benefit(y_true_array_dc, iss_probs_dc, decision_thresholds)

# Net benefit for TRISS (subgroup TRISS probabilities)
NB_TRISS = net_benefit(y_true_array_dc, X_TRISS_dc.values.flatten(), decision_thresholds)

# Net benefit for treat all and treat none.
# The "treat all" reference has to use the event rate of the cohort being
# plotted. Using the whole test set (Y_test) here would draw a reference line
# for a different population than the three model curves above.
N = len(Y_test)  # full test-set size, kept as before; not used in this cell
prevalence = np.mean(y_true_array_dc)  # fraction of positives in the Black cohort
# At t == 1.0 the odds t/(1-t) are undefined, so the value is pinned to 0.
treat_all_nb = [
    0 if t == 1.0 else prevalence - (1 - prevalence)*(t/(1-t))
    for t in decision_thresholds
]

treat_none_nb = np.zeros_like(decision_thresholds)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(decision_thresholds, NB_model, label='New ML Model', color='b')
plt.plot(decision_thresholds, NB_ISS, label='ISS', color='darkorange')
plt.plot(decision_thresholds, NB_TRISS, label='TRISS', color='g')
plt.plot(decision_thresholds, treat_all_nb, label='Treat All', color='red', linestyle='--')
plt.plot(decision_thresholds, treat_none_nb, label='Treat None', color='grey', linestyle=':')

plt.xlabel('Threshold Probability')
plt.ylabel('Net Benefit')
plt.title('Decision Curve Analysis-Black')
plt.legend(loc='best')
plt.ylim([-0.04, 0.04])
plt.xlim([0, 1.0])
plt.grid(True)
plt.show()

In [None]:
##now do paired bootstrap testing to compare MLISS predictions to ISS/TRISS in Black patients

# Comparator name -> that comparator's predicted probabilities in the Black cohort
pairs = {"ISS": black_df['y_prob_iss'], "TRISS": black_df['y_prob_triss']}

# Compare the ML model against each comparator in turn
for var_name, var_array in pairs.items():
    # alpha is 0.05 split over the two comparators
    # (presumably a Bonferroni correction -- confirm)
    results = paired_bootstrap_auc_test(
        y_true=black_df['y_true'],
        predA=black_df['y_prob_model'],
        predB=var_array,
        n_boot=2000,      # or more for higher precision
        alpha=(0.05/2),
        random_state=42
    )
    coverage_str = f"{results['coverage']:.1f}%"

    # Assemble the whole report, then emit it in one go
    report_lines = [
        f"--- ML Model vs. {var_name} --- in black patients",
        f"AUC(ML) = {results['aucA']:.3f}, {coverage_str} CI: "
        f"[{results['aucA_ci_lower']:.3f}, {results['aucA_ci_upper']:.3f}]",
        f"AUC({var_name}) = {results['aucB']:.3f}, {coverage_str} CI: "
        f"[{results['aucB_ci_lower']:.3f}, {results['aucB_ci_upper']:.3f}]",
        f"AUC diff (ML - {var_name}) = {results['baseline_diff']:.4f}, {coverage_str} CI: "
        f"[{results['diff_ci_lower']:.4f}, {results['diff_ci_upper']:.4f}]",
        f"p-value = {results['p_value']:.4f}\n",
    ]
    print("\n".join(report_lines))

In [None]:
##Now we're going to evaluate each subgroup against its counterpart

# Step 1: Prepare complications_test_df with mortality mapped to 0/1
complications_test_df = complications_df.loc[X_test.index].copy()
complications_test_df['MORTALITY'] = complications_test_df['MORTALITY'].map({'No': 0, 'Yes': 1})
# Remember which test rows survive the dropna below: the prediction arrays in
# Step 3 have one entry per test row and must be filtered with the same mask,
# otherwise attaching them fails with a length mismatch once any row is dropped.
keep_rows = complications_test_df['MORTALITY'].notna().values
complications_test_df = complications_test_df.dropna(subset=['MORTALITY'])
y_true_all = complications_test_df['MORTALITY'].astype(int).values

# Step 2: Generate model predictions (already trained + scaled)
X_test_scaled = scaler.transform(X_test_tensor)
y_prob_model = calibrated_model.predict_proba(X_test_scaled)[:, 1]

# Step 3: Add predictions to dataframe
# NOTE(review): assumes y_prob_model, iss_probs and X_TRISS are ordered like the
# rows of X_test -- confirm against the cells that build them.
complications_test_df['y_true'] = y_true_all
complications_test_df['y_prob_model'] = y_prob_model[keep_rows]
complications_test_df['y_prob_iss'] = iss_probs.flatten()[keep_rows]
complications_test_df['y_prob_triss'] = X_TRISS.values.flatten()[keep_rows]

# Step 4: Evaluation function for any group
def evaluate_all_models(df, group_name):
    """Score the ML model, ISS and TRISS on one ethnicity stratum of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'ETHNICITY', 'y_true', 'y_prob_model',
        'y_prob_iss' and 'y_prob_triss'.
    group_name : str
        Rows whose 'ETHNICITY' equals this value form the stratum; it is also
        used as the 'Group' label in the result.

    Returns
    -------
    dict
        'Group', AUROC and Brier score for each of the three methods, the
        stratum size 'N', and the counts of 'Positives' / 'Negatives'.

    Notes
    -----
    The metrics are computed directly on the stratum; an empty stratum or one
    with a single outcome class is not handled here.
    """
    group_df = df[df['ETHNICITY'] == group_name]
    outcomes = group_df['y_true']

    # (result key, prediction column) pairs, in the order they are reported
    auroc_targets = (
        ('AUROC_ML', 'y_prob_model'),
        ('AUROC_ISS', 'y_prob_iss'),
        ('AUROC_TRISS', 'y_prob_triss'),
    )
    brier_targets = (
        ('Brier_ML', 'y_prob_model'),
        ('Brier_ISS', 'y_prob_iss'),
        ('Brier_TRISS', 'y_prob_triss'),
    )

    summary = {'Group': group_name}
    for result_key, prob_column in auroc_targets:
        summary[result_key] = roc_auc_score(outcomes, group_df[prob_column])
    for result_key, prob_column in brier_targets:
        summary[result_key] = brier_score_loss(outcomes, group_df[prob_column])

    summary['N'] = len(group_df)
    summary['Positives'] = int((outcomes == 1).sum())
    summary['Negatives'] = int((outcomes == 0).sum())
    return summary

# Step 5: Score the Hispanic stratum and the explicitly non-Hispanic stratum
# (both are exact matches on ETHNICITY, so any other value is in neither group)
results_hisp = evaluate_all_models(complications_test_df, 'Hispanic or Latino')
results_nothisp = evaluate_all_models(complications_test_df, 'Not Hispanic or Latino')

# Step 6: Print one aligned report per stratum
# (line prefix, result key) for every metric shown with three decimals
metric_rows = (
    ("AUROC (ML):    ", 'AUROC_ML'),
    ("AUROC (ISS):   ", 'AUROC_ISS'),
    ("AUROC (TRISS): ", 'AUROC_TRISS'),
    ("Brier (ML):    ", 'Brier_ML'),
    ("Brier (ISS):   ", 'Brier_ISS'),
    ("Brier (TRISS): ", 'Brier_TRISS'),
)
for result in (results_hisp, results_nothisp):
    report_lines = [f"=== {result['Group']} ==="]
    report_lines.extend(f"{prefix}{result[key]:.3f}" for prefix, key in metric_rows)
    report_lines.append(f"N:             {result['N']}")
    report_lines.append(f"Positives:     {result['Positives']}")
    report_lines.append(f"Negatives:     {result['Negatives']}\n")
    print("\n".join(report_lines))

# AUROC gap (Hispanic minus not Hispanic) for each scoring method
delta_auroc_model, delta_auroc_iss, delta_auroc_triss = (
    results_hisp[key] - results_nothisp[key]
    for key in ('AUROC_ML', 'AUROC_ISS', 'AUROC_TRISS')
)

# Print deltas
print(
    "=== AUROC Deltas (Hisp - Not hisp) ===",
    f"Model: {delta_auroc_model:.4f}",
    f"ISS:   {delta_auroc_iss:.4f}",
    f"TRISS: {delta_auroc_triss:.4f}",
    sep="\n",
)


In [None]:
##shorthand to subset the dataset by ethnicity and check the size
# nonhisp_df is the complement of the Hispanic cohort, so it also contains every
# row whose ETHNICITY is anything other than 'Hispanic or Latino' (including missing).
hisp_df = complications_test_df.loc[complications_test_df['ETHNICITY'] == 'Hispanic or Latino']
nonhisp_df = complications_test_df.loc[complications_test_df['ETHNICITY'] != 'Hispanic or Latino']
hisp_df.shape

In [None]:
##evaluate AUROCs in Hispanic subgroup

# True labels and each method's predicted probabilities for the Hispanic cohort
true_label_hispanic = hisp_df['y_true']
predicted_prob_gbo_hispanic = hisp_df['y_prob_model']
predicted_prob_triss_hispanic = hisp_df['y_prob_triss']
predicted_prob_iss_hispanic = hisp_df['y_prob_iss']

# ROC operating points (FPR, TPR, thresholds): ML model, TRISS, then ISS
fpr_gbo_hispanic, tpr_gbo_hispanic, thresholds_gbo_hispanic = roc_curve(true_label_hispanic, predicted_prob_gbo_hispanic)
fpr_triss_hispanic, tpr_triss_hispanic, thresholds_triss_hispanic = roc_curve(true_label_hispanic, predicted_prob_triss_hispanic)
fpr_iss_hispanic, tpr_iss_hispanic, thresholds_iss_hispanic = roc_curve(true_label_hispanic, predicted_prob_iss_hispanic)

# Area under each ROC curve
roc_auc_gbo_hispanic = auc(fpr_gbo_hispanic, tpr_gbo_hispanic)
roc_auc_triss_hispanic = auc(fpr_triss_hispanic, tpr_triss_hispanic)
roc_auc_iss_hispanic = auc(fpr_iss_hispanic, tpr_iss_hispanic)

# One (fpr, tpr, colour, legend text) entry per curve, in drawing order
roc_traces = (
    (fpr_gbo_hispanic, tpr_gbo_hispanic, 'b', f'ROC curve (area = {roc_auc_gbo_hispanic:.3f})'),
    (fpr_triss_hispanic, tpr_triss_hispanic, 'green', f'ROC AUC TRISS = {roc_auc_triss_hispanic:.3f}'),
    (fpr_iss_hispanic, tpr_iss_hispanic, 'darkorange', f'ROC AUC ISS = {roc_auc_iss_hispanic:.3f}'),
)

# Plot ROC curves plus the chance diagonal
plt.figure(figsize=(8, 8))
for trace_fpr, trace_tpr, trace_color, trace_label in roc_traces:
    plt.plot(trace_fpr, trace_tpr, color=trace_color, lw=2, label=trace_label)
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve-Hispanic')
plt.legend(loc='lower right')
plt.show()

In [None]:
##now reliability diagrams for all three methods in the Hispanic subgroup

# Outcomes and predicted probabilities as plain arrays
y_true = np.array(hisp_df['y_true'])
y_prob_gbo = np.array(hisp_df['y_prob_model'])
y_prob_triss = np.array(hisp_df['y_prob_triss'])
y_prob_iss = np.array(hisp_df['y_prob_iss'])

# Bootstrapped calibration curve data per method (call order kept: ML, TRISS, ISS)
calib_gbo = get_bootstrap_calibration_data(y_true, y_prob_gbo, label="ML Model", color='b')
calib_triss = get_bootstrap_calibration_data(y_true, y_prob_triss, label="TRISS", color='green')
calib_iss = get_bootstrap_calibration_data(y_true, y_prob_iss, label="ISS", color='darkorange')

# Brier score per method
brier_gbo = brier_score_loss(y_true, y_prob_gbo)
brier_triss = brier_score_loss(y_true, y_prob_triss)
brier_iss = brier_score_loss(y_true, y_prob_iss)

# Report the Brier scores, one per line
print(
    f"Brier Score - ML Model: {brier_gbo:.4f}",
    f"Brier Score - TRISS:     {brier_triss:.4f}",
    f"Brier Score - ISS:       {brier_iss:.4f}",
    sep="\n",
)

# Reliability diagram: identity line first, then each curve with its shaded band
plt.figure(figsize=(8,6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

for calib in (calib_gbo, calib_triss, calib_iss):
    curve_color = calib["color"]
    plt.plot(calib["x"], calib["y"], marker='o', label=calib["label"], color=curve_color)
    plt.fill_between(calib["x"], calib["lower"], calib["upper"], color=curve_color, alpha=0.2)

plt.xlabel('Mean Predicted Probability')
plt.ylabel('Observed Mortality Rate')
plt.title('Reliability Diagram with 95% CI - Hispanic Patients')
plt.legend(loc='best')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.grid(True)
plt.show()

In [None]:
##now decision curve analysis in the Hispanic cohort
# ========================================


# Thresholds for decision curve
decision_thresholds = np.linspace(0.0, 1.0, 101)



# Make sure y_true and predicted probs are arrays
y_true_array_dc = np.array(hisp_df['y_true']).flatten()
y_prob_gbo_array_dc = np.array(hisp_df['y_prob_model'])
iss_probs_dc = np.array(hisp_df['y_prob_iss'])
X_TRISS_dc = hisp_df['y_prob_triss']

# Net benefit for your new model
NB_model = net_benefit(y_true_array_dc, y_prob_gbo_array_dc, decision_thresholds)

# Net benefit for ISS (subgroup ISS-derived probabilities)
NB_ISS = net_benefit(y_true_array_dc, iss_probs_dc, decision_thresholds)

# Net benefit for TRISS (subgroup TRISS probabilities)
NB_TRISS = net_benefit(y_true_array_dc, X_TRISS_dc.values.flatten(), decision_thresholds)

# Net benefit for treat all and treat none.
# The "treat all" reference has to use the event rate of the cohort being
# plotted. Using the whole test set (Y_test) here would draw a reference line
# for a different population than the three model curves above.
N = len(Y_test)  # full test-set size, kept as before; not used in this cell
prevalence = np.mean(y_true_array_dc)  # fraction of positives in the Hispanic cohort
# At t == 1.0 the odds t/(1-t) are undefined, so the value is pinned to 0.
treat_all_nb = [
    0 if t == 1.0 else prevalence - (1 - prevalence)*(t/(1-t))
    for t in decision_thresholds
]

treat_none_nb = np.zeros_like(decision_thresholds)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(decision_thresholds, NB_model, label='New ML Model', color='b')
plt.plot(decision_thresholds, NB_ISS, label='ISS', color='darkorange')
plt.plot(decision_thresholds, NB_TRISS, label='TRISS', color='g')
plt.plot(decision_thresholds, treat_all_nb, label='Treat All', color='red', linestyle='--')
plt.plot(decision_thresholds, treat_none_nb, label='Treat None', color='grey', linestyle=':')

plt.xlabel('Threshold Probability')
plt.ylabel('Net Benefit')
plt.title('Decision Curve Analysis-Hispanic')
plt.legend(loc='best')
plt.ylim([-0.04, 0.04])
plt.xlim([0, 1.0])
plt.grid(True)
plt.show()

In [None]:
##now do paired bootstrap testing to compare MLISS predictions to ISS/TRISS in Hispanic patients

# Comparator name -> that comparator's predicted probabilities in the Hispanic cohort
pairs = {"ISS": hisp_df['y_prob_iss'], "TRISS": hisp_df['y_prob_triss']}

# Compare the ML model against each comparator in turn
for var_name, var_array in pairs.items():
    # alpha is 0.05 split over the two comparators
    # (presumably a Bonferroni correction -- confirm)
    results = paired_bootstrap_auc_test(
        y_true=hisp_df['y_true'],
        predA=hisp_df['y_prob_model'],
        predB=var_array,
        n_boot=2000,      # or more for higher precision
        alpha=(0.05/2),
        random_state=42
    )
    coverage_str = f"{results['coverage']:.1f}%"

    # Assemble the whole report, then emit it in one go
    report_lines = [
        f"--- ML Model vs. {var_name} --- in Hispanic patients",
        f"AUC(ML) = {results['aucA']:.3f}, {coverage_str} CI: "
        f"[{results['aucA_ci_lower']:.3f}, {results['aucA_ci_upper']:.3f}]",
        f"AUC({var_name}) = {results['aucB']:.3f}, {coverage_str} CI: "
        f"[{results['aucB_ci_lower']:.3f}, {results['aucB_ci_upper']:.3f}]",
        f"AUC diff (ML - {var_name}) = {results['baseline_diff']:.4f}, {coverage_str} CI: "
        f"[{results['diff_ci_lower']:.4f}, {results['diff_ci_upper']:.4f}]",
        f"p-value = {results['p_value']:.4f}\n",
    ]
    print("\n".join(report_lines))

In [None]:
##Now we're going to evaluate each subgroup against its counterpart

# Step 1: Prepare complications_test_df with mortality mapped to 0/1
complications_test_df = complications_df.loc[X_test.index].copy()
complications_test_df['MORTALITY'] = complications_test_df['MORTALITY'].map({'No': 0, 'Yes': 1})

# Step 2: Drop rows without a usable outcome, remembering WHICH rows were kept.
# The prediction arrays attached in Step 5 have one entry per test row, so they
# must be filtered with the same mask; without it the column assignments fail
# with a length mismatch as soon as dropna() removes a single row.
keep_rows = complications_test_df['MORTALITY'].notna().values
complications_test_df = complications_test_df.dropna(subset=['MORTALITY'])
y_true_all = complications_test_df['MORTALITY'].astype(int).values

# Step 3: Scale the test data if not already done
X_test_scaled = scaler.transform(X_test_tensor)  # Make sure you use the same scaler from training

# Step 4: Predict for entire test set once
y_prob_model = calibrated_model.predict_proba(X_test_scaled)[:, 1]

# Step 5: Attach probabilities and true labels to complications_test_df for slicing
# NOTE(review): assumes y_prob_model, iss_probs and X_TRISS are ordered like the
# rows of X_test -- confirm against the cells that build them.
complications_test_df['y_true'] = y_true_all
complications_test_df['y_prob_model'] = y_prob_model[keep_rows]
complications_test_df['y_prob_iss'] = iss_probs.flatten()[keep_rows]
complications_test_df['y_prob_triss'] = X_TRISS.values.flatten()[keep_rows]

# Step 6: Define evaluation function
def evaluate_group(df, pay_group_name, is_uninsured=True):
    """Score the ML model, ISS and TRISS on one payment-method stratum of ``df``.

    NOTE(review): this reuses the name ``evaluate_group`` from the race
    analysis cell, so running this cell replaces that earlier definition.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'PRIMARYMETHODPAYMENT', 'y_true',
        'y_prob_model', 'y_prob_iss' and 'y_prob_triss'.
    pay_group_name : str
        Value of 'PRIMARYMETHODPAYMENT' that defines the stratum.
    is_uninsured : bool, default True
        True  -> evaluate rows where PRIMARYMETHODPAYMENT == pay_group_name.
        False -> evaluate the complement (!= pay_group_name); the result is
                 labelled "Non-<pay_group_name>".

    Returns
    -------
    dict
        'Group', AUROC and Brier score for each of the three methods, the
        stratum size 'N', and the counts of 'Positives' / 'Negatives'.

    Notes
    -----
    The metrics are computed directly on the stratum; an empty stratum or one
    with a single outcome class is not handled here.
    """
    if not is_uninsured:
        group_label = f"Non-{pay_group_name}"
        group_df = df[df['PRIMARYMETHODPAYMENT'] != pay_group_name]
    else:
        group_label = pay_group_name
        group_df = df[df['PRIMARYMETHODPAYMENT'] == pay_group_name]

    outcomes = group_df['y_true']

    # (result key, prediction column) pairs, in the order they are reported
    auroc_targets = (
        ('AUROC_ML', 'y_prob_model'),
        ('AUROC_ISS', 'y_prob_iss'),
        ('AUROC_TRISS', 'y_prob_triss'),
    )
    brier_targets = (
        ('Brier_ML', 'y_prob_model'),
        ('Brier_ISS', 'y_prob_iss'),
        ('Brier_TRISS', 'y_prob_triss'),
    )

    summary = {'Group': group_label}
    for result_key, prob_column in auroc_targets:
        summary[result_key] = roc_auc_score(outcomes, group_df[prob_column])
    for result_key, prob_column in brier_targets:
        summary[result_key] = brier_score_loss(outcomes, group_df[prob_column])

    summary['N'] = len(group_df)
    summary['Positives'] = int((outcomes == 1).sum())
    summary['Negatives'] = int((outcomes == 0).sum())
    return summary

# Step 7: Run evaluation (self-pay patients vs. everyone else)
results_uninsured = evaluate_group(complications_test_df, 'Self-Pay', is_uninsured=True)
results_insured = evaluate_group(complications_test_df, 'Self-Pay', is_uninsured=False)


# Step 8: Print results
for result in [results_uninsured, results_insured]:
    print(f"=== {result['Group']} ===")
    print(f"AUROC (ML):    {result['AUROC_ML']:.3f}")
    print(f"AUROC (ISS):   {result['AUROC_ISS']:.3f}")
    print(f"AUROC (TRISS): {result['AUROC_TRISS']:.3f}")
    print(f"Brier (ML):    {result['Brier_ML']:.3f}")
    print(f"Brier (ISS):   {result['Brier_ISS']:.3f}")
    print(f"Brier (TRISS): {result['Brier_TRISS']:.3f}")
    print(f"N:             {result['N']}")
    print(f"Positives:     {result['Positives']}")
    print(f"Negatives:     {result['Negatives']}\n")

# Calculate AUROC delta between uninsured and insured patients for each method.
# The subtraction is uninsured minus insured, matching the other subgroup cells
# (group of interest minus its counterpart).
delta_auroc_model = results_uninsured['AUROC_ML'] - results_insured['AUROC_ML']
delta_auroc_iss = results_uninsured['AUROC_ISS'] - results_insured['AUROC_ISS']
delta_auroc_triss = results_uninsured['AUROC_TRISS'] - results_insured['AUROC_TRISS']

# Print deltas. The header states the direction actually computed above; it
# previously read "Insured - Uninsured", which inverted the sign's meaning.
print("=== AUROC Deltas (Uninsured - Insured) ===")
print(f"Model: {delta_auroc_model:.4f}")
print(f"ISS:   {delta_auroc_iss:.4f}")
print(f"TRISS: {delta_auroc_triss:.4f}")

In [None]:
##shorthand to subset the dataset by insurance status and check the size
# insured_df is simply "not Self-Pay", so rows with a missing payment method
# also fall into it.
selfpay_df = complications_test_df.loc[complications_test_df['PRIMARYMETHODPAYMENT'] == 'Self-Pay']
insured_df = complications_test_df.loc[complications_test_df['PRIMARYMETHODPAYMENT'] != 'Self-Pay']
selfpay_df.shape

In [None]:
##evaluate AUROCs in uninsured subgroup

# True labels and each method's predicted probabilities for the self-pay cohort
true_label_selfpay = selfpay_df['y_true']
predicted_prob_gbo_selfpay = selfpay_df['y_prob_model']
predicted_prob_triss_selfpay = selfpay_df['y_prob_triss']
predicted_prob_iss_selfpay = selfpay_df['y_prob_iss']

# ROC operating points (FPR, TPR, thresholds): ML model, TRISS, then ISS
fpr_gbo_selfpay, tpr_gbo_selfpay, thresholds_gbo_selfpay = roc_curve(true_label_selfpay, predicted_prob_gbo_selfpay)
fpr_triss_selfpay, tpr_triss_selfpay, thresholds_triss_selfpay = roc_curve(true_label_selfpay, predicted_prob_triss_selfpay)
fpr_iss_selfpay, tpr_iss_selfpay, thresholds_iss_selfpay = roc_curve(true_label_selfpay, predicted_prob_iss_selfpay)

# Area under each ROC curve
roc_auc_gbo_selfpay = auc(fpr_gbo_selfpay, tpr_gbo_selfpay)
roc_auc_triss_selfpay = auc(fpr_triss_selfpay, tpr_triss_selfpay)
roc_auc_iss_selfpay = auc(fpr_iss_selfpay, tpr_iss_selfpay)

# One (fpr, tpr, colour, legend text) entry per curve, in drawing order
roc_traces = (
    (fpr_gbo_selfpay, tpr_gbo_selfpay, 'b', f'ROC curve (area = {roc_auc_gbo_selfpay:.3f})'),
    (fpr_triss_selfpay, tpr_triss_selfpay, 'green', f'ROC AUC TRISS = {roc_auc_triss_selfpay:.3f}'),
    (fpr_iss_selfpay, tpr_iss_selfpay, 'darkorange', f'ROC AUC ISS = {roc_auc_iss_selfpay:.3f}'),
)

# Plot ROC curves plus the chance diagonal
plt.figure(figsize=(8, 8))
for trace_fpr, trace_tpr, trace_color, trace_label in roc_traces:
    plt.plot(trace_fpr, trace_tpr, color=trace_color, lw=2, label=trace_label)
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve-Selfpay')
plt.legend(loc='lower right')
plt.show()

In [None]:
##now reliability diagrams for all three methods in the uninsured subgroup

# Outcomes and predicted probabilities as plain arrays
y_true = np.array(selfpay_df['y_true'])
y_prob_gbo = np.array(selfpay_df['y_prob_model'])
y_prob_triss = np.array(selfpay_df['y_prob_triss'])
y_prob_iss = np.array(selfpay_df['y_prob_iss'])

# Bootstrapped calibration curve data per method (call order kept: ML, TRISS, ISS)
calib_gbo = get_bootstrap_calibration_data(y_true, y_prob_gbo, label="ML Model", color='b')
calib_triss = get_bootstrap_calibration_data(y_true, y_prob_triss, label="TRISS", color='green')
calib_iss = get_bootstrap_calibration_data(y_true, y_prob_iss, label="ISS", color='darkorange')

# Brier score per method
brier_gbo = brier_score_loss(y_true, y_prob_gbo)
brier_triss = brier_score_loss(y_true, y_prob_triss)
brier_iss = brier_score_loss(y_true, y_prob_iss)

# Report the Brier scores, one per line
print(
    f"Brier Score - ML Model: {brier_gbo:.4f}",
    f"Brier Score - TRISS:     {brier_triss:.4f}",
    f"Brier Score - ISS:       {brier_iss:.4f}",
    sep="\n",
)

# Reliability diagram: identity line first, then each curve with its shaded band
plt.figure(figsize=(8,6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

for calib in (calib_gbo, calib_triss, calib_iss):
    curve_color = calib["color"]
    plt.plot(calib["x"], calib["y"], marker='o', label=calib["label"], color=curve_color)
    plt.fill_between(calib["x"], calib["lower"], calib["upper"], color=curve_color, alpha=0.2)

plt.xlabel('Mean Predicted Probability')
plt.ylabel('Observed Mortality Rate')
plt.title('Reliability Diagram with 95% CI - Self-pay Patients')
plt.legend(loc='best')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.grid(True)
plt.show()

In [None]:
##now decision curve analysis in the uninsured cohort
# ========================================


# Thresholds for decision curve
decision_thresholds = np.linspace(0.0, 1.0, 101)



# Make sure y_true and predicted probs are arrays
y_true_array_dc = np.array(selfpay_df['y_true']).flatten()
y_prob_gbo_array_dc = np.array(selfpay_df['y_prob_model'])
iss_probs_dc = np.array(selfpay_df['y_prob_iss'])
X_TRISS_dc = selfpay_df['y_prob_triss']

# Net benefit for your new model
NB_model = net_benefit(y_true_array_dc, y_prob_gbo_array_dc, decision_thresholds)

# Net benefit for ISS (subgroup ISS-derived probabilities)
NB_ISS = net_benefit(y_true_array_dc, iss_probs_dc, decision_thresholds)

# Net benefit for TRISS (subgroup TRISS probabilities)
NB_TRISS = net_benefit(y_true_array_dc, X_TRISS_dc.values.flatten(), decision_thresholds)

# Net benefit for treat all and treat none.
# The "treat all" reference has to use the event rate of the cohort being
# plotted. Using the whole test set (Y_test) here would draw a reference line
# for a different population than the three model curves above.
N = len(Y_test)  # full test-set size, kept as before; not used in this cell
prevalence = np.mean(y_true_array_dc)  # fraction of positives in the self-pay cohort
# At t == 1.0 the odds t/(1-t) are undefined, so the value is pinned to 0.
treat_all_nb = [
    0 if t == 1.0 else prevalence - (1 - prevalence)*(t/(1-t))
    for t in decision_thresholds
]

treat_none_nb = np.zeros_like(decision_thresholds)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(decision_thresholds, NB_model, label='New ML Model', color='b')
plt.plot(decision_thresholds, NB_ISS, label='ISS', color='darkorange')
plt.plot(decision_thresholds, NB_TRISS, label='TRISS', color='g')
plt.plot(decision_thresholds, treat_all_nb, label='Treat All', color='red', linestyle='--')
plt.plot(decision_thresholds, treat_none_nb, label='Treat None', color='grey', linestyle=':')

plt.xlabel('Threshold Probability')
plt.ylabel('Net Benefit')
plt.title('Decision Curve Analysis-Selfpay')
plt.legend(loc='best')
plt.ylim([-0.06, 0.06])
plt.xlim([0, 1.0])
plt.grid(True)
plt.show()

In [None]:
##now do paired bootstrap testing to compare MLISS predictions to ISS/TRISS in uninsured patients

# Comparator name -> that comparator's predicted probabilities in the self-pay cohort
pairs = {"ISS": selfpay_df['y_prob_iss'], "TRISS": selfpay_df['y_prob_triss']}

# Compare the ML model against each comparator in turn
for var_name, var_array in pairs.items():
    # alpha is 0.05 split over the two comparators
    # (presumably a Bonferroni correction -- confirm)
    results = paired_bootstrap_auc_test(
        y_true=selfpay_df['y_true'],
        predA=selfpay_df['y_prob_model'],
        predB=var_array,
        n_boot=2000,      # or more for higher precision
        alpha=(0.05/2),
        random_state=42
    )
    coverage_str = f"{results['coverage']:.1f}%"

    # Assemble the whole report, then emit it in one go
    report_lines = [
        f"--- ML Model vs. {var_name} --- in uninsured patients",
        f"AUC(ML) = {results['aucA']:.3f}, {coverage_str} CI: "
        f"[{results['aucA_ci_lower']:.3f}, {results['aucA_ci_upper']:.3f}]",
        f"AUC({var_name}) = {results['aucB']:.3f}, {coverage_str} CI: "
        f"[{results['aucB_ci_lower']:.3f}, {results['aucB_ci_upper']:.3f}]",
        f"AUC diff (ML - {var_name}) = {results['baseline_diff']:.4f}, {coverage_str} CI: "
        f"[{results['diff_ci_lower']:.4f}, {results['diff_ci_upper']:.4f}]",
        f"p-value = {results['p_value']:.4f}\n",
    ]
    print("\n".join(report_lines))

In [None]:
##Now we're going to evaluate each ISS subgroup

# Step 1: Prepare complications_test_df with mortality mapped to 0/1
complications_test_df = complications_df.loc[X_test.index].copy()
complications_test_df['MORTALITY'] = complications_test_df['MORTALITY'].map({'No': 0, 'Yes': 1})

# Step 2: Drop rows missing the outcome or the ISS value, remembering WHICH rows
# were kept. The prediction arrays attached in Step 5 have one entry per test
# row, so they must be filtered with the same mask; without it the column
# assignments fail with a length mismatch as soon as dropna() removes a row.
keep_rows = complications_test_df[['MORTALITY', 'ISS_05']].notna().all(axis=1).values
complications_test_df = complications_test_df.dropna(subset=['MORTALITY', 'ISS_05'])
y_true_all = complications_test_df['MORTALITY'].astype(int).values

# Step 3: Scale the test data
X_test_scaled = scaler.transform(X_test_tensor)

# Step 4: Predict for entire test set once
y_prob_model = calibrated_model.predict_proba(X_test_scaled)[:, 1]

# Step 5: Attach predictions and labels
# NOTE(review): assumes y_prob_model, iss_probs and X_TRISS are ordered like the
# rows of X_test -- confirm against the cells that build them.
complications_test_df['y_true'] = y_true_all
complications_test_df['y_prob_model'] = y_prob_model[keep_rows]
complications_test_df['y_prob_iss'] = iss_probs.flatten()[keep_rows]
complications_test_df['y_prob_triss'] = X_TRISS.values.flatten()[keep_rows]

# Step 6: Define evaluation function for ISS group
def evaluate_iss_group(df, min_iss, max_iss=None):
    """Compute discrimination and calibration metrics for one ISS stratum.

    Parameters
    ----------
    df : DataFrame with columns 'ISS_05', 'y_true', 'y_prob_model',
        'y_prob_iss' and 'y_prob_triss'.
    min_iss : lower ISS bound. Inclusive when ``max_iss`` is given; when
        ``max_iss`` is None the stratum is strictly ``ISS_05 > min_iss``.
    max_iss : inclusive upper ISS bound, or None for an open-ended stratum.

    Returns
    -------
    dict with keys 'Group', 'AUROC_*' and 'Brier_*' (for ML, ISS, TRISS),
    'N', 'Positives' and 'Negatives'. AUROC values are NaN when the stratum
    does not contain both outcome classes, and all metrics are NaN when the
    stratum is empty, so that a sparse stratum does not abort the whole
    evaluation with a ValueError.
    """
    if max_iss is None:
        group_df = df[df['ISS_05'] > min_iss]
        label = f"ISS > {min_iss}"
    else:
        group_df = df[(df['ISS_05'] >= min_iss) & (df['ISS_05'] <= max_iss)]
        label = f"ISS {min_iss}-{max_iss}"

    y_true = group_df['y_true']
    n_rows = len(group_df)
    n_pos = int((y_true == 1).sum())
    n_neg = int((y_true == 0).sum())

    # AUROC is undefined unless both outcomes occur; Brier only needs at least one row.
    auroc_defined = n_pos > 0 and n_neg > 0
    brier_defined = n_rows > 0
    undefined = float('nan')

    score_columns = (('ML', 'y_prob_model'), ('ISS', 'y_prob_iss'), ('TRISS', 'y_prob_triss'))

    # Key order matches the original result layout: Group, AUROCs, Briers, counts.
    result = {'Group': label}
    for name, column in score_columns:
        result[f'AUROC_{name}'] = (
            roc_auc_score(y_true, group_df[column]) if auroc_defined else undefined
        )
    for name, column in score_columns:
        result[f'Brier_{name}'] = (
            brier_score_loss(y_true, group_df[column]) if brier_defined else undefined
        )
    result['N'] = n_rows
    result['Positives'] = n_pos
    result['Negatives'] = n_neg
    return result

# Step 7: Evaluate each ISS group
# (both bounds are inclusive; max_iss=None means strictly greater than min_iss)
results_iss_under9 = evaluate_iss_group(complications_test_df, min_iss=0, max_iss=8)
results_iss_9_15 = evaluate_iss_group(complications_test_df, min_iss=9, max_iss=15)
results_iss_16_25 = evaluate_iss_group(complications_test_df, min_iss=16, max_iss=25)
results_iss_over25 = evaluate_iss_group(complications_test_df, min_iss=25, max_iss=None)

# Step 8: Print results - one text block per group, metrics to 3 decimals, then the counts
for result in (results_iss_under9, results_iss_9_15, results_iss_16_25, results_iss_over25):
    report_lines = [f"=== {result['Group']} ==="]
    report_lines += [
        f"{metric_caption}{result[metric_key]:.3f}"
        for metric_caption, metric_key in (
            ("AUROC (ML):    ", 'AUROC_ML'),
            ("AUROC (ISS):   ", 'AUROC_ISS'),
            ("AUROC (TRISS): ", 'AUROC_TRISS'),
            ("Brier (ML):    ", 'Brier_ML'),
            ("Brier (ISS):   ", 'Brier_ISS'),
            ("Brier (TRISS): ", 'Brier_TRISS'),
        )
    ]
    report_lines += [
        f"N:             {result['N']}",
        f"Positives:     {result['Positives']}",
        f"Negatives:     {result['Negatives']}\n",
    ]
    print("\n".join(report_lines))

In [None]:
##shorthand to subset the dataset by ISS group and check the size
# Cohorts: ISS < 9, 9-15 (inclusive), 16-25 (inclusive) and > 25.
issunder9_df = complications_test_df.loc[complications_test_df['ISS_05'].lt(9)]
iss9to15_df = complications_test_df.loc[complications_test_df['ISS_05'].between(9, 15)]
iss16to25_df = complications_test_df.loc[complications_test_df['ISS_05'].between(16, 25)]
issover25_df = complications_test_df.loc[complications_test_df['ISS_05'].gt(25)]

# Report (rows, columns) for each cohort
for cohort_label, cohort_df in (
    ("ISS<9 shape:", issunder9_df),
    ("ISS 9-15 shape:", iss9to15_df),
    ("ISS 16-25 shape:", iss16to25_df),
    ("ISS>25 shape:", issover25_df),
):
    print(cohort_label, cohort_df.shape)

In [None]:
##evaluate AUROCs in ISS subgroups (this cell: ISS < 9 cohort)

##get predicted probs/true labels for each model
predicted_prob_iss_under9 = issunder9_df['y_prob_iss']
predicted_prob_triss_under9 = issunder9_df['y_prob_triss']
predicted_prob_gbo_under9 = issunder9_df['y_prob_model']
true_label_under9 = issunder9_df['y_true']

# Calculate the FPR, TPR, and thresholds for ISS, TRISS and the ML model (MLISS)
fpr_iss_under9, tpr_iss_under9, thresholds_iss_under9 = roc_curve(true_label_under9, predicted_prob_iss_under9)
fpr_triss_under9, tpr_triss_under9, thresholds_triss_under9 = roc_curve(true_label_under9, predicted_prob_triss_under9)
fpr_gbo_under9, tpr_gbo_under9, thresholds_gbo_under9 = roc_curve(true_label_under9, predicted_prob_gbo_under9)

# Calculate the area under the ROC curve (AUROC) for each method
roc_auc_iss_under9 = auc(fpr_iss_under9, tpr_iss_under9)
roc_auc_triss_under9 = auc(fpr_triss_under9, tpr_triss_under9)
roc_auc_gbo_under9 = auc(fpr_gbo_under9, tpr_gbo_under9)

# Plot ROC curve
plt.figure(figsize=(8, 8))
# The ML curve is labelled by name, like the TRISS and ISS entries, so the three
# curves can be told apart from the legend alone.
plt.plot(fpr_gbo_under9, tpr_gbo_under9, color='b', lw=2, label=f'ROC AUC ML Model = {roc_auc_gbo_under9:.3f}')
plt.plot(fpr_triss_under9, tpr_triss_under9, color='green', lw=2, label=f'ROC AUC TRISS = {roc_auc_triss_under9:.3f}')
plt.plot(fpr_iss_under9, tpr_iss_under9, color='darkorange', lw=2, label=f'ROC AUC ISS = {roc_auc_iss_under9:.3f}')
# Diagonal = chance-level discrimination
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve-ISS<9')
plt.legend(loc='lower right')
plt.show()

In [None]:
##now reliability diagrams for all three methods in the ISS <9 cohort

# Pull the true labels and each method's predicted probabilities out as plain arrays
y_true = issunder9_df['y_true'].to_numpy(copy=True)
y_prob_gbo, y_prob_triss, y_prob_iss = (
    issunder9_df[score_column].to_numpy(copy=True)
    for score_column in ('y_prob_model', 'y_prob_triss', 'y_prob_iss')
)

# Brier score per method, reported to 4 decimals
brier_gbo, brier_triss, brier_iss = (
    brier_score_loss(y_true, method_probs)
    for method_probs in (y_prob_gbo, y_prob_triss, y_prob_iss)
)

for brier_caption, brier_value in (
    ("Brier Score - ML Model : ", brier_gbo),
    ("Brier Score - TRISS     : ", brier_triss),
    ("Brier Score - ISS       : ", brier_iss),
):
    print(f"{brier_caption}{brier_value:.4f}")

# Helper function to get sorted calibration data
def get_bootstrap_calibration_data(y_true, y_prob, label, color, n_bins=10, n_boot=1000, random_state=42):
    """Bundle a bootstrapped calibration curve into a plot-ready dict.

    Calls ``bootstrap_calibration_curve`` (defined elsewhere in this notebook)
    with the given labels, probabilities and bootstrap settings, and unpacks
    its four returned arrays as (predicted, observed, lower CI, upper CI).
    All four are reordered by ascending predicted value so they can be drawn
    left to right.

    Returns a dict with keys "x", "y", "lower", "upper" (the sorted arrays)
    plus "label" and "color", which are passed through unchanged.
    """
    curve = bootstrap_calibration_curve(
        y_true, y_prob, n_bins=n_bins, n_boot=n_boot, random_state=random_state
    )
    mean_predicted, observed_rate, ci_low, ci_high = curve

    # One shared ordering keeps each point paired with its own CI bounds.
    order = np.argsort(mean_predicted)

    plot_data = dict(
        x=mean_predicted[order],
        y=observed_rate[order],
        lower=ci_low[order],
        upper=ci_high[order],
    )
    plot_data["label"] = label
    plot_data["color"] = color
    return plot_data

# Bootstrapped calibration curves for the ML model, TRISS and ISS
calib_gbo = get_bootstrap_calibration_data(y_true, y_prob_gbo, "ML Model", 'b')
calib_triss = get_bootstrap_calibration_data(y_true, y_prob_triss, "TRISS", 'green')
calib_iss = get_bootstrap_calibration_data(y_true, y_prob_iss, "ISS", 'darkorange')

# Reliability diagram: dashed diagonal is perfect calibration, shaded bands are bootstrap CIs
plt.figure(figsize=(8,6))
reliability_ax = plt.gca()
reliability_ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

for calib in (calib_gbo, calib_triss, calib_iss):
    reliability_ax.plot(calib["x"], calib["y"], marker='o', label=calib["label"], color=calib["color"])
    reliability_ax.fill_between(calib["x"], calib["lower"], calib["upper"], color=calib["color"], alpha=0.2)

reliability_ax.set_xlabel('Mean Predicted Probability')
reliability_ax.set_ylabel('Observed Mortality Rate')
reliability_ax.set_title('Reliability Diagram (ISS < 9)')
reliability_ax.legend(loc='best')
reliability_ax.set_xlim([0, 1])
reliability_ax.set_ylim([0, 1])
reliability_ax.grid(True)
plt.show()

In [None]:
##now decision curve analysis in the ISS<9 cohort
# ========================================

# Thresholds for decision curve
decision_thresholds = np.linspace(0.0, 1.0, 101)

# Make sure y_true and predicted probs are arrays
y_true_array_dc = np.array(issunder9_df['y_true']).flatten()
y_prob_gbo_array_dc = np.array(issunder9_df['y_prob_model'])
iss_probs_dc = np.array(issunder9_df['y_prob_iss'])
X_TRISS_dc = issunder9_df['y_prob_triss']

# Net benefit of the ML model, ISS and TRISS within this cohort
NB_model = net_benefit(y_true_array_dc, y_prob_gbo_array_dc, decision_thresholds)
NB_ISS = net_benefit(y_true_array_dc, iss_probs_dc, decision_thresholds)
NB_TRISS = net_benefit(y_true_array_dc, X_TRISS_dc.values.flatten(), decision_thresholds)

# Net benefit for treat all and treat none.
# The treat-all reference must use the mortality prevalence of THIS cohort: the model
# curves above are computed on the cohort only, so using the whole-test-set prevalence
# would compare them against a reference line from a different population.
# NOTE(review): N is not used in this cell; the assignment is kept unchanged in case a later cell reads it.
N = len(Y_test)
prevalence = np.mean(y_true_array_dc)  # fraction of positives in the cohort
# t/(1-t) is undefined at t == 1.0, so that point is set to 0
treat_all_nb = [
    0 if t == 1.0 else prevalence - (1 - prevalence) * (t / (1 - t))
    for t in decision_thresholds
]

treat_none_nb = np.zeros_like(decision_thresholds)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(decision_thresholds, NB_model, label='New ML Model', color='b')
plt.plot(decision_thresholds, NB_ISS, label='ISS', color='darkorange')
plt.plot(decision_thresholds, NB_TRISS, label='TRISS', color='g')
plt.plot(decision_thresholds, treat_all_nb, label='Treat All', color='red', linestyle='--')
plt.plot(decision_thresholds, treat_none_nb, label='Treat None', color='grey', linestyle=':')

plt.xlabel('Threshold Probability')
plt.ylabel('Net Benefit')
plt.title('Decision Curve Analysis-ISS <9')
plt.legend(loc='best')
plt.ylim([-0.01, 0.01])
plt.xlim([0, 1.0])
plt.grid(True)
plt.show()

In [None]:
##repeat for ISS9-15

# True labels and each method's predicted probabilities for the ISS 9-15 cohort
true_label_9to15, predicted_prob_iss_9to15, predicted_prob_triss_9to15, predicted_prob_gbo_9to15 = (
    iss9to15_df[cohort_column]
    for cohort_column in ('y_true', 'y_prob_iss', 'y_prob_triss', 'y_prob_model')
)

# ROC operating points (thresholds are discarded)
fpr_iss_9to15, tpr_iss_9to15, _ = roc_curve(true_label_9to15, predicted_prob_iss_9to15)
fpr_triss_9to15, tpr_triss_9to15, _ = roc_curve(true_label_9to15, predicted_prob_triss_9to15)
fpr_gbo_9to15, tpr_gbo_9to15, _ = roc_curve(true_label_9to15, predicted_prob_gbo_9to15)

# Area under each ROC curve
roc_auc_iss_9to15 = auc(fpr_iss_9to15, tpr_iss_9to15)
roc_auc_triss_9to15 = auc(fpr_triss_9to15, tpr_triss_9to15)
roc_auc_gbo_9to15 = auc(fpr_gbo_9to15, tpr_gbo_9to15)

# Draw the three curves (ML, TRISS, ISS) plus the chance diagonal
plt.figure(figsize=(8, 8))
for curve_fpr, curve_tpr, curve_color, curve_label in (
    (fpr_gbo_9to15, tpr_gbo_9to15, 'b', f'ML Model (AUC = {roc_auc_gbo_9to15:.3f})'),
    (fpr_triss_9to15, tpr_triss_9to15, 'green', f'TRISS (AUC = {roc_auc_triss_9to15:.3f})'),
    (fpr_iss_9to15, tpr_iss_9to15, 'darkorange', f'ISS (AUC = {roc_auc_iss_9to15:.3f})'),
):
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=2, label=curve_label)
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - ISS 9–15')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
##now reliability diagrams for all three methods in the ISS 9-15 cohort

# Extract predictions and true labels
y_true = np.array(iss9to15_df['y_true'])

y_prob_gbo = np.array(iss9to15_df['y_prob_model'])
y_prob_triss = np.array(iss9to15_df['y_prob_triss'])
y_prob_iss = np.array(iss9to15_df['y_prob_iss'])

# Report Brier scores
brier_gbo = brier_score_loss(y_true, y_prob_gbo)
brier_triss = brier_score_loss(y_true, y_prob_triss)
brier_iss = brier_score_loss(y_true, y_prob_iss)

print(f"Brier Score - ML Model : {brier_gbo:.4f}")
print(f"Brier Score - TRISS     : {brier_triss:.4f}")
print(f"Brier Score - ISS       : {brier_iss:.4f}")

# Get calibration data for each method.
# get_bootstrap_calibration_data is defined once, in the ISS < 9 reliability-diagram
# cell earlier in this notebook; it is reused here rather than re-defined so there is
# a single copy to maintain.
calib_gbo = get_bootstrap_calibration_data(y_true, y_prob_gbo, label="ML Model", color='b')
calib_triss = get_bootstrap_calibration_data(y_true, y_prob_triss, label="TRISS", color='green')
calib_iss = get_bootstrap_calibration_data(y_true, y_prob_iss, label="ISS", color='darkorange')

# Plot the reliability diagram (dashed diagonal = perfect calibration, bands = bootstrap CIs)
plt.figure(figsize=(8,6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

for calib in [calib_gbo, calib_triss, calib_iss]:
    plt.plot(calib["x"], calib["y"], marker='o', label=f'{calib["label"]}', color=calib["color"])
    plt.fill_between(calib["x"], calib["lower"], calib["upper"], color=calib["color"], alpha=0.2)

plt.xlabel('Mean Predicted Probability')
plt.ylabel('Observed Mortality Rate')
plt.title('Reliability Diagram (ISS 9-15)')
plt.legend(loc='best')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.grid(True)
plt.show()

In [None]:
##now decision curve analysis in the ISS 9-15 cohort
# ========================================

# Thresholds for decision curve
decision_thresholds = np.linspace(0.0, 1.0, 101)

# Make sure y_true and predicted probs are arrays
y_true_array_dc = np.array(iss9to15_df['y_true']).flatten()
y_prob_gbo_array_dc = np.array(iss9to15_df['y_prob_model'])
iss_probs_dc = np.array(iss9to15_df['y_prob_iss'])
X_TRISS_dc = iss9to15_df['y_prob_triss']

# Net benefit of the ML model, ISS and TRISS within this cohort
NB_model = net_benefit(y_true_array_dc, y_prob_gbo_array_dc, decision_thresholds)
NB_ISS = net_benefit(y_true_array_dc, iss_probs_dc, decision_thresholds)
NB_TRISS = net_benefit(y_true_array_dc, X_TRISS_dc.values.flatten(), decision_thresholds)

# Net benefit for treat all and treat none.
# The treat-all reference must use the mortality prevalence of THIS cohort: the model
# curves above are computed on the cohort only, so using the whole-test-set prevalence
# would compare them against a reference line from a different population.
# NOTE(review): N is not used in this cell; the assignment is kept unchanged in case a later cell reads it.
N = len(Y_test)
prevalence = np.mean(y_true_array_dc)  # fraction of positives in the cohort
# t/(1-t) is undefined at t == 1.0, so that point is set to 0
treat_all_nb = [
    0 if t == 1.0 else prevalence - (1 - prevalence) * (t / (1 - t))
    for t in decision_thresholds
]

treat_none_nb = np.zeros_like(decision_thresholds)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(decision_thresholds, NB_model, label='New ML Model', color='b')
plt.plot(decision_thresholds, NB_ISS, label='ISS', color='darkorange')
plt.plot(decision_thresholds, NB_TRISS, label='TRISS', color='g')
plt.plot(decision_thresholds, treat_all_nb, label='Treat All', color='red', linestyle='--')
plt.plot(decision_thresholds, treat_none_nb, label='Treat None', color='grey', linestyle=':')

plt.xlabel('Threshold Probability')
plt.ylabel('Net Benefit')
plt.title('Decision Curve Analysis-ISS 9to15')
plt.legend(loc='best')
plt.ylim([-0.02, 0.02])
plt.xlim([0, 1.0])
plt.grid(True)
plt.show()

In [None]:
##repeat for ISS16-25

# True labels and each method's predicted probabilities for the ISS 16-25 cohort
true_label_16to25, predicted_prob_iss_16to25, predicted_prob_triss_16to25, predicted_prob_gbo_16to25 = (
    iss16to25_df[cohort_column]
    for cohort_column in ('y_true', 'y_prob_iss', 'y_prob_triss', 'y_prob_model')
)

# ROC operating points (thresholds are discarded)
fpr_iss_16to25, tpr_iss_16to25, _ = roc_curve(true_label_16to25, predicted_prob_iss_16to25)
fpr_triss_16to25, tpr_triss_16to25, _ = roc_curve(true_label_16to25, predicted_prob_triss_16to25)
fpr_gbo_16to25, tpr_gbo_16to25, _ = roc_curve(true_label_16to25, predicted_prob_gbo_16to25)

# Area under each ROC curve
roc_auc_iss_16to25 = auc(fpr_iss_16to25, tpr_iss_16to25)
roc_auc_triss_16to25 = auc(fpr_triss_16to25, tpr_triss_16to25)
roc_auc_gbo_16to25 = auc(fpr_gbo_16to25, tpr_gbo_16to25)

# Draw the three curves (ML, TRISS, ISS) plus the chance diagonal
plt.figure(figsize=(8, 8))
for curve_fpr, curve_tpr, curve_color, curve_label in (
    (fpr_gbo_16to25, tpr_gbo_16to25, 'b', f'ML Model (AUC = {roc_auc_gbo_16to25:.3f})'),
    (fpr_triss_16to25, tpr_triss_16to25, 'green', f'TRISS (AUC = {roc_auc_triss_16to25:.3f})'),
    (fpr_iss_16to25, tpr_iss_16to25, 'darkorange', f'ISS (AUC = {roc_auc_iss_16to25:.3f})'),
):
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=2, label=curve_label)
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - ISS 16–25')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
##repeat RD (reliability diagram) for ISS 16-25

# Extract predictions and true labels
y_true = np.array(iss16to25_df['y_true'])

y_prob_gbo = np.array(iss16to25_df['y_prob_model'])
y_prob_triss = np.array(iss16to25_df['y_prob_triss'])
y_prob_iss = np.array(iss16to25_df['y_prob_iss'])

# Report Brier scores
brier_gbo = brier_score_loss(y_true, y_prob_gbo)
brier_triss = brier_score_loss(y_true, y_prob_triss)
brier_iss = brier_score_loss(y_true, y_prob_iss)

print(f"Brier Score - ML Model : {brier_gbo:.4f}")
print(f"Brier Score - TRISS     : {brier_triss:.4f}")
print(f"Brier Score - ISS       : {brier_iss:.4f}")

# Get calibration data for each method.
# get_bootstrap_calibration_data is defined once, in the ISS < 9 reliability-diagram
# cell earlier in this notebook; it is reused here rather than re-defined so there is
# a single copy to maintain.
calib_gbo = get_bootstrap_calibration_data(y_true, y_prob_gbo, label="ML Model", color='b')
calib_triss = get_bootstrap_calibration_data(y_true, y_prob_triss, label="TRISS", color='green')
calib_iss = get_bootstrap_calibration_data(y_true, y_prob_iss, label="ISS", color='darkorange')

# Plot the reliability diagram (dashed diagonal = perfect calibration, bands = bootstrap CIs)
plt.figure(figsize=(8,6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

for calib in [calib_gbo, calib_triss, calib_iss]:
    plt.plot(calib["x"], calib["y"], marker='o', label=f'{calib["label"]}', color=calib["color"])
    plt.fill_between(calib["x"], calib["lower"], calib["upper"], color=calib["color"], alpha=0.2)

plt.xlabel('Mean Predicted Probability')
plt.ylabel('Observed Mortality Rate')
plt.title('Reliability Diagram (ISS 16-25)')
plt.legend(loc='best')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.grid(True)
plt.show()

In [None]:
#repeat DC (decision curve analysis) for ISS 16-25
# ========================================

# Thresholds for decision curve
decision_thresholds = np.linspace(0.0, 1.0, 101)

# Make sure y_true and predicted probs are arrays
y_true_array_dc = np.array(iss16to25_df['y_true']).flatten()
y_prob_gbo_array_dc = np.array(iss16to25_df['y_prob_model'])
iss_probs_dc = np.array(iss16to25_df['y_prob_iss'])
X_TRISS_dc = iss16to25_df['y_prob_triss']

# Net benefit of the ML model, ISS and TRISS within this cohort
NB_model = net_benefit(y_true_array_dc, y_prob_gbo_array_dc, decision_thresholds)
NB_ISS = net_benefit(y_true_array_dc, iss_probs_dc, decision_thresholds)
NB_TRISS = net_benefit(y_true_array_dc, X_TRISS_dc.values.flatten(), decision_thresholds)

# Net benefit for treat all and treat none.
# The treat-all reference must use the mortality prevalence of THIS cohort: the model
# curves above are computed on the cohort only, so using the whole-test-set prevalence
# would compare them against a reference line from a different population.
# NOTE(review): N is not used in this cell; the assignment is kept unchanged in case a later cell reads it.
N = len(Y_test)
prevalence = np.mean(y_true_array_dc)  # fraction of positives in the cohort
# t/(1-t) is undefined at t == 1.0, so that point is set to 0
treat_all_nb = [
    0 if t == 1.0 else prevalence - (1 - prevalence) * (t / (1 - t))
    for t in decision_thresholds
]

treat_none_nb = np.zeros_like(decision_thresholds)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(decision_thresholds, NB_model, label='New ML Model', color='b')
plt.plot(decision_thresholds, NB_ISS, label='ISS', color='darkorange')
plt.plot(decision_thresholds, NB_TRISS, label='TRISS', color='g')
plt.plot(decision_thresholds, treat_all_nb, label='Treat All', color='red', linestyle='--')
plt.plot(decision_thresholds, treat_none_nb, label='Treat None', color='grey', linestyle=':')

plt.xlabel('Threshold Probability')
plt.ylabel('Net Benefit')
plt.title('Decision Curve Analysis-ISS 16-25')
plt.legend(loc='best')
plt.ylim([-0.1, 0.1])
plt.xlim([0, 1.0])
plt.grid(True)
plt.show()

In [None]:
##repeat for ISS>25

# True labels and each method's predicted probabilities for the ISS > 25 cohort
true_label_over25, predicted_prob_iss_over25, predicted_prob_triss_over25, predicted_prob_gbo_over25 = (
    issover25_df[cohort_column]
    for cohort_column in ('y_true', 'y_prob_iss', 'y_prob_triss', 'y_prob_model')
)

# ROC operating points (thresholds are discarded)
fpr_iss_over25, tpr_iss_over25, _ = roc_curve(true_label_over25, predicted_prob_iss_over25)
fpr_triss_over25, tpr_triss_over25, _ = roc_curve(true_label_over25, predicted_prob_triss_over25)
fpr_gbo_over25, tpr_gbo_over25, _ = roc_curve(true_label_over25, predicted_prob_gbo_over25)

# Area under each ROC curve
roc_auc_iss_over25 = auc(fpr_iss_over25, tpr_iss_over25)
roc_auc_triss_over25 = auc(fpr_triss_over25, tpr_triss_over25)
roc_auc_gbo_over25 = auc(fpr_gbo_over25, tpr_gbo_over25)

# Draw the three curves (ML, TRISS, ISS) plus the chance diagonal
plt.figure(figsize=(8, 8))
for curve_fpr, curve_tpr, curve_color, curve_label in (
    (fpr_gbo_over25, tpr_gbo_over25, 'b', f'ML Model (AUC = {roc_auc_gbo_over25:.3f})'),
    (fpr_triss_over25, tpr_triss_over25, 'green', f'TRISS (AUC = {roc_auc_triss_over25:.3f})'),
    (fpr_iss_over25, tpr_iss_over25, 'darkorange', f'ISS (AUC = {roc_auc_iss_over25:.3f})'),
):
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=2, label=curve_label)
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - ISS > 25')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
##repeat RD (reliability diagram) for ISS>25

# Extract predictions and true labels
y_true = np.array(issover25_df['y_true'])

y_prob_gbo = np.array(issover25_df['y_prob_model'])
y_prob_triss = np.array(issover25_df['y_prob_triss'])
y_prob_iss = np.array(issover25_df['y_prob_iss'])

# Report Brier scores
brier_gbo = brier_score_loss(y_true, y_prob_gbo)
brier_triss = brier_score_loss(y_true, y_prob_triss)
brier_iss = brier_score_loss(y_true, y_prob_iss)

print(f"Brier Score - ML Model : {brier_gbo:.4f}")
print(f"Brier Score - TRISS     : {brier_triss:.4f}")
print(f"Brier Score - ISS       : {brier_iss:.4f}")

# Get calibration data for each method.
# get_bootstrap_calibration_data is defined once, in the ISS < 9 reliability-diagram
# cell earlier in this notebook; it is reused here rather than re-defined so there is
# a single copy to maintain.
calib_gbo = get_bootstrap_calibration_data(y_true, y_prob_gbo, label="ML Model", color='b')
calib_triss = get_bootstrap_calibration_data(y_true, y_prob_triss, label="TRISS", color='green')
calib_iss = get_bootstrap_calibration_data(y_true, y_prob_iss, label="ISS", color='darkorange')

# Plot the reliability diagram (dashed diagonal = perfect calibration, bands = bootstrap CIs)
plt.figure(figsize=(8,6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

for calib in [calib_gbo, calib_triss, calib_iss]:
    plt.plot(calib["x"], calib["y"], marker='o', label=f'{calib["label"]}', color=calib["color"])
    plt.fill_between(calib["x"], calib["lower"], calib["upper"], color=calib["color"], alpha=0.2)

plt.xlabel('Mean Predicted Probability')
plt.ylabel('Observed Mortality Rate')
plt.title('Reliability Diagram (ISS > 25)')
plt.legend(loc='best')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.grid(True)
plt.show()

In [None]:
# repeat DC (decision curve analysis) for ISS>25
# ========================================

# Thresholds for decision curve
decision_thresholds = np.linspace(0.0, 1.0, 101)

# Make sure y_true and predicted probs are arrays
y_true_array_dc = np.array(issover25_df['y_true']).flatten()
y_prob_gbo_array_dc = np.array(issover25_df['y_prob_model'])
iss_probs_dc = np.array(issover25_df['y_prob_iss'])
X_TRISS_dc = issover25_df['y_prob_triss']

# Net benefit of the ML model, ISS and TRISS within this cohort
NB_model = net_benefit(y_true_array_dc, y_prob_gbo_array_dc, decision_thresholds)
NB_ISS = net_benefit(y_true_array_dc, iss_probs_dc, decision_thresholds)
NB_TRISS = net_benefit(y_true_array_dc, X_TRISS_dc.values.flatten(), decision_thresholds)

# Net benefit for treat all and treat none.
# The treat-all reference must use the mortality prevalence of THIS cohort: the model
# curves above are computed on the cohort only, so using the whole-test-set prevalence
# would compare them against a reference line from a different population.
# NOTE(review): N is not used in this cell; the assignment is kept unchanged in case a later cell reads it.
N = len(Y_test)
prevalence = np.mean(y_true_array_dc)  # fraction of positives in the cohort
# t/(1-t) is undefined at t == 1.0, so that point is set to 0
treat_all_nb = [
    0 if t == 1.0 else prevalence - (1 - prevalence) * (t / (1 - t))
    for t in decision_thresholds
]

treat_none_nb = np.zeros_like(decision_thresholds)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(decision_thresholds, NB_model, label='New ML Model', color='b')
plt.plot(decision_thresholds, NB_ISS, label='ISS', color='darkorange')
plt.plot(decision_thresholds, NB_TRISS, label='TRISS', color='g')
plt.plot(decision_thresholds, treat_all_nb, label='Treat All', color='red', linestyle='--')
plt.plot(decision_thresholds, treat_none_nb, label='Treat None', color='grey', linestyle=':')

plt.xlabel('Threshold Probability')
plt.ylabel('Net Benefit')
plt.title('Decision Curve Analysis-ISS >25')
plt.legend(loc='best')
plt.ylim([-0.3, 0.3])
plt.xlim([0, 1.0])
plt.grid(True)
plt.show()

In [None]:
##now let's compare performance of MLISS to both TRISS and ISS within each ISS cohort

def _print_paired_auc_report(comparator, res):
    """Print the ML-vs-comparator summary from one paired_bootstrap_auc_test result dict."""
    coverage = res['coverage']
    print(f"--- ML vs {comparator} ---")
    print(f"AUC(ML): {res['aucA']:.3f}, {coverage}% CI: [{res['aucA_ci_lower']:.3f}, {res['aucA_ci_upper']:.3f}]")
    print(f"AUC({comparator}): {res['aucB']:.3f}, {coverage}% CI: [{res['aucB_ci_lower']:.3f}, {res['aucB_ci_upper']:.3f}]")
    print(f"ΔAUC: {res['baseline_diff']:.4f}, CI: [{res['diff_ci_lower']:.4f}, {res['diff_ci_upper']:.4f}], p = {res['p_value']:.4f}")

# === Define ISS bins ===
# Left-closed intervals: [0, 9), [9, 16), [16, 26), [26, inf)
bins = [0, 9, 16, 26, np.inf]
labels = ["<9", "9–15", "16–25", ">25"]
complications_test_df['ISS_BIN'] = pd.cut(
    complications_test_df['ISS_05'], bins=bins, labels=labels, right=False
)

# === Loop over bins and run paired bootstrap ===
for group in labels:
    df_sub = complications_test_df.loc[complications_test_df['ISS_BIN'] == group]
    y_true = df_sub['y_true'].to_numpy()
    ml_pred = df_sub['y_prob_model'].to_numpy()
    iss_pred = df_sub['y_prob_iss'].to_numpy()
    triss_pred = df_sub['y_prob_triss'].to_numpy()

    print(f"\n=== ISS Stratum: {group} ===")

    # ML vs ISS
    res_iss = paired_bootstrap_auc_test(y_true, ml_pred, iss_pred)
    _print_paired_auc_report("ISS", res_iss)

    # ML vs TRISS
    res_triss = paired_bootstrap_auc_test(y_true, ml_pred, triss_pred)
    _print_paired_auc_report("TRISS", res_triss)
