In [None]:
##install and import the necessary modules
##this code was originally designed and run in Google Colab
##use outside of Colab may require modification
##if using Colab, you may need to restart your runtime after installing modules,
##depending on the environment at the time the code is run.

!pip install scikit-learn==1.5.2
!pip install tensorflow==2.12.1
!pip install xgboost==2.0.2
!pip install shap
import shap
import sys
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sn
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc as sk_auc, precision_recall_curve, recall_score, confusion_matrix, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Use seaborn's white-grid look for the figures drawn in this notebook.
sn.set(style='whitegrid')

# Print the interpreter and key library versions so a run can be reproduced.
for _label, _version in (
    ("Python", sys.version),
    ("scikit-learn", sklearn.__version__),
    ("XGBoost", xgb.__version__),
    ("shap", shap.__version__),
):
    print(f"{_label} version:", _version)

In [None]:
##import your dataset
##mount google drive if using in colab. Replace <MOUNT_POINT> with the directory where you want
##to mount the drive (e.g., /content/drive). The mount point is defined once so that the mount
##call and the file path built below can never disagree.
mount_point = '<MOUNT_POINT>'
drive.mount(mount_point)

# Replace <YOUR_FILE_PATH> with the actual path inside your Google Drive (e.g., My Drive/FileNameHere).
file_path = f'{mount_point}/<YOUR_FILE_PATH>.csv'

In [None]:
# Import data and specify missing values.
# NOTE(review): pandas applies its own default NA markers in addition to this list
# (keep_default_na is left at its default). Confirm that no genuine category label
# (e.g. a GCS level spelled 'None') is silently read as missing here.
data = pd.read_csv(file_path, na_values=['NA', 'N/A', 'NULL', ' ', '', '-99', '-98', '-99.0', '-99.00', '-98.0', '-98.00', 'NaN'])

# Filter out rows where 'TRAUMATYPE' is 26, 'Other/unspecified', or 'Burn'.
# A missing column is still tolerated (as before), but it is checked explicitly and
# reported instead of hiding every possible error behind a bare `except: pass`.
exclude_values = ['26', 'Other/unspecified', 'Burn']
if 'TRAUMATYPE' in data.columns:
    data = data[~data['TRAUMATYPE'].isin(exclude_values)]
else:
    print("Column 'TRAUMATYPE' not found; no trauma-type rows were excluded.")

In [None]:
##check dataframe to ensure it appears as it should
##(displays the first rows of `data` as the cell output)
data.head()

In [None]:
##count the missing values in each column of the loaded data
data.isna().sum()

In [None]:
##Columns that are removed from the X data set: complications and other things not
##available on admission, plus the remaining columns named in this list.
##One of these columns is picked later as the Y data set.
##
##Deliberately NOT listed (left commented out in the original cell, so they stay in X):
##  'EDDISCHARGEDISPOSITION', 'EDDISCHARGEHRS', 'VTEPROPHYLAXISTYPE',
##  'VTEPPXStartOver24/48/72/96', 'ICUOver24/48/72/96', 'VentOver24/48/72/96'
complications_list = [
    # hospital complications
    'HC_CLABSI', 'HC_DEEPSSI', 'HC_DVTHROMBOSIS', 'HC_ALCOHOLWITHDRAWAL', 'HC_CARDARREST',
    'HC_CAUTI', 'HC_EMBOLISM', 'HC_EXTREMITYCS', 'HC_INTUBATION', 'HC_KIDNEY', 'HC_MI',
    'HC_ORGANSPACESSI', 'HC_OSTEOMYELITIS', 'HC_RESPIRATORY', 'HC_RETURNOR', 'HC_SEPSIS',
    'HC_STROKECVA', 'HC_SUPERFICIALINCISIONSSI', 'HC_PRESSUREULCER', 'HC_UNPLANNEDICU',
    'HC_VAPNEUMONIA',
    # disposition, withdrawal, length-of-stay and timing columns
    'HOSPDISCHARGEDISPOSITION',
    'WITHDRAWALLST',
    'TOTALICULOS',
    'TOTALVENTDAYS',
    'VTEPROPHYLAXISHRS',
    'VTEPROPHYLAXISDAYS', 'MORTALITY', 'EDDISCHARGEDAYS', 'FINALDISCHARGEDAYS',
    'FINALDISCHARGEHRS', 'HMRRHGCTRLSURGDAYS', 'WITHDRAWALLSTHRS',
    # race indicator columns
    'AMERICANINDIAN', 'ASIAN', 'BLACK', 'PACIFICISLANDER', 'RACEOTHER', 'WHITE',
    'RACE_NA', 'RACE_UK',
    # transport-mode indicator columns
    'TM_GROUNDAMBULANCE', 'TM_HELICOPTERAMBULANCE', 'TM_FIXEDWINGAMBULANCE',
    'TM_PRIVPUBVEHWALKIN', 'TM_POLICE', 'TM_OTHER', 'TM_NA', 'TM_UK',
    # injury-severity columns
    'ISS_05',
    'AIS_FACE', 'AIS_NECK', 'AIS_HEAD', 'AIS_THORAX', 'AIS_ABDOMEN', 'AIS_SPINE',
    'AIS_UPPEREX', 'AIS_LOWEREX', 'AIS_SKIN', 'AIS_OTHER',
    # facility-level columns
    'FacilityTotalWLST', 'factilityTotalPatients', 'FacilityWLSTRate', 'FacilityKey',
    'facilityWLSTNew', 'WLSTRateNew', 'WLSTRateNewCensored',
    "facilityPatientsNew", "WLSTRateCensorNormal", "HOSPITALTYPE", "STATEDESIGNATION",
    "TEACHINGSTATUS", "VERIFICATIONLEVEL",
]

# Copy all listed columns out of `data` in a single selection (same columns, same order,
# same row index as adding them one at a time).
complications_df = data[complications_list].copy()

In [None]:
##this is where we choose our outcome variable, in this case WLST (column 'WITHDRAWALLST'),
##and move it to a separate single-column dataframe whose column is named 'WLST'
Y_data = data[['WITHDRAWALLST']].rename(columns={'WITHDRAWALLST': 'WLST'})
Y_data

In [None]:
##clean Y_data by recoding the outcome: "Yes" becomes 1 and "No" becomes 0

wlst_codes = {'Yes': 1, 'No': 0}
Y_data['WLST'] = Y_data['WLST'].replace(wlst_codes)
Y_data

In [None]:
##build the input space by removing every column named in complications_list
X_data = data.drop(complications_list, axis=1)
X_data.shape

In [None]:
##count the cases with a missing outcome; those cases need to be removed
Missing_Y = Y_data.isna().sum()
Missing_Y

In [None]:
##here we find which rows in Y have missing values
##
##The row *labels* are collected (not the 0..n-1 positions), because the later
##`drop(bad_row_index_list, axis=0)` calls remove rows by label. After the TRAUMATYPE
##filter the index is never reset, so positions and labels differ as soon as that
##filter removes a row; using positions would drop the wrong rows or raise a KeyError.

bad_row_index_list = Y_data.index[Y_data.isnull().any(axis=1)].tolist()
bad_row_index_list

In [None]:
##now remove the flagged rows from Y (rows are selected by index label)
Y_clean = Y_data.drop(index=bad_row_index_list)
Y_clean

In [None]:
##confirm that no case with a missing outcome remains
Missing_Y_clean = Y_clean.isna().sum()
Missing_Y_clean

In [None]:
##and remove the same rows from X so that X and Y keep matching rows
X_data = X_data.drop(index=bad_row_index_list)

In [None]:
# Recode 'TRAUMATYPE' into a new 'Penetrating' column (Penetrating -> 1, Blunt -> 0;
# values not in the mapping become NaN), then discard the original 'TRAUMATYPE' column.
penetrating_codes = {'Penetrating': 1, 'Blunt': 0}
X_data = (
    X_data
    .assign(Penetrating=X_data['TRAUMATYPE'].map(penetrating_codes))
    .drop(columns=['TRAUMATYPE'])
)

In [None]:
##drop the patient record number, as it is not useful in making predictions
##(errors='ignore' makes this a no-op when the column is absent)

columns_to_remove = ['inc_key']
X_data = X_data.drop(columns_to_remove, axis=1, errors='ignore')

In [None]:
##first we will convert No's and Yes's to 0's and 1's to minimize the amount of double variables
##(want to avoid Yes/Nos being converted to 1-hot variables)
##want code to be reusable between different populations of input data. Not every population
##will have all of these variables, so each column-specific recoding below is applied only when
##that column is present. Absence is checked explicitly rather than hidden behind bare
##`except: pass` blocks, which would also swallow unrelated errors.

# Dataset-wide recoding of boolean-like labels:
##male coded as 0
##female coded as 1
try:
    X_data = X_data.replace({True: 1, 'Yes': 1, "Female": 1, False: 0, 'No': 0, "Male": 0})
except Exception as err:
    # Still best-effort, as in the original cell, but the failure is now reported.
    print(f"Dataset-wide Yes/No/sex recoding was skipped: {err}")

# Shared GCS component scales, reused by every column that uses the same labels.
gcs_eye_codes = {'None': 1, 'To pressure': 2, 'To sound': 3, 'Spontaneous': 4}
# Both spellings of the top verbal level are accepted: the original cell used 'Oriented' for
# EMSGCSVERBAL but 'Orientated' for GCSVERBAL, so whichever spelling the data does not use
# was left unmapped in one of the two columns.
gcs_verbal_codes = {'None': 1, 'Sounds': 2, 'Words': 3, 'Confused': 4,
                    'Oriented': 5, 'Orientated': 5}
gcs_motor_codes = {'None': 1, 'Extension': 2, 'Abnormal Flexion': 3,
                   'Normal Flexion': 4, 'Localising': 5, 'Obeys commands': 6}

# Column name -> {label: numeric code}
column_recodes = {
    ##not hispanic coded as 0
    ##hispanic coded as 1
    'ETHNICITY': {'Hispanic or Latino': 1, 'Not Hispanic or Latino': 0},
    'EMSGCSEYE': gcs_eye_codes,
    'GCSEYE': gcs_eye_codes,
    'EMSGCSVERBAL': gcs_verbal_codes,
    'EMSGCSMOTOR': gcs_motor_codes,
    'TBIGCSMOTOR': gcs_motor_codes,
    'GCSVERBAL': gcs_verbal_codes,
    'GCSMOTOR': gcs_motor_codes,
    'RESPIRATORYASSISTANCE': {'Assisted Respiratory Rate': 1,
                              'Unassisted Respiratory Rate': 0},
    'SUPPLEMENTALOXYGEN': {'Supplemental Oxygen': 1,
                           'No Supplemental Oxygen': 0},
}

for column, codes in column_recodes.items():
    if column in X_data.columns:
        X_data[column] = X_data[column].replace(codes)

X_data.head()

In [None]:
##replace boolean values in binary variables with numeric values (True -> 1, False -> 0)
bool_codes = {True: 1, False: 0}
X_data = X_data.replace(bool_codes)

In [None]:
##list the input variables that have missing values, with their missing counts
Missing = X_data.isna().sum()
Missing[Missing.gt(0)]

In [None]:
##percentage of missing values for every input variable (sorting happens in a later cell)
data_missing = X_data.isna().sum().div(X_data.shape[0]).mul(100)
data_missing

In [None]:
##display variables withOUT missing data
data_missing.index[data_missing == 0]

In [None]:
#remove the good columns (no missing values) so only variables with missing data remain
complete_columns = data_missing.index[data_missing == 0]
data_missing = data_missing.drop(index=complete_columns)
data_missing

In [None]:
#sort in DESCENDING order (largest missing percentage first) and show every row
data_missing = data_missing.sort_values(ascending=False)
pd.set_option('display.max_rows', None)
data_missing

In [None]:
##prepare to drop variables with at least 50% missing values (the cutoff is inclusive)
dropCutoff=50
bad_column_names = data_missing.index[data_missing.ge(dropCutoff)]
bad_column_names

In [None]:
##actually drop the variables selected in bad_column_names
X_data_new = X_data.drop(columns=bad_column_names)

##check which of the remaining variables still have missing data
Missing = X_data_new.isna().sum()
Missing[Missing.gt(0)]

In [None]:
#collect the columns whose missing percentage is below the drop cutoff (these still need to
#be cleaned), then list every column that remains in the input space
pd.set_option('display.max_rows', None)
# Use the same cutoff variable as the drop step instead of a second hard-coded 50,
# so the two cells cannot drift apart if the cutoff is changed.
to_be_cleaned_column_names = data_missing[data_missing < dropCutoff].index
for col in X_data_new:
    print(col)

In [None]:
# Column names treated as continuous. Later cells intersect this list with the columns
# actually present in the feature matrix before using it, so names absent from the
# data are simply ignored there.
continuous_vars = [
    "AGEYEARS", "EMSPULSERATE", "EMSRESPIRATORYRATE", "EMSTOTALGCS", "EMSDISPATCHDAYS",
    "EMSSCENEHRS", "EMSSCENEDAYS", "EMSHRS", "EMSDAYS", "SBP", "PULSERATE", "TEMPERATURE",
    "RESPIRATORYRATE", "PULSEOXIMETRY", "HEIGHT", "WEIGHT", "TOTALGCS", "ALCOHOLSCREENRESULT",
    "EDDISCHARGEHRS", "TBIHIGHESTTOTALGCS", "TBIGCSMOTOR", "BLOODUNITS", "PLASMAUNITS",
    "NumberOfInjuries", "mFI"
]

# Column names treated as categorical (binary indicators and multi-level factors).
# Later cells keep only the names that are present in the feature matrix and
# ordinal-encode those columns.
categorical_vars = [
    "SEX", "RACE", "ETHNICITY", "MECHANISM", "INTENT", "WORKRELATED", "ABUSEREPORT",
    "PROTDEV_NONE", "PROTDEV_LAP_BELT", "PROTDEV_PER_FLOAT", "PROTDEV_PROTECT_GEAR",
    "PROTDEV_EYE_PROTECT", "PROTDEV_CHILD_RESTRAINT", "PROTDEV_HELMET", "PROTDEV_AIRBAG_PRESENT",
    "PROTDEV_PROTECT_CLOTH", "PROTDEV_SHOULDER_BELT", "PROTDEV_OTHER", "PROTDEV_NA", "PROTDEV_UK",
    "AIRBAG_NOTDEPLOYED", "AIRBAG_DEPLOYED_FRNT", "AIRBAG_DEPLOYED_SIDE", "AIRBAG_DEPLOYED_OTHER",
    "AIRBAG_DEPLOYED_NA", "AIRBAG_DEPLOYED_UK", "TRANSPORTMODE", "INTERFACILITYTRANSFER",
    "PREHOSPITALCARDIACARREST", "TCCGCSLE13", "TCC10RR29", "TCCPEN", "TCCCHEST", "TCCLONGBONE",
    "TCCCRUSHED", "TCCAMPUTATION", "TCCPELVIC", "TCCSKULLFRACTURE", "TCCPARALYSIS", "TCC_NA",
    "TCC_UK", "VPOFALLADULT", "VPOFALLCHILD", "VPOCRASHINTRUSION", "VPOCRASHEJECT",
    "VPOCRASHDEATH", "VPOCRASHTELEMETRY", "VPOAUTOPEDIMPACT", "VPOMOTORCYCLECRASH",
    "VPO65SBP110", "VPOANTICOAGULANT", "VPOPREGNANCY20WKS", "VPOEMSJUDGE", "VPOBURNS",
    "VPOTRAUMABURNS", "VPO_NA", "VPO_UK", "RESPIRATORYASSISTANCE", "SUPPLEMENTALOXYGEN",
    "GCSQ_SEDATEDPARALYZED", "GCSQ_EYEOBSTRUCTION", "GCSQ_INTUBATED", "GCSQ_VALID", "GCSQ_NA",
    "GCSQ_UK", "DRGSCR_AMPHETAMINE", "DRGSCR_BARBITURATE", "DRGSCR_BENZODIAZEPINES",
    "DRGSCR_COCAINE", "DRGSCR_METHAMPHETAMINE", "DRGSCR_ECSTASY", "DRGSCR_METHADONE",
    "DRGSCR_OPIOID", "DRGSCR_OXYCODONE", "DRGSCR_PHENCYCLIDINE", "DRGSCR_TRICYCLICDEPRESS",
    "DRGSCR_CANNABINOID", "DRGSCR_OTHER", "DRGSCR_NONE", "DRGSCR_NOTTESTED", "DRGSCR_UK",
    "DRGSCR_NA", "ALCOHOLSCREEN", "EDDISCHARGEDISPOSITION", "DEATHINED", "TBIPUPILLARYRESPONSE",
    "TBIMIDLINESHIFT", "PMGCSQ_SEDATEDPARALYZED", "PMGCSQ_EYEOBSTRUCTION", "PMGCSQ_INTUBATED",
    "PMGCSQ_VALID", "PMGCSQ_NA", "PMGCSQ_UK", "ICPEVDRAIN", "ICPPARENCH", "ICPO2MONITOR",
    "ICPJVBULB", "ICPNONE", "ICP_NA", "ICP_UK", "BLOODBINARY", "PLASMABINARY", "PLATELETSBINARY",
    "CRYOBINARY", "ESLIVER", "ESSPLEEN", "ESKIDNEY", "ESPELVIS", "ESRETROPERI",
    "ESVASCULAR", "ESAORTA", "ESOTHER", "ES_UK", "ES_NA", "PRIMARYMETHODPAYMENT",
    "CC_ADHD", "CC_ADLC", "CC_ALCOHOLISM", "CC_ANGINAPECTORIS",
    "CC_ANTICOAGULANT", "CC_BLEEDING", "CC_CHEMO", "CC_CIRRHOSIS", "CC_CONGENITAL", "CC_COPD",
    "CC_CVA", "CC_DEMENTIA", "CC_DIABETES", "CC_DISCANCER", "CC_FUNCTIONAL", "CC_CHF",
    "CC_HYPERTENSION", "CC_MI", "CC_PAD", "CC_PREMATURITY", "CC_MENTALPERSONALITY", "CC_RENAL",
    "CC_SMOKING", "CC_STEROID", "CC_SUBSTANCEABUSE", "IntracranialVascularInjury", "BrainStemInjury",
    "EDH", "SAH", "SDH", "SkullFx", "DAI", "NeckVascularInjury", "ThoracicVascularInjury",
    "AeroDigestiveInjury", "CardiacInjury", "LungInjury", "AbdominalVascular", "RibFx",
    "KidneyInjury", "StomachInjury", "SpleenInjury", "UroGenInternalInjury", "SCI", "SpineFx",
    "UEAmputation", "UEVascularInjury", "UELongBoneFx", "LEVascularInjury", "PelvicFx",
    "LEAmputation", "PancreasInjury", "LELongBoneFx", "LiverInjury", "ColorectalInjury",
    "SmallBowelInjury", "isolatedTBI", "missingGCS", "missingAge", "missingSex", "missingType",
    "missingSBP", "missingHR", "missingRR", "missingPulseOx", "missingHeight", "missingWeight",
    "missingEDDispo", "missingRBC", "missingPlasma", "Penetrating",
    'VTEPPXStartOver48', 'VTEPPXStartOver24', 'ICUOver48', 'ICUOver24', 'VentOver48', 'VentOver24',
    'VTEPPXStartOver72', 'VTEPPXStartOver96', 'ICUOver72', 'ICUOver96', 'VentOver72', 'VentOver96', 'VTEPROPHYLAXISTYPE'
]

# Continuous variables that must be integers.
# Every name here also appears in continuous_vars; the list is narrowed to the columns
# present in the feature matrix (as `int_mice`) in a later cell.
int_constrained_vars = [
    "AGEYEARS", "EMSPULSERATE", "EMSRESPIRATORYRATE", "EMSTOTALGCS", "EMSDISPATCHDAYS",
    "EMSSCENEDAYS", "EMSDAYS", "SBP", "PULSERATE", "RESPIRATORYRATE", "PULSEOXIMETRY",
    "TOTALGCS", "TBIHIGHESTTOTALGCS", "TBIGCSMOTOR", "BLOODUNITS", "PLASMAUNITS",
    "NumberOfInjuries", "mFI"
]

# Pairs of (days column, hours column) names.
# NOTE(review): presumably each pair describes the same interval in two units and is
# consumed by the imputation code that lives outside this notebook section - TODO confirm.
time_pairs = [
    ("EMSSCENEDAYS", "EMSSCENEHRS"),
    ("EMSDAYS", "EMSHRS"),
]

In [None]:
##verify the shape of the feature matrix and the lengths of the two variable lists

print(X_data_new.shape)
for _names in (continuous_vars, categorical_vars):
    print(len(_names))

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 1) Work on a copy of the feature matrix X_data_new; the outcome comes from Y_clean.
# ───────────────────────────────────────────────────────────────────────────────

X_full = X_data_new.copy()
# Y_clean is a one-column DataFrame here; its first column is taken as a Series.
Y_full = Y_clean.iloc[:, 0]

# 2) Keep only the listed variable names that are actual columns of X_full.
present_columns = X_full.columns
categorical_mice = [name for name in categorical_vars if name in present_columns]
continuous_mice = [name for name in continuous_vars if name in present_columns]
int_mice = [name for name in int_constrained_vars if name in present_columns]

# 3) Turn the categorical columns into strings, mapping any "nan" text (both the
#    pre-existing kind and the kind produced by the string cast) back to real np.nan.
categorical_text = X_full[categorical_mice].replace("nan", np.nan).astype(str)
X_full[categorical_mice] = categorical_text.replace("nan", np.nan)

# 4) Ordinal-encode every categorical column in one go. The encoder is fitted on the
#    full data before splitting, so train/cal/test share the same mapping.
ordinal_encoder_mice = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_full[categorical_mice] = ordinal_encoder_mice.fit_transform(X_full[categorical_mice])

# 5) First split: full -> (train_raw, test_raw), stratified on the binary outcome so
#    both parts keep a similar outcome balance.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X_full, Y_full, stratify=Y_full, test_size=0.20, random_state=0
)

##sanity check to ensure appropriate dataframe shape
print(" Completed train/test split:")
print(f"   X_train_raw: {X_train_raw.shape}, X_test_raw: {X_test_raw.shape}")
print(f"   Y_train: {Y_train.shape},      Y_test: {Y_test.shape}")

# 6) Second split: train_raw -> (train_for_impute_raw, calibration_raw), again stratified.
#    The calibration part is held back for calibrating the model later.
X_train_for_impute_raw, X_cal_raw, Y_train_for_impute, Y_cal = train_test_split(
    X_train_raw, Y_train, stratify=Y_train, test_size=0.20, random_state=0
)

##additional sanity check to ensure appropriate dataframe shape
print("\n Split train_raw into train_for_impute and calibration:")
print(f"   X_train_for_impute_raw: {X_train_for_impute_raw.shape}, Y_train_for_impute: {Y_train_for_impute.shape}")
print(f"   X_cal_raw: {X_cal_raw.shape},            Y_cal: {Y_cal.shape}")


In [None]:
#Load the imputed datasets that were generated earlier and saved as pickle files.
#NOTE: unpickling executes code stored in the file, so only load files you created yourself.

#folder holding the saved imputed datasets
# replace <YOUR_SAVE_DIR> with wherever your files are stored
save_dir = '<YOUR_SAVE_DIR>'

## File stems of the 3 imputed datasets for each of the train, cal and test sets
train_names = ["nofac_train_seed100_96", "nofac_train_seed200_96", "nofac_train_seed300_96"]
cal_names   = ["nofac_cal_seed100_96",   "nofac_cal_seed200_96",   "nofac_cal_seed300_96"]
test_names  = ["nofac_test_seed100_96",  "nofac_test_seed200_96",  "nofac_test_seed300_96"]

# Read each group of files (train first, then cal, then test) into the list names
# that the rest of the pipeline expects.
imputed_X_train_list, imputed_X_cal_list, imputed_X_test_list = (
    [pd.read_pickle(f"{save_dir}/{stem}.pkl") for stem in stems]
    for stems in (train_names, cal_names, test_names)
)

print("Reloaded imputed_X_train_list, imputed_X_cal_list, and imputed_X_test_list")


In [None]:
##now we generate descriptive statistics for pre- and  post-imputation to ensure
##similar distribution of data

# Lists of imputed variables (must already exist in your notebook)
# continuous_mice = [ ... ]  # list of continuous columns that were imputed
# categorical_mice = [ ... ] # list of categorical columns that were imputed

# -----------------------------------------------------------------------------
# Lets define a helper function to summarize continuous variables (mean, median, STD, IQR)
# -----------------------------------------------------------------------------
def summarise_continuous(df, vars_list):
    """Summarise continuous columns of ``df``, ignoring missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the columns to summarise.
    vars_list : iterable of str
        Names of the columns to summarise. May be empty.

    Returns
    -------
    pandas.DataFrame
        One row per variable (index named ``'variable'``) with the columns
        ``'mean'``, ``'median'``, ``'std'`` and ``'IQR'`` (75th minus 25th percentile).
        An empty ``vars_list`` gives an empty frame with those same columns.

    Raises
    ------
    KeyError
        If a name in ``vars_list`` is not a column of ``df``.
    """
    stats = []
    for var in vars_list:
        s = df[var].dropna()
        q1 = s.quantile(0.25)
        q3 = s.quantile(0.75)
        stats.append({
            'variable': var,
            'mean': s.mean(),
            'median': s.median(),
            'std': s.std(),
            'IQR': q3 - q1
        })
    # Naming the columns explicitly keeps `set_index` valid when `stats` is empty;
    # without it an empty `vars_list` raised KeyError('variable').
    return pd.DataFrame(
        stats, columns=['variable', 'mean', 'median', 'std', 'IQR']
    ).set_index('variable')

# -----------------------------------------------------------------------------
# And same for categorical (mode, and % of dataset with that mode value)
# -----------------------------------------------------------------------------
def summarise_categorical(df, vars_list):
    """Describe categorical columns of ``df`` by their most frequent level.

    Missing values are ignored. The result is a dict keyed by variable name; each
    value is a dict with:

    * ``'mode'`` - the first mode of the column (``np.nan`` if there is none),
    * ``'mode_proportion'`` - share of non-missing values equal to the mode,
    * ``'levels_proportions'`` - ``{level: share of non-missing values}``.
    """
    def _describe(column):
        # Share of each observed level among the non-missing values.
        level_counts = column.value_counts(dropna=True)
        level_shares = level_counts / level_counts.sum()

        modes = column.mode(dropna=True)
        top_level = np.nan if modes.empty else modes.iloc[0]

        return {
            'mode': top_level,
            'mode_proportion': level_shares.get(top_level, np.nan),
            'levels_proportions': level_shares.to_dict()
        }

    return {var: _describe(df[var]) for var in vars_list}

# -----------------------------------------------------------------------------
# 1) Pre-imputation summaries on TRAIN_FOR_IMPUTE raw
# -----------------------------------------------------------------------------
# Summaries are taken on a copy of the raw (not yet imputed) training split.
pre_train = X_train_for_impute_raw.copy()

cont_pre_summary, cat_pre_summary = (
    summarise_continuous(pre_train, continuous_mice),
    summarise_categorical(pre_train, categorical_mice),
)

# -----------------------------------------------------------------------------
# 2) Post-imputation summaries (each of the 3 imputations on TRAIN_FOR_IMPUTE)
# -----------------------------------------------------------------------------
cont_post_list = []
cat_post_list = []

for i, df_imp in enumerate(imputed_X_train_list):
    # 2a) Continuous stats for imputation i; columns get an "_imp{i}" suffix so the
    #     imputations can sit side by side after concatenation.
    cont_stats = summarise_continuous(df_imp, continuous_mice).rename(
        columns=lambda c: f"{c}_imp{i}"
    )
    cont_post_list.append(cont_stats)

    # 2b) Categorical stats (mode + mode proportion) for imputation i.
    #     The shared helper is reused so the mode is computed once per variable and an
    #     empty mode yields NaN; the original inline version called `.iloc[0]` on the
    #     mode without the empty-check when computing the proportion.
    cat_info = summarise_categorical(df_imp, categorical_mice)
    cat_stats = pd.DataFrame(
        [
            {
                'variable': var,
                'mode_imp': info['mode'],
                'mode_prop_imp': info['mode_proportion']
            }
            for var, info in cat_info.items()
        ],
        # Explicit columns keep `set_index` valid when there are no categorical variables.
        columns=['variable', 'mode_imp', 'mode_prop_imp']
    ).set_index('variable').rename(columns=lambda c: f"{c}_{i}")
    cat_post_list.append(cat_stats)

# 2c) Combine continuous stats across imputations (one column group per imputation)
cont_post_summary = pd.concat(cont_post_list, axis=1)

# 2d) Combine categorical stats across imputations
cat_post_summary = pd.concat(cat_post_list, axis=1)

# -----------------------------------------------------------------------------
# 3) Display pre- vs post-imputation summaries for continuous
# -----------------------------------------------------------------------------
# Continuous variables: pre-imputation table followed by the post-imputation table.
for _title, _table in (
    ("=== Continuous variables pre-imputation (TRAIN_FOR_IMPUTE) ===", cont_pre_summary),
    ("=== Continuous variables post-imputation (TRAIN_FOR_IMPUTE) ===", cont_post_summary),
):
    print(_title)
    display(_table)

# -----------------------------------------------------------------------------
# 4) Display pre- vs post-imputation summaries for categorical
# -----------------------------------------------------------------------------
print("=== Categorical variables pre-imputation (TRAIN_FOR_IMPUTE) ===")
for var, info in cat_pre_summary.items():
    top_level = info['mode']
    top_share = info['mode_proportion']
    shares = info['levels_proportions']
    print(f"{var}: mode={top_level}, mode_proportion={top_share:.3f}, level_proportions={shares}")

print("\n=== Categorical variables post-imputation (TRAIN_FOR_IMPUTE) ===")
display(cat_post_summary)


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Sanity‐check A: For each imputed TRAIN/CAL/TEST, confirm categorical dtypes & cardinalities
# ──────────────────────────────────────────────────────────────────────────────

# The same report (cardinalities, then dtypes, of the categorical columns) is printed
# for each split of each imputation.
_imputed_views = (
    ("TRAIN_FOR_IMPUTE", imputed_X_train_list),
    ("CALIBRATION", imputed_X_cal_list),
    ("TEST", imputed_X_test_list),
)

for i in range(len(imputed_X_train_list)):
    for _split, _frames in _imputed_views:
        print(f"\n--- Imputation {i} ({_split}) ---")
        _cat_block = _frames[i][categorical_mice]

        # 1) Number of distinct values per categorical column, largest first
        print(f"  {_split} cardinalities:\n", _cat_block.nunique().sort_values(ascending=False))

        # 2) dtype of each categorical column
        print(f"  {_split} dtypes:\n", _cat_block.dtypes)

    print("─" * 60)


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Sanity‐check B: Ensure zero missing values in TRAIN_FOR_IMPUTE / CAL / TEST
# ──────────────────────────────────────────────────────────────────────────────

for i in range(len(imputed_X_train_list)):
    # Total number of missing cells in the train, calibration and test frames of imputation i.
    n_missing_train, n_missing_cal, n_missing_test = (
        _frames[i].isna().sum().sum()
        for _frames in (imputed_X_train_list, imputed_X_cal_list, imputed_X_test_list)
    )

    print(f"Imputation {i}:")
    print(f"  TRAIN_FOR_IMPUTE missing count = {n_missing_train}")
    print(f"  CALIBRATION missing count   = {n_missing_cal}")
    print(f"  TEST missing count          = {n_missing_test}")
    print("─" * 60)


In [None]:
##now we'll one-hot encode all categorical variables

##define lists to store each imputed dataset
# One list of encoded frames per split, filled in imputation order.
encoded_X_train_list, encoded_X_cal_list, encoded_X_test_list = [], [], []

for i in range(len(imputed_X_train_list)):
    df_train_imp = imputed_X_train_list[i].copy()
    df_cal_imp   = imputed_X_cal_list[i].copy()
    df_test_imp  = imputed_X_test_list[i].copy()
    _parts = (df_train_imp, df_cal_imp, df_test_imp)

    # 1) Every column that is category/object typed in at least one of the three splits
    categorical_columns = sorted(
        set().union(
            *(part.select_dtypes(include=["category", "object"]).columns.tolist() for part in _parts)
        )
    )
    print(f"Imputation {i}: found {len(categorical_columns)} categorical columns to encode.")

    # 2) Stack train + cal + test so each column is dummy-encoded once, over all its levels.
    df_combined = pd.concat([df_train_imp, df_cal_imp, df_test_imp], axis=0)

    # 3) Replace each categorical column by its dummy columns, one column at a time.
    df_combined_enc = df_combined.copy()
    for col in categorical_columns:
        column = df_combined_enc[col]

        # Levels of the column, NaN excluded: declared categories for categorical dtype,
        # otherwise the sorted observed values.
        if isinstance(column.dtype, pd.CategoricalDtype):
            levels = list(column.cat.categories)
        else:
            levels = sorted(column.dropna().unique())
        levels = [lvl for lvl in levels if pd.notna(lvl)]

        without_col = df_combined_enc.drop(columns=[col])

        # A column with at most one level is simply dropped.
        if len(levels) <= 1:
            df_combined_enc = without_col
            continue

        # Binary columns keep a single dummy (drop_first=True);
        # multi-level columns keep the full set of dummies (drop_first=False).
        dummies = pd.get_dummies(column, prefix=col, drop_first=(len(levels) == 2))
        df_combined_enc = pd.concat([without_col, dummies], axis=1)

    # 4) Cut the encoded table back into the three splits using their original row labels
    df_train_enc_i, df_cal_enc_i, df_test_enc_i = (
        df_combined_enc.loc[part.index].copy() for part in _parts
    )

    # 5) The three splits must expose exactly the same set of columns
    cols_train = set(df_train_enc_i.columns)
    cols_cal   = set(df_cal_enc_i.columns)
    cols_test  = set(df_test_enc_i.columns)
    if cols_train != cols_cal or cols_cal != cols_test:
        raise ValueError(f"❌ Column mismatch in imputation {i} after dummy encoding!")
    print(f"Imputation {i}: train/cal/test columns match (n_cols={len(cols_train)})")

    encoded_X_train_list.append(df_train_enc_i)
    encoded_X_cal_list.append(df_cal_enc_i)
    encoded_X_test_list.append(df_test_enc_i)

print("Completed one‐hot encoding for all 3 imputed sets.")


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Sanity‐check C (revised): Check no missing values and preview encoded_X_train_list / encoded_X_test_list
# ──────────────────────────────────────────────────────────────────────────────

pd.set_option("display.max_columns", None)

# The encoded TRAIN_FOR_IMPUTE and TEST frames get the same two treatments below.
_encoded_views = (
    ("TRAIN_FOR_IMPUTE", encoded_X_train_list),
    ("TEST", encoded_X_test_list),
)

# Missing-value check: one status line per encoded frame.
for _pos, (_split, _frames) in enumerate(_encoded_views):
    _lead = "" if _pos == 0 else "\n"
    print(f"{_lead}Checking for missing values in each one‐hot–encoded {_split}:")
    for i, X_enc in enumerate(_frames):
        total_na = X_enc.isnull().sum().sum()
        status = "No missing" if total_na == 0 else f"{total_na} missing"
        print(f"  Encoded {_split} {i}: {status}")

# ──────────────────────────────────────────────────────────────────────────────
# Preview a few rows from each encoded DataFrame
# ──────────────────────────────────────────────────────────────────────────────

for _split, _frames in _encoded_views:
    for i, X_enc in enumerate(_frames):
        print(f"\nEncoded {_split} {i} (preview):")
        display(X_enc.head())
        print("─" * 80)


In [None]:
##now ensure that the Y df indices match the X df indicies, so that each entry has the correct outcome

for i in range(len(encoded_X_train_list)):
    Xtr = encoded_X_train_list[i]
    Xcal = encoded_X_cal_list[i]
    Xte = encoded_X_test_list[i]

    # Each encoded X frame must carry exactly the same row index (same labels, same
    # order) as the outcome Series of its split. A mismatch is reported, not raised.
    _pairs = (
        ("X_train_enc", Xtr, "Y_train_for_impute", Y_train_for_impute),
        ("X_cal_enc", Xcal, "Y_cal", Y_cal),
        ("X_test_enc", Xte, "Y_test", Y_test),
    )
    for _x_name, _x_frame, _y_name, _y_values in _pairs:
        if _x_frame.index.equals(_y_values.index):
            print(f"Imputation {i}: {_x_name} index matches {_y_name}.")
        else:
            print(f"Imputation {i}: index mismatch between {_x_name} and {_y_name}!")

    print(f"   Shapes: X_train_enc_{i}={Xtr.shape}, X_cal_enc_{i}={Xcal.shape}, X_test_enc_{i}={Xte.shape}")
    print(f"           Y_train_for_impute={Y_train_for_impute.shape}, Y_cal={Y_cal.shape}, Y_test={Y_test.shape}\n")


In [None]:
#### Scale all continuous variables based on X_train

X_train_s_list, X_cal_s_list, X_test_s_list = [], [], []

for i in range(len(encoded_X_train_list)):
    Xtr = encoded_X_train_list[i].copy()
    Xcal = encoded_X_cal_list[i].copy()
    Xte = encoded_X_test_list[i].copy()

    # Continuous columns = names from continuous_vars that exist in the training frame
    continuous_cols = [name for name in continuous_vars if name in Xtr.columns]

    if continuous_cols:
        _splits = (Xtr, Xcal, Xte)

        # a) Cast to float64 first so the scaled values can be written back without a dtype clash
        for _part in _splits:
            _part[continuous_cols] = _part[continuous_cols].astype(np.float64)

        # b) Fit the scaler on the training split only
        scaler = StandardScaler()
        scaler.fit(Xtr[continuous_cols])

        # c) + d) Transform every split with the training statistics and write the values back
        for _part in _splits:
            _part.loc[:, continuous_cols] = scaler.transform(_part[continuous_cols])

    X_train_s_list.append(Xtr)
    X_cal_s_list.append(Xcal)
    X_test_s_list.append(Xte)
    print(f"Imputation {i}: scaled continuous cols.   (X_train_s_{i}.shape={Xtr.shape})")

print("All imputed train/cal/test sets are now scaled.")


In [None]:
##ensure DFs still have appropriate shape
# Visual sanity check only: prints the shape of each scaled split next to the
# shape of its label vector so the row counts can be compared by eye.
# Nothing is asserted here; a mismatch does not stop the notebook.

for i in range(len(X_train_s_list)):
    print(f"--- Imputation {i} ---")
    print(f"X_train_s_{i}.shape = {X_train_s_list[i].shape}   (Y_train_for_impute.shape = {Y_train_for_impute.shape})")
    print(f"X_cal_s_{i}.shape   = {X_cal_s_list[i].shape}   (Y_cal.shape = {Y_cal.shape})")
    print(f"X_test_s_{i}.shape  = {X_test_s_list[i].shape}   (Y_test.shape = {Y_test.shape})\n")


In [None]:
# ──────────────────────────────────────────────────────────────
# Step 1: Make “safe” copies of your TRAIN and TEST data for XGBoost
# ──────────────────────────────────────────────────────────────
# Inputs: X_train_s_list / X_test_s_list (one scaled DataFrame per imputation)
# and the label Series Y_train_for_impute / Y_test, which are shared by all
# imputations.

# Create safe‐named versions for each imputation. Columns are renamed by
# position (feat_0, feat_1, …); column order is preserved, so feat_j always
# refers to the j-th column of the corresponding scaled frame.
X_train_s_list_xgb = [
    df.copy().set_axis([f"feat_{j}" for j in range(df.shape[1])], axis=1)
    for df in X_train_s_list
]
X_test_s_list_xgb = [
    df.copy().set_axis([f"feat_{j}" for j in range(df.shape[1])], axis=1)
    for df in X_test_s_list
]

# Build lists of numpy label arrays (one per imputation). Every entry refers to
# the same labels; the lists only exist so labels can be indexed like the
# feature lists.
Y_train_list = [Y_train_for_impute.values for _ in X_train_s_list_xgb]
Y_test_list  = [Y_test.values for _ in X_test_s_list_xgb]

# ──────────────────────────────────────────────────────────────
# Step 2: Specify your fixed hyperparameters from previous files
# ──────────────────────────────────────────────────────────────
fixed_params = {
    "learning_rate": 0.05,
    "max_depth": 7,
    "n_estimators": 200,
    "colsample_bytree": 0.8,
    "subsample": 0.8
}

# ──────────────────────────────────────────────────────────────
# Step 3: Train on TRAIN, evaluate on TEST, and keep the fitted model
#         for each imputation
# ──────────────────────────────────────────────────────────────
# Each model is fitted exactly once and both scored and stored. A second
# training pass with the same data, hyperparameters and random_state would only
# rebuild the same model at twice the cost.
test_auc_scores = []
final_models = []

for i, (X_tr, X_te) in enumerate(zip(X_train_s_list_xgb, X_test_s_list_xgb)):
    y_tr = Y_train_list[i]
    y_te = Y_test_list[i]

    # Initialize and fit the XGBoost model
    model = xgb.XGBClassifier(
        **fixed_params,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=0
    )
    model.fit(X_tr, y_tr)
    final_models.append(model)

    # Predict (uncalibrated) probabilities of the positive class on the test set
    y_proba = model.predict_proba(X_te)[:, 1]

    # Compute ROC AUC on this imputation’s test set
    auc = roc_auc_score(y_te, y_proba)
    test_auc_scores.append(auc)
    print(f"Imputation {i} — Test ROC AUC: {auc:.4f}")

# Average AUC across imputations
mean_test_auc = float(np.mean(test_auc_scores))
print(f"\n>>> Mean Test ROC AUC (averaged over imputations): {mean_test_auc:.4f}")

# `final_models` now holds one trained XGBClassifier per imputation, in the same
# order as X_train_s_list_xgb.


In [None]:
##now calibrate all models on the calibration set
# ──────────────────────────────────────────────────────────────
# Uses:
# - final_models : list of already-fitted XGBClassifier instances
# - X_cal_s_list : list of scaled calibration DataFrames, one per imputation
# - Y_cal        : label Series for the calibration split
# Produces:
# - X_cal_s_list_xgb / Y_cal_list : safe-named calibration frames and labels
# - calibrated_models : one CalibratedClassifierCV per imputation
# ──────────────────────────────────────────────────────────────

##make safe copies again, prevent anything from being overridden
# Columns are renamed by position to feat_0, feat_1, …
X_cal_s_list_xgb = [
    df.copy().set_axis([f"feat_{j}" for j in range(df.shape[1])], axis=1)
    for df in X_cal_s_list
]
# Build lists of numpy label arrays (one per imputation)
Y_cal_list = [Y_cal.values for _ in X_cal_s_list_xgb]

calibrated_models = []
val_auc_scores = []

for i, base_model in enumerate(final_models):
    # `X_val` / `y_val` hold the CALIBRATION split of imputation i
    X_val = X_cal_s_list_xgb[i]
    y_val = Y_cal_list[i]

    # Create a calibrated wrapper around the pre-trained model.
    # cv="prefit" is passed so the wrapper works with the already-fitted
    # base model; an isotonic mapping is requested via method="isotonic".
    cal_model = CalibratedClassifierCV(
        estimator=base_model,
        method="isotonic",
        cv="prefit"
    )
    cal_model.fit(X_val, y_val)
    calibrated_models.append(cal_model)

    # ROC AUC of the calibrated model on the calibration split itself.
    # NOTE: this is the same data the calibrator was just fitted on, so the
    # value is an in-sample figure, not a held-out estimate.
    y_val_proba = cal_model.predict_proba(X_val)[:, 1]
    auc_val = roc_auc_score(y_val, y_val_proba)
    val_auc_scores.append(auc_val)
    print(f"Imputation {i} — Validation ROC AUC (calibrated): {auc_val:.4f}")

mean_val_auc = float(np.mean(val_auc_scores))
print(f"\n>>> Mean Validation ROC AUC (averaged over imputations): {mean_val_auc:.4f}")


In [None]:
##generate reliability diagrams for calibration with 95% CI
# ──────────────────────────────────────────────────────────────
# Assume the following are already defined:
# - calibrated_models: list of isotonic‐calibrated CalibratedClassifierCV wrappers (one per imputation)
# - X_test_s_list_xgb: list of “safe”-named test DataFrames (one per imputation)
# - Y_test_list: list of numpy arrays of test labels (one per imputation)
# ──────────────────────────────────────────────────────────────


##define the function used to generate the curves
def bootstrap_calibration_curve(y_true, y_prob, n_bins=10, n_boot=1000, random_state=None):
    """
    Binned calibration curve with percentile-bootstrap 95% confidence bands.

    1) Compute the original bin-based calibration curve on equal-width bins
       over [0, 1].
    2) Bootstrap the dataset n_boot times (rows resampled with replacement),
       each time recalculating the fraction of positives per bin.
    3) Return the original curve + 95% CI per bin (2.5 and 97.5 percentiles).

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Observed binary outcomes (0/1).
    y_prob : array-like of shape (n,)
        Predicted probabilities of the positive class.
    n_bins : int
        Number of equal-width bins on [0, 1].
    n_boot : int
        Number of bootstrap resamples.
    random_state : int or None
        Seed for the bootstrap resampling. Any integer, including 0, yields a
        reproducible result. None falls back to numpy's global random state.

    Returns
    -------
    prob_pred_orig, prob_true_orig, lower_ci, upper_ci : np.ndarray
        Mean predicted probability, observed fraction of positives, and the
        lower/upper CI bounds. Each has one entry per bin that is non-empty in
        the original data, in ascending bin order.

    Raises
    ------
    ValueError
        If y_true and y_prob differ in length.
    """
    # Accept lists / pandas Series: everything below relies on positional
    # (mask and integer-array) indexing.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    if y_true.shape[0] != y_prob.shape[0]:
        raise ValueError("y_true and y_prob must have the same length")

    n_data = len(y_true)
    if n_data == 0:
        # Nothing to bin or resample.
        empty = np.array([], dtype=float)
        return empty, empty.copy(), empty.copy(), empty.copy()

    # Define bin edges (equally spaced from 0 to 1)
    bin_edges = np.linspace(0, 1, n_bins + 1)

    def _bin_ids(probs):
        # Zero-based bin index per observation; values on or beyond the last
        # edge (e.g. exactly 1.0) are folded into the last bin.
        ids = np.digitize(probs, bin_edges) - 1
        return np.minimum(ids, n_bins - 1)

    def _mean_per_bin(values, ids):
        # Mean of `values` in each bin; NaN marks an empty bin.
        means = np.full(n_bins, np.nan)
        for k in range(n_bins):
            in_bin = (ids == k)
            if in_bin.any():
                means[k] = np.mean(values[in_bin])
        return means

    # Original (non-resampled) curve
    bin_indices = _bin_ids(y_prob)
    prob_pred_orig = _mean_per_bin(y_prob, bin_indices)   # mean predicted prob per bin
    prob_true_orig = _mean_per_bin(y_true, bin_indices)   # fraction of positives per bin

    # Remove empty bins (NaN) from the original arrays
    valid_mask = ~np.isnan(prob_pred_orig)
    prob_pred_orig = prob_pred_orig[valid_mask]
    prob_true_orig = prob_true_orig[valid_mask]

    # -----------------------------
    # Bootstrap to get CIs
    # -----------------------------
    # `is not None` (rather than truthiness) so that a seed of 0 is honoured.
    rng = np.random.RandomState(random_state) if random_state is not None else np.random

    # Fraction of positives for each retained bin in each bootstrap sample
    boot_prob_true = np.empty((n_boot, int(valid_mask.sum())))

    for b in range(n_boot):
        sample_indices = rng.randint(0, n_data, size=n_data)
        ids_b = _bin_ids(y_prob[sample_indices])
        # A bin that is empty in this resample stays NaN and is ignored by
        # nanpercentile below.
        boot_prob_true[b, :] = _mean_per_bin(y_true[sample_indices], ids_b)[valid_mask]

    # Compute 2.5th and 97.5th percentile per bin
    lower_ci = np.nanpercentile(boot_prob_true,  2.5, axis=0)
    upper_ci = np.nanpercentile(boot_prob_true, 97.5, axis=0)

    return prob_pred_orig, prob_true_orig, lower_ci, upper_ci


# ──────────────────────────────────────────────────────────────
# STEP 1: Calibrated test-set probabilities, one row per imputation
# ──────────────────────────────────────────────────────────────
# Resulting array has shape (n_imputations, n_test_rows).
all_y_probs = np.vstack([
    cal_model.predict_proba(X_test_s_list_xgb[i])[:, 1]
    for i, cal_model in enumerate(calibrated_models)
])

# STEP 2: Pool per test row — average each column over the imputations
# -----------------------------------------------------------------------
y_prob_pooled = np.mean(all_y_probs, axis=0)

# STEP 3: One true-label array for the test rows
# -----------------------------------------------------------------------
# Every entry of Y_test_list is built from the same Y_test, so the first one
# is used.
y_true = Y_test_list[0]


# ──────────────────────────────────────────────────────────────
# STEP 4: Calibration curve + bootstrap CIs on the pooled probabilities
# ──────────────────────────────────────────────────────────────
prob_pred_orig, prob_true_orig, lower_ci, upper_ci = bootstrap_calibration_curve(
    y_true,
    y_prob_pooled,
    n_bins=10,
    n_boot=1000,
    random_state=42,
)

# STEP 5: Brier score of the pooled probabilities
# -----------------------------------------------------------------------
brier_score = brier_score_loss(y_true, y_prob_pooled)
print(f"Brier Score (on pooled probs): {brier_score:.4f}")


# ──────────────────────────────────────────────────────────────
# STEP 6: Reliability diagram with 95% CI band (pooled probabilities)
# ──────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(8, 6))

# Diagonal = perfect calibration
ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

# Order the bins by mean predicted probability so the curve runs left to right
sort_idx = np.argsort(prob_pred_orig)
x_sorted, y_sorted, lower_sorted, upper_sorted = (
    per_bin[sort_idx]
    for per_bin in (prob_pred_orig, prob_true_orig, lower_ci, upper_ci)
)

# Observed curve and its 95% CI band
ax.plot(x_sorted, y_sorted, marker='o', color='b', label='Calibration Curve')
ax.fill_between(x_sorted, lower_sorted, upper_sorted,
                color='b', alpha=0.2, label='95% CI')

# Labels, limits, and display
ax.set_xlabel('Mean Predicted Probability (pooled)')
ax.set_ylabel('Fraction of Positives')
ax.set_title('Reliability Diagram with 95% CI (Pooled Across Imputations)')
ax.legend(loc='best')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
fig.tight_layout()
plt.show()


In [None]:
##now we'll generate ROC curve, PR curve, decision curve, and secondary metrics
##for models based on each imputation individually.

##secondary metrics will be computed at 2 different thresholds: Max F1 and 90% spec

# ----------------------------------------------------------------------
# 0) Helper: compute confusion‐matrix metrics at one threshold
# ----------------------------------------------------------------------
def compute_metrics_from_probs(y_true, y_prob, threshold):
    """
    Threshold-dependent classification metrics for a binary outcome.

    A row is predicted positive when ``y_prob >= threshold``. The four
    confusion-matrix cells are counted directly with boolean masks, so the
    function also works when only one class occurs among the labels and
    predictions (a case in which a label-inferred confusion matrix is not 2x2
    and cannot be unpacked into four cells).

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Binary labels; 1 is the positive class, anything else counts as negative.
    y_prob : array-like of shape (n,)
        Predicted probability of the positive class.
    threshold : float
        Decision threshold applied to ``y_prob``.

    Returns
    -------
    dict
        Keys "Accuracy", "Sensitivity (TPR)", "Specificity (TNR)",
        "Precision (PPV)", "Negative Predictive Value (NPV)" and "F1 Score".
        A metric whose denominator is zero is reported as 0.0.
    """
    actual_is_pos = (np.asarray(y_true) == 1)
    pred_is_pos = (np.asarray(y_prob) >= threshold)

    tp = int(np.sum(actual_is_pos & pred_is_pos))
    fp = int(np.sum(~actual_is_pos & pred_is_pos))
    fn = int(np.sum(actual_is_pos & ~pred_is_pos))
    tn = int(np.sum(~actual_is_pos & ~pred_is_pos))

    actual_pos = tp + fn
    actual_neg = tn + fp
    pred_pos   = tp + fp
    pred_neg   = tn + fn
    total      = actual_pos + actual_neg

    accuracy      = (tp + tn) / total if total else 0.0
    sensitivity   = tp / actual_pos if actual_pos else 0.0         # TPR
    specificity   = tn / actual_neg if actual_neg else 0.0         # TNR
    precision     = tp / pred_pos if pred_pos else 0.0              # PPV
    npv           = tn / pred_neg if pred_neg else 0.0              # NPV
    # Binary F1 for the positive class: 2*TP / (2*TP + FP + FN)
    f1            = (2 * tp) / (2 * tp + fp + fn) if (tp + fp + fn) else 0.0

    return {
        "Accuracy": accuracy,
        "Sensitivity (TPR)": sensitivity,
        "Specificity (TNR)": specificity,
        "Precision (PPV)": precision,
        "Negative Predictive Value (NPV)": npv,
        "F1 Score": f1
    }

# ----------------------------------------------------------------------
# 1) Gather test labels and calibrated models from the refactored pipeline
# ----------------------------------------------------------------------
# calibrated_models: list of CalibratedClassifierCV instances, one per imputation
# X_test_s_list_xgb : list of “safe”-named test DataFrames (one per imputation)
# X_test_s_list     : list of original (un-renamed) test DataFrames
# Y_test_list       : list of numpy arrays (or Series) of test labels, one per imputation

# Extract calibrated estimators directly
calibrators = calibrated_models  # list of CalibratedClassifierCV

# Build safe_names_list from the “safe”-named test DataFrames
# Each X_test_s_list_xgb[i] has columns already renamed to ["feat_0", "feat_1", …].
safe_names_list = [df.columns.tolist() for df in X_test_s_list_xgb]


def net_benefit(y_t, y_p, thresholds):
    """
    Net benefit of the model at each threshold probability (decision curve).

    For threshold t the value is TP/N - FP/N * t/(1 - t), where a row counts as
    predicted positive when y_p >= t. At t == 1.0 the odds term is undefined
    and 0 is returned instead. Returns a list with one value per threshold.
    """
    N  = len(y_t)
    NB = []
    for t in thresholds:
        y_pred_t = (y_p >= t).astype(int)
        TP = np.sum((y_t == 1) & (y_pred_t == 1))
        FP = np.sum((y_t == 0) & (y_pred_t == 1))
        if t == 1.0:
            NB.append(0)
        else:
            NB.append((TP / N) - (FP / N) * (t / (1 - t)))
    return NB


# ----------------------------------------------------------------------
# 2) Loop over each imputation, compute metrics & plot curves
# ----------------------------------------------------------------------
for i in range(len(X_test_s_list)):
    print(f"\n\n===== Imputation {i} Metrics & Curves =====")

    # a) Grab truth and calibrated probabilities for this test fold
    y_true      = Y_test_list[i]
    calib_model = calibrators[i]
    safe_names  = safe_names_list[i]
    X_test_df   = X_test_s_list[i]       # original, un-renamed DataFrame

    # Rename test DataFrame columns to safe names before predicting
    X_te_safe = X_test_df.copy()
    X_te_safe.columns = safe_names

    # Get calibrated probability of the positive class
    y_prob = calib_model.predict_proba(X_te_safe)[:, 1]

    # ------------------------------------------------------------------
    # b) ROC Curve + AUROC
    # ------------------------------------------------------------------
    auroc       = roc_auc_score(y_true, y_prob)
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc_val = sk_auc(fpr, tpr)

    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr, color="darkorange", lw=2,
             label=f"ROC curve (AUC = {roc_auc_val:.3f})")
    plt.plot([0, 1], [0, 1], color="black", lw=2, linestyle="--", label="Random")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"Imputation {i}: ROC Curve (AUROC={roc_auc_val:.3f})")
    plt.legend(loc="lower right")
    plt.show()

    # ------------------------------------------------------------------
    # c) Precision‐Recall Curve + AUPRC
    # ------------------------------------------------------------------
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    pr_auc_val           = sk_auc(recall, precision)

    plt.figure(figsize=(6, 5))
    plt.plot(recall, precision, color="blue", lw=2,
             label=f"PR curve (AUC = {pr_auc_val:.3f})")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(f"Imputation {i}: Precision‐Recall Curve (AUPRC={pr_auc_val:.3f})")
    plt.legend(loc="best")
    plt.ylim([0, 1.0])
    plt.show()

    # ------------------------------------------------------------------
    # d) Confusion Matrix Stats at threshold = 0.50
    # ------------------------------------------------------------------
    thresh_fixed = 0.50
    metrics_050  = compute_metrics_from_probs(y_true, y_prob, thresh_fixed)
    brier        = brier_score_loss(y_true, y_prob)

    print(f"\n– Confusion Matrix Metrics @ threshold = {thresh_fixed:.2f} –")
    cm = confusion_matrix(y_true, (y_prob >= thresh_fixed).astype(int))
    print("Confusion Matrix:")
    print(cm)
    print(f"Accuracy              : {metrics_050['Accuracy']:.3f}")
    print(f"Sensitivity (TPR)     : {metrics_050['Sensitivity (TPR)']:.3f}")
    print(f"Specificity (TNR)     : {metrics_050['Specificity (TNR)']:.3f}")
    print(f"Precision (PPV)       : {metrics_050['Precision (PPV)']:.3f}")
    print(f"Negative Predictive   : {metrics_050['Negative Predictive Value (NPV)']:.3f}")
    print(f"F1 Score              : {metrics_050['F1 Score']:.3f}")
    print(f"Brier Score           : {brier:.4f}")

    # ------------------------------------------------------------------
    # e) Find threshold that maximizes F1 (0.00–1.00 by 0.01)
    # ------------------------------------------------------------------
    # Strict ">" keeps the LOWEST threshold among ties.
    best_f1     = -1.0
    best_thresh = None
    candidate_ts = np.linspace(0, 1, 101)

    for t in candidate_ts:
        f1_val = f1_score(y_true, (y_prob >= t).astype(int))
        if f1_val > best_f1:
            best_f1     = f1_val
            best_thresh = t

    print(f"\nOptimal threshold for max F1: {best_thresh:.2f} (F1 = {best_f1:.3f})")

    # ------------------------------------------------------------------
    # f) Find lowest threshold that yields ≥ 90% specificity (0.00–1.00 by 0.01)
    # ------------------------------------------------------------------
    # Candidates are scanned in ascending order and the scan stops at the
    # first hit; thresh_90sp stays None when no candidate qualifies.
    thresh_90sp = None
    for t in candidate_ts:
        tn, fp, fn, tp = confusion_matrix(y_true, (y_prob >= t).astype(int)).ravel()
        actual_neg = tn + fp
        if actual_neg == 0:
            continue
        specificity = tn / actual_neg
        if specificity >= 0.90:
            thresh_90sp = t
            break

    if thresh_90sp is not None:
        tn90 = np.sum((y_true == 0) & (y_prob < thresh_90sp))
        fp90 = np.sum((y_true == 0) & (y_prob >= thresh_90sp))
        fn90 = np.sum((y_true == 1) & (y_prob < thresh_90sp))
        tp90 = np.sum((y_true == 1) & (y_prob >= thresh_90sp))
        tnr90 = tn90 / (tn90 + fp90) if (tn90 + fp90) else 0
        tpr90 = tp90 / (tp90 + fn90) if (tp90 + fn90) else 0
        print(f"\nThreshold achieving ≥ 90% specificity: {thresh_90sp:.2f}")
        print(f"  Specificity (TNR) at {thresh_90sp:.2f}: {tnr90:.3f}")
        print(f"  Sensitivity (TPR)  at {thresh_90sp:.2f}: {tpr90:.3f}")
    else:
        print("\nNo threshold found with ≥ 90% specificity.")

    # ------------------------------------------------------------------
    # g) Decision Curve Analysis (Net Benefit)
    # ------------------------------------------------------------------
    decision_thresholds = np.linspace(0.0, 1.0, 101)
    NB_vals = net_benefit(y_true, y_prob, decision_thresholds)

    # Reference strategies: treat everyone / treat no one
    prevalence    = np.mean(y_true)
    treat_all_nb  = [
        (prevalence - (1 - prevalence) * (t / (1 - t))) if t < 1.0 else 0
        for t in decision_thresholds
    ]
    treat_none_nb = np.zeros_like(decision_thresholds)

    plt.figure(figsize=(6, 5))
    plt.plot(decision_thresholds, NB_vals, label="Model", color="darkorange")
    plt.plot(decision_thresholds, treat_all_nb, label="Treat All", color="red", linestyle="--")
    plt.plot(decision_thresholds, treat_none_nb, label="Treat None", color="blue", linestyle=":")
    plt.xlabel("Threshold Probability")
    plt.ylabel("Net Benefit")
    plt.title(f"Imputation {i}: Decision Curve Analysis")
    plt.ylim([-0.3, 0.3])
    plt.xlim([0, 1.0])
    plt.legend(loc="best")
    plt.show()

    # ------------------------------------------------------------------
    # h) Summary Table for Selected Thresholds
    # ------------------------------------------------------------------
    performance_metrics   = []
    reported_labels       = []   # row labels, kept in step with performance_metrics
    thresholds_to_report  = {
        "Max F1 Threshold":          best_thresh,
        "90% Specificity Threshold": thresh_90sp
    }

    for desc, thr in thresholds_to_report.items():
        if thr is not None:
            mets = compute_metrics_from_probs(y_true, y_prob, thr)
            mets["Threshold"] = f"{thr:.2f}"
            performance_metrics.append(mets)
            reported_labels.append(desc)
        else:
            print(f"\n{desc} not available (no threshold met requirement).")

    if performance_metrics:
        # Label rows only with the thresholds that actually produced a row;
        # using every key of thresholds_to_report would raise a length
        # mismatch whenever one of the thresholds is None.
        df_perf = pd.DataFrame(performance_metrics, index=reported_labels)
        df_perf = df_perf[
            ["Threshold", "Accuracy", "Sensitivity (TPR)",
             "Specificity (TNR)", "Precision (PPV)",
             "Negative Predictive Value (NPV)", "F1 Score"]
        ]

        print("\n=== Performance Metrics at Selected Thresholds ===")
        print(df_perf.to_string(float_format="%.3f"))
    else:
        print("\nNo selected threshold was available; summary table skipped.")

print("\n===== End of All Imputations =====")


In [None]:
##now do same thing as above but POOLED results across all imputations
##each row is condensed into a single probability (probability averaged across the imputations)

# ----------------------------------------------------------------------
# 0) Helper: compute confusion‐matrix metrics at one threshold
# ----------------------------------------------------------------------
def compute_metrics_from_probs(y_true, y_prob, threshold):
    """
    Threshold-dependent classification metrics for a binary outcome.

    A row is predicted positive when ``y_prob >= threshold``. The four
    confusion-matrix cells are counted directly with boolean masks, so the
    function also works when only one class occurs among the labels and
    predictions (a case in which a label-inferred confusion matrix is not 2x2
    and cannot be unpacked into four cells).

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Binary labels; 1 is the positive class, anything else counts as negative.
    y_prob : array-like of shape (n,)
        Predicted probability of the positive class.
    threshold : float
        Decision threshold applied to ``y_prob``.

    Returns
    -------
    dict
        Keys "Accuracy", "Sensitivity (TPR)", "Specificity (TNR)",
        "Precision (PPV)", "Negative Predictive Value (NPV)" and "F1 Score".
        A metric whose denominator is zero is reported as 0.0.
    """
    actual_is_pos = (np.asarray(y_true) == 1)
    pred_is_pos = (np.asarray(y_prob) >= threshold)

    tp = int(np.sum(actual_is_pos & pred_is_pos))
    fp = int(np.sum(~actual_is_pos & pred_is_pos))
    fn = int(np.sum(actual_is_pos & ~pred_is_pos))
    tn = int(np.sum(~actual_is_pos & ~pred_is_pos))

    actual_pos = tp + fn
    actual_neg = tn + fp
    pred_pos   = tp + fp
    pred_neg   = tn + fn
    total      = actual_pos + actual_neg

    accuracy      = (tp + tn) / total if total else 0.0
    sensitivity   = tp / actual_pos if actual_pos else 0.0         # TPR
    specificity   = tn / actual_neg if actual_neg else 0.0         # TNR
    precision     = tp / pred_pos if pred_pos else 0.0              # PPV
    npv           = tn / pred_neg if pred_neg else 0.0              # NPV
    # Binary F1 for the positive class: 2*TP / (2*TP + FP + FN)
    f1            = (2 * tp) / (2 * tp + fp + fn) if (tp + fp + fn) else 0.0

    return {
        "Accuracy": accuracy,
        "Sensitivity (TPR)": sensitivity,
        "Specificity (TNR)": specificity,
        "Precision (PPV)": precision,
        "Negative Predictive Value (NPV)": npv,
        "F1 Score": f1
    }

# ----------------------------------------------------------------------
# 1) Gather pooled test‐set probabilities across imputations
# ----------------------------------------------------------------------
# calibrated_models: list of CalibratedClassifierCV, one per imputation
# safe_names_list  : list of lists of safe column names, one per imputation
#                    (built in the per-imputation evaluation cell, which must
#                    therefore have been run before this one)
# X_test_s_list    : list of original test DataFrames (one per imputation)
# Y_test_list      : list of numpy arrays (or Series) of test labels, one per imputation

all_y_probs = []
for i, calib_model in enumerate(calibrated_models):
    # Rename X_test to safe names
    X_te_safe = X_test_s_list[i].copy()
    X_te_safe.columns = safe_names_list[i]
    # Get calibrated probabilities
    y_prob = calib_model.predict_proba(X_te_safe)[:, 1]
    all_y_probs.append(y_prob)

# Stack (K, N) → average → (N,)
# K = number of imputations, N = number of test rows
y_prob_stack   = np.vstack(all_y_probs)         # shape = (K, N)
y_prob_pooled  = y_prob_stack.mean(axis=0)      # shape = (N,)

# True labels (assume identical across imputations)
y_true = Y_test_list[0]

# ----------------------------------------------------------------------
# 2) ROC Curve + AUROC on pooled probabilities
# ----------------------------------------------------------------------
# `auroc` and `roc_auc_val` are two computations of the ROC area; only
# `roc_auc_val` is shown in the plot.
auroc       = roc_auc_score(y_true, y_prob_pooled)
fpr, tpr, _ = roc_curve(y_true, y_prob_pooled)
roc_auc_val = sk_auc(fpr, tpr)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, color="darkorange", lw=2,
         label=f"ROC curve (AUC = {roc_auc_val:.3f})")
plt.plot([0, 1], [0, 1], color="black", lw=2, linestyle="--", label="Random")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title(f"Pooled ROC Curve (AUROC={roc_auc_val:.3f})")
plt.legend(loc="lower right")
plt.show()

# ----------------------------------------------------------------------
# 3) Precision‐Recall Curve + AUPRC on pooled probabilities
# ----------------------------------------------------------------------
# The area is obtained by integrating precision over recall with sk_auc.
precision, recall, _ = precision_recall_curve(y_true, y_prob_pooled)
pr_auc_val           = sk_auc(recall, precision)

plt.figure(figsize=(6, 5))
plt.plot(recall, precision, color="blue", lw=2,
         label=f"PR curve (AUC = {pr_auc_val:.3f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Pooled Precision‐Recall Curve (AUPRC={pr_auc_val:.3f})")
plt.legend(loc="best")
plt.ylim([0, 1.0])
plt.show()

# ----------------------------------------------------------------------
# 4) Confusion Matrix Stats at threshold = 0.50 on pooled probabilities
# ----------------------------------------------------------------------
thresh_fixed = 0.50
metrics_050  = compute_metrics_from_probs(y_true, y_prob_pooled, thresh_fixed)
brier        = brier_score_loss(y_true, y_prob_pooled)

print(f"\n– Pooled Confusion Matrix Metrics @ threshold = {thresh_fixed:.2f} –")
cm = confusion_matrix(y_true, (y_prob_pooled >= thresh_fixed).astype(int))
print("Confusion Matrix:")
print(cm)
print(f"Accuracy              : {metrics_050['Accuracy']:.3f}")
print(f"Sensitivity (TPR)     : {metrics_050['Sensitivity (TPR)']:.3f}")
print(f"Specificity (TNR)     : {metrics_050['Specificity (TNR)']:.3f}")
print(f"Precision (PPV)       : {metrics_050['Precision (PPV)']:.3f}")
print(f"Negative Predictive   : {metrics_050['Negative Predictive Value (NPV)']:.3f}")
print(f"F1 Score              : {metrics_050['F1 Score']:.3f}")
print(f"Brier Score           : {brier:.4f}")

# ----------------------------------------------------------------------
# 5) Find threshold that maximizes F1 on pooled predictions
# ----------------------------------------------------------------------
# Grid search over 0.00–1.00 in steps of 0.01; the strict ">" keeps the
# lowest threshold among ties.
best_f1     = -1.0
best_thresh = None
candidate_ts = np.linspace(0, 1, 101)

for t in candidate_ts:
    f1_val = f1_score(y_true, (y_prob_pooled >= t).astype(int))
    if f1_val > best_f1:
        best_f1     = f1_val
        best_thresh = t

print(f"\nOptimal threshold for max F1 (pooled): {best_thresh:.2f} (F1 = {best_f1:.3f})")

# ----------------------------------------------------------------------
# 6) Find threshold that yields ≥ 90% specificity on pooled predictions
# ----------------------------------------------------------------------
# Candidates are scanned in ascending order and the scan stops at the first
# hit, so the LOWEST qualifying threshold is kept; thresh_90sp stays None when
# no candidate qualifies.
thresh_90sp = None
for t in candidate_ts:
    tn, fp, fn, tp = confusion_matrix(y_true, (y_prob_pooled >= t).astype(int)).ravel()
    actual_neg = tn + fp
    if actual_neg == 0:
        continue
    specificity = tn / actual_neg
    if specificity >= 0.90:
        thresh_90sp = t
        break

if thresh_90sp is not None:
    # Recount the confusion-matrix cells at the chosen threshold with masks
    tn90 = np.sum((y_true == 0) & (y_prob_pooled <  thresh_90sp))
    fp90 = np.sum((y_true == 0) & (y_prob_pooled >= thresh_90sp))
    fn90 = np.sum((y_true == 1) & (y_prob_pooled <  thresh_90sp))
    tp90 = np.sum((y_true == 1) & (y_prob_pooled >= thresh_90sp))
    tnr90 = tn90 / (tn90 + fp90) if (tn90 + fp90) else 0
    tpr90 = tp90 / (tp90 + fn90) if (tp90 + fn90) else 0
    print(f"\nThreshold achieving ≥ 90% specificity (pooled): {thresh_90sp:.2f}")
    print(f"  Specificity (TNR) at {thresh_90sp:.2f}: {tnr90:.3f}")
    print(f"  Sensitivity (TPR)  at {thresh_90sp:.2f}: {tpr90:.3f}")
else:
    print("\nNo threshold found with ≥ 90% specificity (pooled).")

# ----------------------------------------------------------------------
# 7) Decision Curve Analysis (Net Benefit) on pooled predictions
# ----------------------------------------------------------------------
def net_benefit(y_t, y_p, thresholds):
    """Compute the decision-curve net benefit of a model at each threshold.

    For a threshold probability ``t`` the net benefit is
    ``TP/N - (FP/N) * t / (1 - t)``, where a sample counts as predicted
    positive when its probability is ``>= t``.

    Parameters
    ----------
    y_t : array-like of shape (N,)
        True binary labels coded as 0 / 1 (list, ndarray or Series).
    y_p : array-like of shape (N,)
        Predicted probabilities of the positive class.
    thresholds : iterable of float
        Threshold probabilities at which to evaluate the net benefit.

    Returns
    -------
    list
        One net-benefit value per threshold, in the same order. Thresholds
        ``>= 1`` map to 0 because the odds ``t / (1 - t)`` are undefined there.

    Raises
    ------
    ValueError
        If ``y_t`` and ``y_p`` differ in length or are empty.
    """
    # Coerce to flat arrays so plain lists and column vectors behave like 1-D ndarrays
    # (an (N, 1) label array would otherwise broadcast against (N,) predictions).
    labels = np.asarray(y_t).ravel()
    probs = np.asarray(y_p, dtype=float).ravel()
    if labels.shape != probs.shape:
        raise ValueError(
            f"y_t and y_p must have the same length, got {labels.size} and {probs.size}"
        )
    n_obs = labels.size
    if n_obs == 0:
        raise ValueError("net_benefit requires at least one sample")

    # Class masks do not depend on the threshold; build them once.
    is_pos = labels == 1
    is_neg = labels == 0

    NB = []
    for t in thresholds:
        if t >= 1.0:
            NB.append(0.0)
            continue
        flagged = probs >= t
        tp_count = np.count_nonzero(flagged & is_pos)
        fp_count = np.count_nonzero(flagged & is_neg)
        NB.append((tp_count / n_obs) - (fp_count / n_obs) * (t / (1 - t)))
    return NB

# Evaluate the model's net benefit on a 0.00-1.00 grid of threshold probabilities.
decision_thresholds = np.linspace(0.0, 1.0, 101)
NB_vals = net_benefit(y_true, y_prob_pooled, decision_thresholds)

# Reference strategy "treat all": prevalence - (1 - prevalence) * t / (1 - t),
# pinned to 0 at t = 1 where the odds term is undefined.
prevalence   = np.mean(y_true)
treat_all_nb = []
for t in decision_thresholds:
    if t < 1.0:
        treat_all_nb.append(prevalence - (1 - prevalence) * (t / (1 - t)))
    else:
        treat_all_nb.append(0)

# Reference strategy "treat none": net benefit is 0 at every threshold.
treat_none_nb = np.zeros_like(decision_thresholds)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(decision_thresholds, NB_vals, label="Model", color="darkorange")
ax.plot(decision_thresholds, treat_all_nb, label="Treat All", color="red", linestyle="--")
ax.plot(decision_thresholds, treat_none_nb, label="Treat None", color="blue", linestyle=":")
ax.set_xlabel("Threshold Probability")
ax.set_ylabel("Net Benefit")
ax.set_title("Pooled Decision Curve Analysis")
ax.set_ylim([-0.3, 0.3])
ax.set_xlim([0, 1.0])
ax.legend(loc="best")
plt.show()

# ----------------------------------------------------------------------
# 8) Summary Table for Selected Thresholds (Pooled)
# ----------------------------------------------------------------------
# Thresholds to summarise; a value of None means the corresponding search found nothing.
performance_metrics   = []
reported_labels       = []   # row labels, kept in step with performance_metrics
thresholds_to_report  = {
    "Max F1 Threshold":          best_thresh,
    "90% Specificity Threshold": thresh_90sp
}

for desc, thr in thresholds_to_report.items():
    if thr is None:
        print(f"\n{desc} not available (no threshold met requirement).")
        continue
    mets = compute_metrics_from_probs(y_true, y_prob_pooled, thr)
    mets["Threshold"] = f"{thr:.2f}"
    performance_metrics.append(mets)
    reported_labels.append(desc)

report_columns = [
    "Threshold", "Accuracy", "Sensitivity (TPR)",
    "Specificity (TNR)", "Precision (PPV)",
    "Negative Predictive Value (NPV)", "F1 Score"
]

if performance_metrics:
    df_perf = pd.DataFrame(performance_metrics)[report_columns]
    # Label rows only with the thresholds that actually produced a row; using every
    # key of thresholds_to_report would raise a length mismatch when one was skipped.
    df_perf.index = reported_labels
else:
    # Nothing to report: keep an empty table with the expected columns.
    df_perf = pd.DataFrame(columns=report_columns)

print("\n=== Pooled Performance Metrics at Selected Thresholds ===")
print(df_perf.to_string(float_format="%.3f"))


In [None]:
##compute 95% CI for AUROC and AUPRC based on 1000-sample bootstrapping

# ───────────────────────────────────────────────────────────────────────────────
# 0) Gather inputs from your pipeline:
#    • X_test_s_list     : list of DataFrames (one per imputation) of test features (un‐renamed)
#    • Y_test_list       : list of numpy arrays (one per imputation) of test labels
#    • calibrated_models : list of CalibratedClassifierCV models (one per imputation)
#    • safe_names_list   : list of lists of “safe” feature names (one per imputation)
# ───────────────────────────────────────────────────────────────────────────────

K = len(X_test_s_list)

# True labels: imputation 0 is used as the reference for every imputation.
# Coerced to an ndarray so later positional indexing (y_true[idx]) is never
# interpreted as label-based pandas indexing.
y_true = np.asarray(Y_test_list[0])
N = len(y_true)  # number of patients in each imputed test set

# Pooling per patient is only valid if every imputation holds the same patients in the
# same order; verify the labels agree instead of silently assuming it.
for i, y_i in enumerate(Y_test_list):
    if not np.array_equal(np.asarray(y_i), y_true):
        raise ValueError(
            f"Y_test_list[{i}] differs from Y_test_list[0]; per-patient pooling "
            "requires identical test labels in every imputation."
        )

# 1) For each imputation, compute y_prob_i (length N)
all_probs = np.zeros((K, N))
for i in range(K):
    # Rename the columns on a copy so the stored test frame keeps its original names.
    X_te_safe = X_test_s_list[i].copy()
    X_te_safe.columns = safe_names_list[i]
    all_probs[i, :] = calibrated_models[i].predict_proba(X_te_safe)[:, 1]

# 2) Average across imputations → one probability per patient:
y_prob_pooled = all_probs.mean(axis=0)   # shape = (N,)

# ───────────────────────────────────────────────────────────────────────────────
# 3) Compute “pooled” AUROC & AUPRC (no bootstrap yet)
# ───────────────────────────────────────────────────────────────────────────────
pooled_auroc = roc_auc_score(y_true, y_prob_pooled)

# AUPRC is taken as the area under the precision-recall curve (recall on the x-axis).
precision_all, recall_all, _ = precision_recall_curve(y_true, y_prob_pooled)
pooled_auprc = sk_auc(recall_all, precision_all)

print(f"Pooled AUROC (no CI) = {pooled_auroc:.3f}")
print(f"Pooled AUPRC (no CI) = {pooled_auprc:.3f}")

# ───────────────────────────────────────────────────────────────────────────────
# 5) Bootstrap to get 95% CI (resample the N patients, NOT K·N)
# ───────────────────────────────────────────────────────────────────────────────
n_boot   = 1000
rng      = np.random.RandomState(0)   # fixed seed → reproducible intervals

# Work on plain arrays so the resampling below is strictly positional.
y_true_arr = np.asarray(y_true)
y_prob_arr = np.asarray(y_prob_pooled)
n_obs      = len(y_true_arr)

# NaN marks replicates that could not be scored.
boot_aurocs = np.full(n_boot, np.nan)
boot_auprcs = np.full(n_boot, np.nan)

for b in range(n_boot):
    idx = rng.randint(0, n_obs, size=n_obs)   # sample patients with replacement
    y_b  = y_true_arr[idx]
    p_b  = y_prob_arr[idx]

    # A resample containing a single class has no defined AUROC, and
    # precision_recall_curve would only warn and still return a curve, so skip the
    # replicate for BOTH metrics to keep the two bootstrap samples consistent.
    if np.unique(y_b).size < 2:
        continue

    boot_aurocs[b] = roc_auc_score(y_b, p_b)
    prec_b, rec_b, _ = precision_recall_curve(y_b, p_b)
    boot_auprcs[b]   = sk_auc(rec_b, prec_b)

scored      = ~np.isnan(boot_aurocs)
boot_aurocs = boot_aurocs[scored]
boot_auprcs = boot_auprcs[scored]

n_skipped = n_boot - int(scored.sum())
if n_skipped:
    print(f"Note: {n_skipped} of {n_boot} bootstrap resamples contained a single class and were skipped.")

if boot_aurocs.size:
    lower_auroc, upper_auroc = np.percentile(boot_aurocs, [2.5, 97.5])
    lower_auprc, upper_auprc = np.percentile(boot_auprcs, [2.5, 97.5])
else:
    # No usable replicate: report an undefined interval rather than failing in percentile.
    lower_auroc = upper_auroc = lower_auprc = upper_auprc = np.nan

print(f"Pooled AUROC = {pooled_auroc:.3f}   95% CI = [{lower_auroc:.3f}, {upper_auroc:.3f}]")
print(f"Pooled AUPRC = {pooled_auprc:.3f}   95% CI = [{lower_auprc:.3f}, {upper_auprc:.3f}]")

In [None]:
##generate SHAP importance value visualization for top 20 vars

# ───────────────────────────────────────────────────────────────
# STEP 1: Compute SHAP values for each imputation’s training set
# ───────────────────────────────────────────────────────────────
# Feature labels come from imputation 0 (columns are assumed identical in every imputation).
orig_feature_names = list(X_train_s_list[0].columns)

# One SHAP array per imputation, each computed on that imputation's safe-named training frame.
all_shap_full = []
for i, model in enumerate(final_models):
    X_tr_safe = X_train_s_list_xgb[i]
    explainer = shap.TreeExplainer(model)
    shap_vals = explainer.shap_values(X_tr_safe)
    all_shap_full.append(shap_vals)

# ───────────────────────────────────────────────────────────────
# STEP 2: Pool SHAP by averaging across imputations per row
# ───────────────────────────────────────────────────────────────
# Stack along a new leading axis, then collapse that axis with the mean.
shap_stack  = np.stack(all_shap_full, axis=0)
shap_pooled = np.mean(shap_stack, axis=0)

# ───────────────────────────────────────────────────────────────
# STEP 3: Compute mean(|SHAP|) from pooled SHAP and rank features
# ───────────────────────────────────────────────────────────────
mean_abs_pooled  = np.mean(np.abs(shap_pooled), axis=0)   # one value per feature
shap_mean_series = pd.Series(mean_abs_pooled, index=orig_feature_names)
overall_ranking  = shap_mean_series.sort_values(ascending=False)

# Keep the 20 largest entries of the ranking
top20_overall = overall_ranking.iloc[:20]
print("🔥 Top 20 features overall (pooled across imputations):")
display(top20_overall)

# ───────────────────────────────────────────────────────────────
# STEP 4: Plot the overall top 20
# ───────────────────────────────────────────────────────────────
# barh draws from the bottom up, so reverse the ranking to put the top feature first.
bars_bottom_up = top20_overall.iloc[::-1]
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bars_bottom_up.index, bars_bottom_up.values, color="dodgerblue")
ax.set(
    xlabel="Average |SHAP| (pooled across imputations)",
    title="Overall Top 20 Features by Mean |SHAP| (Pooled Across Imputations)",
)
fig.tight_layout()
plt.show()


In [None]:
##generate SHAP beeswarm plot for top 20 vars

# ───────────────────────────────────────────────────────────────
# STEP 0: Ensure `top20_overall` is already defined:
#   A pandas Series where index = feature_name, value = avg mean(|SHAP|).
# ───────────────────────────────────────────────────────────────
top20_features = list(top20_overall.index)  # e.g. ["AGEYEARS", "TBIGCSMOTOR", …]

# The SHAP importance cell that produced `top20_overall` also stored the full per-imputation
# SHAP arrays in `all_shap_full`. Reuse them instead of running TreeExplainer a second time
# on every imputation's training set.
if len(all_shap_full) != len(final_models):
    raise RuntimeError(
        "all_shap_full does not match final_models; re-run the SHAP importance cell first."
    )

# Prepare lists to collect SHAP arrays and (optional) raw feature values
all_shap_top20_list   = []
all_featvals_top20_df = None  # we'll take raw feature values from imputation 0

# ───────────────────────────────────────────────────────────────
# STEP 1: For each imputation, extract the SHAP values of the top 20
# ───────────────────────────────────────────────────────────────
for i, shap_vals_full in enumerate(all_shap_full):
    # A) Original training DataFrame (with original column names); its column order is
    #    taken to match the safe-named frame the SHAP values were computed on.
    X_tr_full  = X_train_s_list[i]
    orig_names = X_tr_full.columns.tolist()

    # B) Positions of the top-20 features within this imputation's column list
    top20_idxs = [orig_names.index(feat) for feat in top20_features]

    # C) Extract only those SHAP columns (shape → [N, 20])
    shap_top20 = shap_vals_full[:, top20_idxs]
    all_shap_top20_list.append(shap_top20)

    # D) For coloring, capture raw feature values of top20 from imputation 0
    if i == 0:
        all_featvals_top20_df = X_tr_full[top20_features].copy()

# ───────────────────────────────────────────────────────────────
# STEP 2: Pool SHAP values by averaging across imputations per row
# ───────────────────────────────────────────────────────────────
# A) Stack into (K, N, 20)
shap_top20_stack   = np.stack(all_shap_top20_list, axis=0)   # shape = (K, N, 20)
# B) Average along axis=0 → (N, 20)
shap_top20_pooled  = shap_top20_stack.mean(axis=0)           # shape = (N, 20)

# ───────────────────────────────────────────────────────────────
# STEP 3: Build a beeswarm (dot) plot for these 20 features (pooled)
# ───────────────────────────────────────────────────────────────
plt.figure(figsize=(10, 6))
shap.summary_plot(
    shap_top20_pooled,
    all_featvals_top20_df,
    feature_names=top20_features,
    plot_type="dot",
    show=False   # keep the figure open so the title below can be added
)
plt.title("Beeswarm of Top-20 Features (Pooled SHAP Across Imputations)", fontsize=14)
plt.tight_layout()
plt.show()
