In [None]:
##install and import necessary modules
##this code was originally designed and run in google colab
##use outside of colab may require modification
##if using colab, you may need to restart your runtime after installing modules,
##depending on the environment at the time the code is run.
##due to potential module dependencies, we will install DeepTables later

!pip install scikit-learn==1.5.2
!pip install tensorflow==2.12.1
!pip install xgboost==2.0.2
!pip install shap
import time
import os
import shap
import sys
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sn
from google.colab import drive
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, ParameterGrid
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc, precision_recall_curve, recall_score, confusion_matrix, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
sn.set(style='whitegrid')

# Report the interpreter and key library versions for reproducibility.
for label, version in (
    ("Python version:", sys.version),
    ("scikit-learn version:", sklearn.__version__),
    ("XGBoost version:", xgb.__version__),
    ("shap version:", shap.__version__),
):
    print(label, version)

In [None]:
##locate your dataset
##mount google drive if using in colab. Replace <MOUNT_POINT> with the directory where you want to mount the drive (e.g., /content/drive).
mount_point = '<MOUNT_POINT>'
drive.mount(mount_point)

# Replace <YOUR_FILE_PATH> with the actual path inside your Google Drive (e.g., My Drive/FileNameHere).
file_path = mount_point + '/<YOUR_FILE_PATH>.csv'

In [None]:
# Import data and specify missing values.
#
# pandas' built-in NA tokens are switched off (keep_default_na=False) and re-listed
# explicitly, because recent pandas versions treat the string 'None' as missing by
# default. 'None' is a real category in this dataset (the GCS recodes further down map
# 'None' -> 1), so it must survive parsing instead of being turned into NaN and imputed.
missing_value_tokens = [
    # project-specific missing codes
    'NA', 'N/A', 'NULL', ' ', '', '-99', '-98', '-99.0', '-99.00', '-98.0', '-98.00', 'NaN',
    # pandas default NA tokens, minus 'None'
    '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', '<NA>', 'n/a', 'nan', 'null',
]
data = pd.read_csv(file_path, na_values=missing_value_tokens, keep_default_na=False)

# Filter out rows where 'TRAUMATYPE' is 26, 'Other/unspecified', or 'Burn'.
# The filter is skipped only when the column is absent; any other problem is
# allowed to surface instead of being silently swallowed.
exclude_values = ['26', 'Other/unspecified', 'Burn']
if 'TRAUMATYPE' in data.columns:
    data = data[~data['TRAUMATYPE'].isin(exclude_values)]

In [None]:
##preview the first rows to confirm the dataframe loaded as expected
data.head(n=5)

In [None]:
##count missing values per column
data.isna().sum()

In [None]:
##Create a dataframe of all complications / variables that are not available on admission.
##Every column named here is removed from the X (input) dataset further down; one of them
##(WITHDRAWALLST) is also used as the outcome (Y).
#Commented-out names are deliberately left IN the input space for this run.

complications_df=pd.DataFrame()
complications_list= [
                    # 'HC_CLABSI', 'HC_DEEPSSI', 'HC_DVTHROMBOSIS', 'HC_ALCOHOLWITHDRAWAL', 'HC_CARDARREST', 'HC_CAUTI',
                    # 'HC_EMBOLISM', 'HC_EXTREMITYCS', 'HC_INTUBATION', 'HC_KIDNEY', 'HC_MI', 'HC_ORGANSPACESSI',
                    # 'HC_OSTEOMYELITIS', 'HC_RESPIRATORY', 'HC_RETURNOR', 'HC_SEPSIS', 'HC_STROKECVA', 'HC_SUPERFICIALINCISIONSSI',
                    # 'HC_PRESSUREULCER', 'HC_UNPLANNEDICU', 'HC_VAPNEUMONIA',
                    ##'EDDISCHARGEDISPOSITION',
                    'HOSPDISCHARGEDISPOSITION',
                    ##'EDDISCHARGEHRS',
                    'WITHDRAWALLST',
                    # 'VTEPROPHYLAXISTYPE',
                    # 'TOTALICULOS',
                    # 'TOTALVENTDAYS',
                    # 'VTEPROPHYLAXISHRS',
                    'VTEPROPHYLAXISDAYS', 'MORTALITY', 'EDDISCHARGEDAYS','FINALDISCHARGEDAYS','FINALDISCHARGEHRS', 'HMRRHGCTRLSURGDAYS',  'WITHDRAWALLSTHRS',
                    'AMERICANINDIAN', 'ASIAN', 'BLACK', 'PACIFICISLANDER', 'RACEOTHER', 'WHITE', 'RACE_NA', 'RACE_UK',
                    'TM_GROUNDAMBULANCE', 'TM_HELICOPTERAMBULANCE', 'TM_FIXEDWINGAMBULANCE', 'TM_PRIVPUBVEHWALKIN', 'TM_POLICE', 'TM_OTHER', 'TM_NA', 'TM_UK'
                    # 'ISS_05'
                    # , 'AIS_FACE', 'AIS_NECK', 'AIS_HEAD', 'AIS_THORAX', 'AIS_ABDOMEN', 'AIS_SPINE', 'AIS_UPPEREX', 'AIS_LOWEREX', 'AIS_SKIN', 'AIS_OTHER'
                    # , 'VTEPPXStartOver48', 'VTEPPXStartOver24', 'ICUOver48', 'ICUOver24', 'VentOver48', 'VentOver24'
                    # , 'VTEPPXStartOver72', 'VTEPPXStartOver96', 'ICUOver72', 'ICUOver96', 'VentOver72', 'VentOver96'
                    , 'FacilityTotalWLST', 'factilityTotalPatients', 'FacilityWLSTRate', 'FacilityKey'
                    , 'facilityWLSTNew', 'WLSTRateNew', 'WLSTRateNewCensored'
                    ]
# Copy every listed column out of `data` in one selection (same columns, same order).
complications_df = data[complications_list].copy()
# complications_df

In [None]:
##choose the outcome variable (here WLST) and hold it in its own one-column dataframe
Y_data = data[['WITHDRAWALLST']].rename(columns={'WITHDRAWALLST': 'WLST'})
Y_data

In [None]:
##clean Y_data by replacing "Yes" and "No" values with 1's and 0's

yes_no_codes = {'Yes': 1, 'No': 0}
Y_data['WLST'] = Y_data['WLST'].replace(yes_no_codes)
Y_data

In [None]:
##build the input space: everything in `data` except the columns listed in complications_list
X_data = data.drop(complications_list, axis=1)
X_data.shape

In [None]:
##cases with a missing outcome must be removed; first count how many there are
Missing_Y = Y_data.isna().sum()
Missing_Y

In [None]:
##here we find which rows in Y have missing values
##
##The list holds index LABELS, not positions: the later `drop(bad_row_index_list, axis=0)`
##calls drop by label, and the labels are no longer 0..n-1 once rows have been filtered
##out of `data`, so positional counters would point at the wrong rows.

bad_row_index_list = Y_data.index[Y_data.isnull().any(axis=1)].tolist()
bad_row_index_list

In [None]:
##drop the rows with a missing outcome from Y
Y_clean = Y_data.drop(index=bad_row_index_list)
Y_clean

In [None]:
##confirm that no missing outcome values remain
Missing_Y_clean = Y_clean.isna().sum()
Missing_Y_clean

In [None]:
##drop the same rows from X so X and Y stay aligned
X_data = X_data.drop(index=bad_row_index_list)

In [None]:
# Replace 'TRAUMATYPE' with a binary 'Penetrating' column (Penetrating -> 1, Blunt -> 0;
# any other value becomes NaN), then remove the original column.
trauma_type_codes = {'Penetrating': 1, 'Blunt': 0}
X_data = (
    X_data
    .assign(Penetrating=X_data['TRAUMATYPE'].map(trauma_type_codes))
    .drop(columns=['TRAUMATYPE'])
)

In [None]:
##the patient record number carries no predictive information, so remove it if present

columns_to_remove = ['inc_key']
X_data = X_data.drop(columns_to_remove, axis=1, errors='ignore')

In [None]:
##Convert No/Yes (and other two-level or ordered text categories) to numbers so they are
##not later expanded into one-hot variables.
##The code must be reusable across populations of input data, and not every population has
##every variable, so each column-specific recode is applied only when that column exists.
##(A column check replaces the former bare try/except blocks, which also hid real errors.)

# Frame-wide binary recodes: these do not depend on any particular column being present.
X_data = X_data.replace({True: 1, 'Yes': 1, "Female": 1, False: 0, 'No': 0, "Male": 0})

# Shared code tables, so the EMS / ED / TBI variants of a scale cannot drift apart.
gcs_eye_codes = {'None': 1, 'To pressure': 2, 'To sound': 3, 'Spontaneous': 4}
# Both spellings of the top verbal level are accepted: the original code used 'Oriented'
# for EMSGCSVERBAL and 'Orientated' for GCSVERBAL, so one of them may have gone unmapped.
gcs_verbal_codes = {'None': 1, 'Sounds': 2, 'Words': 3, 'Confused': 4,
                    'Oriented': 5, 'Orientated': 5}
gcs_motor_codes = {'None': 1, 'Extension': 2, 'Abnormal Flexion': 3,
                   'Normal Flexion': 4, 'Localising': 5, 'Obeys commands': 6}

column_recodes = {
    'ETHNICITY': {'Hispanic or Latino': 1, 'Not Hispanic or Latino': 0},
    'EMSGCSEYE': gcs_eye_codes,
    'GCSEYE': gcs_eye_codes,
    'EMSGCSVERBAL': gcs_verbal_codes,
    'GCSVERBAL': gcs_verbal_codes,
    'EMSGCSMOTOR': gcs_motor_codes,
    'TBIGCSMOTOR': gcs_motor_codes,
    'GCSMOTOR': gcs_motor_codes,
    'RESPIRATORYASSISTANCE': {'Assisted Respiratory Rate': 1,
                              'Unassisted Respiratory Rate': 0},
    'SUPPLEMENTALOXYGEN': {'Supplemental Oxygen': 1,
                           'No Supplemental Oxygen': 0},
}

for column_name, codes in column_recodes.items():
    if column_name in X_data.columns:
        X_data[column_name] = X_data[column_name].replace(codes)

##male coded as 0
##female coded as 1

##not hispanic coded as 0
##hispanic coded as 1

X_data.head()

In [None]:
##turn any remaining boolean values in binary variables into 1/0
bool_to_int = {True: 1, False: 0}
X_data = X_data.replace(bool_to_int)

In [None]:
##list the input variables that have missing values, with their counts
Missing = X_data.isna().sum()
Missing[Missing > 0]

In [None]:
##percentage of missing values per input variable
n_rows = X_data.shape[0]
data_missing = X_data.isna().sum() / n_rows * 100
data_missing

In [None]:
##display variables withOUT missing data
data_missing.index[data_missing == 0]

In [None]:
#keep only the variables that have at least some missing values
data_missing = data_missing[data_missing != 0]
data_missing

In [None]:
#sort in DESCENDING order (most-missing variables first) and show every row
pd.options.display.max_rows = None
data_missing = data_missing.sort_values(ascending=False)
data_missing

In [None]:
##prepare to drop variables with at least `dropCutoff` percent missing values (>= 50%)
dropCutoff=50
bad_column_names = data_missing.index[data_missing >= dropCutoff]
bad_column_names

In [None]:
##drop the high-missingness variables selected above
X_data_new = X_data.drop(columns=bad_column_names)

##list the variables that remain and still have missing data
Missing = X_data_new.isna().sum()
Missing[Missing > 0]

In [None]:
#Columns below the drop cutoff that still have missing values and therefore need imputing.
#The threshold is `dropCutoff` (set when selecting the columns to drop) rather than a
#second hard-coded 50, so the two selections cannot disagree if the cutoff is changed.
pd.set_option('display.max_rows', None)
to_be_cleaned_column_names = data_missing[data_missing < dropCutoff].index

#Print every column kept in the input space
for col in X_data_new:
    print(col)

In [None]:
# Variable-type registries used by the encoding / imputation cells below.
# Later cells intersect each list with the columns actually present in the data,
# so a name listed here does not have to exist in every dataset.

# Variables treated as continuous (clipped to the observed min/max after imputation).
continuous_vars = [
    "AGEYEARS", "EMSPULSERATE", "EMSRESPIRATORYRATE", "EMSTOTALGCS", "EMSDISPATCHDAYS",
    "EMSSCENEHRS", "EMSSCENEDAYS", "EMSHRS", "EMSDAYS", "SBP", "PULSERATE", "TEMPERATURE",
    "RESPIRATORYRATE", "PULSEOXIMETRY", "HEIGHT", "WEIGHT", "TOTALGCS", "ALCOHOLSCREENRESULT",
    "EDDISCHARGEHRS", "TBIHIGHESTTOTALGCS", "TBIGCSMOTOR", "BLOODUNITS", "PLASMAUNITS",
    "NumberOfInjuries", "mFI", "facilityPatientsNew", "WLSTRateCensorNormal",
    'AIS_FACE', 'AIS_NECK', 'AIS_HEAD', 'AIS_THORAX', 'AIS_ABDOMEN', 'AIS_SPINE', 'AIS_UPPEREX', 'AIS_LOWEREX', 'AIS_SKIN', 'AIS_OTHER',
    "ISS_05", 'TOTALICULOS', 'TOTALVENTDAYS', 'VTEPROPHYLAXISHRS'
]

# Variables treated as categorical (ordinal-encoded before imputation, decoded afterwards).
categorical_vars = [
    "SEX", "RACE", "ETHNICITY", "MECHANISM", "INTENT", "WORKRELATED", "ABUSEREPORT",
    "PROTDEV_NONE", "PROTDEV_LAP_BELT", "PROTDEV_PER_FLOAT", "PROTDEV_PROTECT_GEAR",
    "PROTDEV_EYE_PROTECT", "PROTDEV_CHILD_RESTRAINT", "PROTDEV_HELMET", "PROTDEV_AIRBAG_PRESENT",
    "PROTDEV_PROTECT_CLOTH", "PROTDEV_SHOULDER_BELT", "PROTDEV_OTHER", "PROTDEV_NA", "PROTDEV_UK",
    "AIRBAG_NOTDEPLOYED", "AIRBAG_DEPLOYED_FRNT", "AIRBAG_DEPLOYED_SIDE", "AIRBAG_DEPLOYED_OTHER",
    "AIRBAG_DEPLOYED_NA", "AIRBAG_DEPLOYED_UK", "TRANSPORTMODE", "INTERFACILITYTRANSFER",
    "PREHOSPITALCARDIACARREST", "TCCGCSLE13", "TCC10RR29", "TCCPEN", "TCCCHEST", "TCCLONGBONE",
    "TCCCRUSHED", "TCCAMPUTATION", "TCCPELVIC", "TCCSKULLFRACTURE", "TCCPARALYSIS", "TCC_NA",
    "TCC_UK", "VPOFALLADULT", "VPOFALLCHILD", "VPOCRASHINTRUSION", "VPOCRASHEJECT",
    "VPOCRASHDEATH", "VPOCRASHTELEMETRY", "VPOAUTOPEDIMPACT", "VPOMOTORCYCLECRASH",
    "VPO65SBP110", "VPOANTICOAGULANT", "VPOPREGNANCY20WKS", "VPOEMSJUDGE", "VPOBURNS",
    "VPOTRAUMABURNS", "VPO_NA", "VPO_UK", "RESPIRATORYASSISTANCE", "SUPPLEMENTALOXYGEN",
    "GCSQ_SEDATEDPARALYZED", "GCSQ_EYEOBSTRUCTION", "GCSQ_INTUBATED", "GCSQ_VALID", "GCSQ_NA",
    "GCSQ_UK", "DRGSCR_AMPHETAMINE", "DRGSCR_BARBITURATE", "DRGSCR_BENZODIAZEPINES",
    "DRGSCR_COCAINE", "DRGSCR_METHAMPHETAMINE", "DRGSCR_ECSTASY", "DRGSCR_METHADONE",
    "DRGSCR_OPIOID", "DRGSCR_OXYCODONE", "DRGSCR_PHENCYCLIDINE", "DRGSCR_TRICYCLICDEPRESS",
    "DRGSCR_CANNABINOID", "DRGSCR_OTHER", "DRGSCR_NONE", "DRGSCR_NOTTESTED", "DRGSCR_UK",
    "DRGSCR_NA", "ALCOHOLSCREEN", "EDDISCHARGEDISPOSITION", "DEATHINED", "TBIPUPILLARYRESPONSE",
    "TBIMIDLINESHIFT", "PMGCSQ_SEDATEDPARALYZED", "PMGCSQ_EYEOBSTRUCTION", "PMGCSQ_INTUBATED",
    "PMGCSQ_VALID", "PMGCSQ_NA", "PMGCSQ_UK", "ICPEVDRAIN", "ICPPARENCH", "ICPO2MONITOR",
    "ICPJVBULB", "ICPNONE", "ICP_NA", "ICP_UK", "BLOODBINARY", "PLASMABINARY", "PLATELETSBINARY",
    "CRYOBINARY", "ESLIVER", "ESSPLEEN", "ESKIDNEY", "ESPELVIS", "ESRETROPERI", "VERIFICATIONLEVEL",
    "ESVASCULAR", "ESAORTA", "ESOTHER", "ES_UK", "ES_NA", "PRIMARYMETHODPAYMENT", "TEACHINGSTATUS",
    "HOSPITALTYPE", "STATEDESIGNATION", "CC_ADHD", "CC_ADLC", "CC_ALCOHOLISM", "CC_ANGINAPECTORIS",
    "CC_ANTICOAGULANT", "CC_BLEEDING", "CC_CHEMO", "CC_CIRRHOSIS", "CC_CONGENITAL", "CC_COPD",
    "CC_CVA", "CC_DEMENTIA", "CC_DIABETES", "CC_DISCANCER", "CC_FUNCTIONAL", "CC_CHF",
    "CC_HYPERTENSION", "CC_MI", "CC_PAD", "CC_PREMATURITY", "CC_MENTALPERSONALITY", "CC_RENAL",
    "CC_SMOKING", "CC_STEROID", "CC_SUBSTANCEABUSE", "IntracranialVascularInjury", "BrainStemInjury",
    "EDH", "SAH", "SDH", "SkullFx", "DAI", "NeckVascularInjury", "ThoracicVascularInjury",
    "AeroDigestiveInjury", "CardiacInjury", "LungInjury", "AbdominalVascular", "RibFx",
    "KidneyInjury", "StomachInjury", "SpleenInjury", "UroGenInternalInjury", "SCI", "SpineFx",
    "UEAmputation", "UEVascularInjury", "UELongBoneFx", "LEVascularInjury", "PelvicFx",
    "LEAmputation", "PancreasInjury", "LELongBoneFx", "LiverInjury", "ColorectalInjury",
    "SmallBowelInjury", "isolatedTBI", "missingGCS", "missingAge", "missingSex", "missingType",
    "missingSBP", "missingHR", "missingRR", "missingPulseOx", "missingHeight", "missingWeight",
    "missingEDDispo", "missingRBC", "missingPlasma", "Penetrating",
    'VTEPPXStartOver48', 'VTEPPXStartOver24', 'ICUOver48', 'ICUOver24', 'VentOver48', 'VentOver24',
    'VTEPPXStartOver72', 'VTEPPXStartOver96', 'ICUOver72', 'ICUOver96', 'VentOver72', 'VentOver96', 'VTEPROPHYLAXISTYPE',
    'HC_CLABSI', 'HC_DEEPSSI', 'HC_DVTHROMBOSIS', 'HC_ALCOHOLWITHDRAWAL', 'HC_CARDARREST', 'HC_CAUTI',
    'HC_EMBOLISM', 'HC_EXTREMITYCS', 'HC_INTUBATION', 'HC_KIDNEY', 'HC_MI', 'HC_ORGANSPACESSI',
    'HC_OSTEOMYELITIS', 'HC_RESPIRATORY', 'HC_RETURNOR', 'HC_SEPSIS', 'HC_STROKECVA', 'HC_SUPERFICIALINCISIONSSI',
    'HC_PRESSUREULCER', 'HC_UNPLANNEDICU', 'HC_VAPNEUMONIA',
]

# Continuous variables that must be integers (rounded and cast to Int64 after imputation)
int_constrained_vars = [
    "AGEYEARS", "EMSPULSERATE", "EMSRESPIRATORYRATE", "EMSTOTALGCS", "EMSDISPATCHDAYS",
    "EMSSCENEDAYS", "EMSDAYS", "SBP", "PULSERATE", "RESPIRATORYRATE", "PULSEOXIMETRY",
    "TOTALGCS", "TBIHIGHESTTOTALGCS", "TBIGCSMOTOR", "BLOODUNITS", "PLASMAUNITS",
    "NumberOfInjuries", "mFI", "facilityPatientsNew", 'ISS_05', 'TOTALICULOS', 'TOTALVENTDAYS',
    'AIS_FACE', 'AIS_NECK', 'AIS_HEAD', 'AIS_THORAX', 'AIS_ABDOMEN', 'AIS_SPINE', 'AIS_UPPEREX', 'AIS_LOWEREX', 'AIS_SKIN', 'AIS_OTHER'
]

# (days, hours) column pairs describing the same interval.
# NOTE(review): not referenced in the cells visible here - confirm it is used later.
time_pairs = [
    ("EMSSCENEDAYS", "EMSSCENEHRS"),
    ("EMSDAYS", "EMSHRS"),
]

In [None]:
# ── B) Define helper to compute ISS from AIS ─────────────────

ais_cols = [
    "AIS_FACE",
    "AIS_NECK",
    "AIS_HEAD",
    "AIS_THORAX",
    "AIS_ABDOMEN",
    "AIS_SPINE",
    "AIS_UPPEREX",
    "AIS_LOWEREX",
    "AIS_SKIN",
    "AIS_OTHER",
]

def compute_iss_from_ais(row):
    """Compute an Injury Severity Score from one row of per-region AIS scores.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide every column named in ``ais_cols``.

    Returns
    -------
    int
        75 if any AIS score equals 6; otherwise the sum of squares of the three
        highest of the six regional scores.

    Notes
    -----
    * An AIS of 9 and a missing value (NaN / pd.NA) both count as 0. This is applied to
      each score BEFORE regions are combined, so a known score is never discarded just
      because it shares a region with a 9 (previously max(3, 9) became 9 and then 0).
    * NOTE(review): AIS_SPINE only takes part in the "any AIS == 6" check; it is not
      assigned to any of the six regions below. Confirm this is intended.
    """
    def _severity(col):
        value = row[col]
        # pd.isna is checked first so pd.NA never reaches a boolean comparison.
        if pd.isna(value) or value == 9:
            return 0
        return value

    # 1) If any AIS = 6, ISS = 75
    if any(_severity(col) == 6 for col in ais_cols):
        return 75

    # 2) Build the six regional scores from the cleaned per-column scores
    regions = [
        max(_severity("AIS_HEAD"), _severity("AIS_NECK")),        # head / neck
        _severity("AIS_FACE"),                                    # face
        _severity("AIS_THORAX"),                                  # thorax
        _severity("AIS_ABDOMEN"),                                 # abdomen
        max(_severity("AIS_UPPEREX"), _severity("AIS_LOWEREX")),  # extremities
        max(_severity("AIS_OTHER"), _severity("AIS_SKIN")),       # other / external
    ]

    # 3) Take the top 3 region scores and sum their squares
    top3 = sorted(regions, reverse=True)[:3]
    return sum(score ** 2 for score in top3)


In [None]:
# Sanity check: frame shape, then the sizes of the two variable registries.
for summary_value in (X_data_new.shape, len(continuous_vars), len(categorical_vars)):
    print(summary_value)

In [None]:
##train/test split time
# ───────────────────────────────────────────────────────────────────────────────
# 1) Work on a copy of the feature matrix X_data_new; the outcome is the first
#    (only) column of Y_clean, taken as a Series.
# ───────────────────────────────────────────────────────────────────────────────

X_full = X_data_new.copy()
Y_full = Y_clean.iloc[:, 0]

# 2) Keep only the registry names that actually exist in this dataset.
categorical_mice = [name for name in categorical_vars if name in X_full.columns]
continuous_mice  = [name for name in continuous_vars if name in X_full.columns]
int_mice         = [name for name in int_constrained_vars if name in X_full.columns]

# 3) Stringify the categorical columns; casting turns real NaN into the text "nan",
#    so "nan" is mapped back to np.nan both before and after the cast.
categorical_block = X_full[categorical_mice].replace("nan", np.nan)
categorical_block = categorical_block.astype(str).replace("nan", np.nan)
X_full[categorical_mice] = categorical_block

# 4) Ordinal-encode every categorical column in one fit on the full data,
#    so all later splits share one label -> code mapping.
ordinal_encoder_mice = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1
)
X_full[categorical_mice] = ordinal_encoder_mice.fit_transform(X_full[categorical_mice])

# Both splits below hold out 20% with the same seed and stratify on the binary outcome.
split_options = dict(test_size=0.20, random_state=0)

# 5) First split: full → (train_raw, test_raw).
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X_full, Y_full, stratify=Y_full, **split_options
)

print("Completed train/test split:")
print(f"   X_train_raw: {X_train_raw.shape}, X_test_raw: {X_test_raw.shape}")
print(f"   Y_train: {Y_train.shape},      Y_test: {Y_test.shape}")

# 6) Second split: train_raw → (train_for_impute_raw, calibration_raw).
#    The calibration part is set aside for calibrating the model later.
X_train_for_impute_raw, X_cal_raw, Y_train_for_impute, Y_cal = train_test_split(
    X_train_raw, Y_train, stratify=Y_train, **split_options
)

print("\nSplit train_raw into train_for_impute and calibration:")
print(f"   X_train_for_impute_raw: {X_train_for_impute_raw.shape}, Y_train_for_impute: {Y_train_for_impute.shape}")
print(f"   X_cal_raw: {X_cal_raw.shape},            Y_cal: {Y_cal.shape}")


In [None]:
##now for MICE
# ──────────────────────────────────────────────────────────────────────────────
# WRAPPER: RandomForestRegressor that supports predict(X, return_std=True)
# ──────────────────────────────────────────────────────────────────────────────
class RFWithStd(RandomForestRegressor):
    """
    RandomForestRegressor variant whose ``predict`` can also report spread.

    ``predict(X)`` returns the mean of the individual trees' predictions.
    ``predict(X, return_std=True)`` returns ``(mean, std)``, where ``std`` is the
    sample standard deviation (ddof=1) of the per-tree predictions.
    """
    def predict(self, X, return_std=False):
        # One row per fitted tree, one column per sample in X.
        per_tree = np.vstack([member.predict(X) for member in self.estimators_])
        center = per_tree.mean(axis=0)
        if return_std:
            return center, per_tree.std(axis=0, ddof=1)
        return center

# ──────────────────────────────────────────────────────────────────────────────
# RF-MICE with sample_posterior=True
# ──────────────────────────────────────────────────────────────────────────────

# BEFORE any imputation: record which rows in the full dataset had ISS_05 = NaN.
# If ISS_05 is not in the feature matrix (e.g. it was dropped for high missingness),
# use an all-False mask so this cell does not fail; the ISS recomputation later is
# itself guarded by an "ISS_05 in columns" check.
if "ISS_05" in X_full.columns:
    original_iss_missing = X_full["ISS_05"].isna()
else:
    original_iss_missing = pd.Series(False, index=X_full.index)

# Work on explicit copies so the column assignments below modify independent frames
# (avoids pandas' SettingWithCopyWarning on the outputs of train_test_split).
X_train_for_impute_raw = X_train_for_impute_raw.copy()
X_cal_raw = X_cal_raw.copy()
X_test_raw = X_test_raw.copy()

# 1) Force each “Over”-flag column to Int64 so RF sees 0/1
for prefix in ["ICUOver", "VentOver", "VTEPPXStartOver"]:
    for tp in ["24", "48", "72", "96"]:
        col = f"{prefix}{tp}"
        if col in X_train_for_impute_raw.columns:
            X_train_for_impute_raw[col] = X_train_for_impute_raw[col].astype("Int64")
            X_cal_raw[col]            = X_cal_raw[col].astype("Int64")
            X_test_raw[col]           = X_test_raw[col].astype("Int64")

# 2) Define which columns are categorical / continuous / integer-constrained
categorical_mice = [c for c in categorical_vars if c in X_full.columns]
continuous_mice  = [c for c in continuous_vars  if c in X_full.columns]
int_mice         = [c for c in int_constrained_vars if c in X_full.columns]

# One full imputation is produced per seed.
seeds = [100, 200, 300]

imputed_X_train_list = []
imputed_X_cal_list   = []
imputed_X_test_list  = []

print(f"Running RF-MICE on TRAIN_FOR_IMPUTE only (n_train={X_train_for_impute_raw.shape[0]})\n")

# Families of binary "Over<N>" flags (prefix -> thresholds in increasing order).
temporal_variable_groups = {
    "VTEPPXStartOver": ["24", "48", "72", "96"],
    "ICUOver":         ["24", "48", "72", "96"],
    "VentOver":        ["24", "48", "72", "96"]
}

# Continuous column rebuilt from each flag family: prefix -> (column, divisor applied
# to the highest threshold whose flag is 1).
cont_mapping = {
    "ICUOver":         ("TOTALICULOS",       24),
    "VentOver":        ("TOTALVENTDAYS",     24),
    "VTEPPXStartOver": ("VTEPROPHYLAXISHRS", 1),
}

# Highest usable ordinal code per categorical column. The encoder stores NaN as an extra
# (last) category when a column had missing values; that slot is excluded here, because
# an imputed code equal to it would decode straight back to NaN.
max_code_by_col = {
    col: max(int(pd.notna(ordinal_encoder_mice.categories_[idx]).sum()) - 1, 0)
    for idx, col in enumerate(categorical_mice)
}

# Observed min/max per continuous column (loop-invariant, so computed once).
continuous_bounds = {
    col: (X_full[col].min(skipna=True), X_full[col].max(skipna=True))
    for col in continuous_mice
}


def _over_flag_cols(frame, prefix):
    """Return the Over-flag columns of `prefix` present in `frame`, in threshold order."""
    return [
        f"{prefix}{tp}"
        for tp in temporal_variable_groups[prefix]
        if f"{prefix}{tp}" in frame.columns
    ]


def _enforce_temporal_consistency(frame):
    """Make each Over-flag family monotone in place (expects integer 0/1 flags)."""
    for prefix in temporal_variable_groups:
        cols = _over_flag_cols(frame, prefix)
        if len(cols) < 2:
            continue

        # Rule B: if a later “Over” = 1 ⇒ every earlier “Over” must be 1
        for i_tp in range(len(cols) - 1, 0, -1):
            frame.loc[frame[cols[i_tp]] == 1, cols[:i_tp]] = 1

        # Rule A: if an earlier “Over” = 0 ⇒ every later “Over” must be 0
        for i_tp in range(len(cols) - 1):
            frame.loc[frame[cols[i_tp]] == 0, cols[i_tp + 1:]] = 0


def _postprocess_imputed(frame):
    """Turn one raw imputer output into a clean split (steps 5-14); returns `frame`.

    The same steps are applied to the train, calibration and test splits, so they are
    written once here instead of three times.
    """
    flag_cols = [
        col for prefix in temporal_variable_groups for col in _over_flag_cols(frame, prefix)
    ]

    # 5) Round & clip all “Over”-flag columns to 0/1
    for col in flag_cols:
        frame[col] = frame[col].round().clip(0, 1).astype(int)

    # 6) Round & clip every categorical column back to a valid integer code
    for col in categorical_mice:
        frame[col] = frame[col].round().clip(0, max_code_by_col[col]).astype(int)

    # 7) Decode all categorical columns back to original labels
    frame[categorical_mice] = ordinal_encoder_mice.inverse_transform(frame[categorical_mice])

    # 7b) The labels were stringified before encoding, so decoded values are text
    #     (e.g. "1.0"). Cast the Over-flags back to integers NOW, before the rules
    #     below compare them with 0/1; against text those comparisons never match.
    for col in flag_cols:
        frame[col] = pd.to_numeric(frame[col], errors="coerce").fillna(0).astype(int)

    # 8) Clip continuous columns to original min/max from X_full
    for col, (col_min, col_max) in continuous_bounds.items():
        frame[col] = frame[col].clip(lower=col_min, upper=col_max)

    # 9) Round & cast integer-constrained columns
    for col in int_mice:
        if col in frame.columns:
            frame[col] = np.round(frame[col]).astype("Int64")

    # 10) Domain-specific consistency checks for GCS & alcohol
    if "TBIHIGHESTTOTALGCS" in frame.columns and "TBIGCSMOTOR" in frame.columns:
        required_min = frame["TBIGCSMOTOR"] + 2
        frame["TBIHIGHESTTOTALGCS"] = np.maximum(frame["TBIHIGHESTTOTALGCS"], required_min)

    if "ALCOHOLSCREEN" in frame.columns and "ALCOHOLSCREENRESULT" in frame.columns:
        # ALCOHOLSCREEN is a decoded (text) categorical; compare it numerically.
        screen_is_zero = pd.to_numeric(frame["ALCOHOLSCREEN"], errors="coerce") == 0
        frame.loc[screen_is_zero, "ALCOHOLSCREENRESULT"] = 0

    # 11) Temporal-group consistency rules on the integer Over-flags
    _enforce_temporal_consistency(frame)

    # 13) Compute continuous values from the corrected Over-flags: highest threshold
    #     whose flag is 1, integer-divided by the divisor; 0 when no flag is set.
    #     The column is written for every split that has the flags, so train / cal /
    #     test always end up with the same columns.
    #     NOTE(review): this overwrites the column for ALL rows, including rows whose
    #     value was observed - confirm that is intended.
    for prefix, (cont_var, divisor) in cont_mapping.items():
        bin_cols = sorted(
            [c for c in frame.columns if c.startswith(prefix)],
            key=lambda c: int(c.replace(prefix, ""))
        )
        if not bin_cols:
            continue
        derived = pd.Series(0, index=frame.index)
        for col in bin_cols:  # ascending, so the highest set threshold wins
            threshold = int(col.replace(prefix, ""))
            derived = derived.mask(frame[col] == 1, threshold // divisor)
        frame[cont_var] = derived

    # 14) Recompute ISS_05 where it was originally missing
    if "ISS_05" in frame.columns:
        needs_iss = original_iss_missing.loc[frame.index]
        if needs_iss.any():  # skip the row-wise apply entirely when nothing needs fixing
            frame.loc[needs_iss, "ISS_05"] = (
                frame.loc[needs_iss].apply(compute_iss_from_ais, axis=1)
            )

    return frame


for seed in seeds:
    start_time = time.time()

    rf_imputer = IterativeImputer(
        estimator=RFWithStd(
            n_estimators=100,    # number of trees
            max_depth=10,        # tree depth
            n_jobs=-1,
            random_state=seed
        ),
        max_iter=5,             # number of MICE iterations
        sample_posterior=True,
        random_state=seed
    )

    # 3) Fit on TRAIN_FOR_IMPUTE raw only (calibration / test never inform the imputer)
    rf_imputer.fit(X_train_for_impute_raw)

    # 4) Impute and post-process TRAIN_FOR_IMPUTE, CAL, TEST (in that order)
    X_train_imp, X_cal_imp, X_test_imp = [
        _postprocess_imputed(
            pd.DataFrame(
                rf_imputer.transform(raw_split),
                columns=raw_split.columns,
                index=raw_split.index
            )
        )
        for raw_split in (X_train_for_impute_raw, X_cal_raw, X_test_raw)
    ]
    imputed_splits = (X_train_imp, X_cal_imp, X_test_imp)

    # 15) Fill any remaining missing values using TRAIN statistics only
    #     (mode for categorical, median for continuous)
    for col in categorical_mice:
        modes = X_train_imp[col].mode(dropna=True)
        if modes.empty:
            continue  # column is entirely missing in train: nothing to fill with
        mode_val = modes.iloc[0]
        for frame in imputed_splits:
            frame[col] = frame[col].fillna(mode_val)

    for col in continuous_mice:
        med_val = X_train_imp[col].median()
        for frame in imputed_splits:
            frame[col] = frame[col].fillna(med_val)

    # 16) Cast all categorical columns to “category” dtype
    for col in categorical_mice:
        for frame in imputed_splits:
            frame[col] = frame[col].astype("category")

    # 17) Append each imputed split to our lists
    imputed_X_train_list.append(X_train_imp)
    imputed_X_cal_list.append(X_cal_imp)
    imputed_X_test_list.append(X_test_imp)

    elapsed = time.time() - start_time
    print(f"  → Seed {seed} done in {elapsed:.1f} seconds.\n")

print("RF-MICE completed")


In [None]:
##save our imputed datasets to a google drive directory

# 1) save to your drive directory.  Change the name of <MY_SAV_DIR> to this directory.
save_dir = "<MY_SAV_DIR>"
os.makedirs(save_dir, exist_ok=True)

# 2) Define the filenames for each imputation.
#    They are derived from `seeds`, so they always match the imputations that were
#    actually run (same names as before for seeds 100, 200, 300).
train_names = [f"train_seed{seed}_los" for seed in seeds]
cal_names   = [f"cal_seed{seed}_los" for seed in seeds]
test_names  = [f"test_seed{seed}_los" for seed in seeds]

# 3) Save each DataFrame under its name
for names, frames in (
    (train_names, imputed_X_train_list),
    (cal_names, imputed_X_cal_list),
    (test_names, imputed_X_test_list),
):
    # A mismatch usually means the imputation cell was re-run without resetting the
    # lists; stop before writing anything under a misleading name.
    if len(names) != len(frames):
        raise ValueError(
            f"Expected {len(names)} imputed datasets (one per seed) but found {len(frames)}."
        )
    for name, df in zip(names, frames):
        df.to_pickle(os.path.join(save_dir, f"{name}.pkl"))

print(f" Saved imputations to {save_dir} with custom names.")


In [None]:
##now for variable summary pre and post imputation

# Lists of imputed variables (must already exist in your notebook)
# continuous_mice = [ ... ]  # list of continuous columns that were imputed
# categorical_mice = [ ... ] # list of categorical columns that were imputed

# -----------------------------------------------------------------------------
# Helper: summary for continuous variables
# -----------------------------------------------------------------------------
def summarise_continuous(df, vars_list):
    """Summarise continuous columns, ignoring missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``vars_list``.
    vars_list : iterable of str
        Names of the continuous columns to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per variable (index named ``'variable'``) with the columns
        ``mean``, ``median``, ``std`` and ``IQR`` (Q3 - Q1), each computed
        after dropping NaNs. An empty ``vars_list`` yields an empty frame
        with the same columns instead of raising.
    """
    stat_columns = ['mean', 'median', 'std', 'IQR']
    stats = []
    for var in vars_list:
        s = df[var].dropna()
        stats.append({
            'variable': var,
            'mean': s.mean(),
            'median': s.median(),
            'std': s.std(),
            'IQR': s.quantile(0.75) - s.quantile(0.25)
        })

    if not stats:
        # pd.DataFrame([]) has no 'variable' column, so set_index would raise
        # KeyError; return an empty but correctly shaped summary instead.
        return pd.DataFrame(
            columns=stat_columns,
            index=pd.Index([], name='variable'),
            dtype=float
        )
    return pd.DataFrame(stats).set_index('variable')

# -----------------------------------------------------------------------------
# Helper: summary for categorical variables
# -----------------------------------------------------------------------------
def summarise_categorical(df, vars_list):
    """Describe the level distribution of categorical columns.

    For every column in ``vars_list`` the missing values are ignored and a
    dict is produced holding the most frequent level (``'mode'``), its share
    of the non-missing rows (``'mode_proportion'``) and the share of every
    level (``'levels_proportions'``). The mode is NaN when no mode exists.

    Returns a dict keyed by variable name.
    """
    def _describe(column):
        # Level shares among the non-missing observations
        level_counts = column.value_counts(dropna=True)
        level_shares = level_counts / level_counts.sum()

        # First reported mode, or NaN when there is none
        modes = column.mode(dropna=True)
        top_level = np.nan if modes.empty else modes.iloc[0]

        return {
            'mode': top_level,
            'mode_proportion': level_shares.get(top_level, np.nan),
            'levels_proportions': level_shares.to_dict()
        }

    return {var: _describe(df[var]) for var in vars_list}

# -----------------------------------------------------------------------------
# 1) Pre-imputation summaries on TRAIN_FOR_IMPUTE raw
# -----------------------------------------------------------------------------
pre_train = X_train_for_impute_raw.copy()

cont_pre_summary = summarise_continuous(pre_train, continuous_mice)
cat_pre_summary = summarise_categorical(pre_train, categorical_mice)

# -----------------------------------------------------------------------------
# 2) Post-imputation summaries (one per imputation of TRAIN_FOR_IMPUTE)
# -----------------------------------------------------------------------------
cont_post_list = []
cat_post_list = []

for i, df_imp in enumerate(imputed_X_train_list):
    # 2a) Continuous stats for imputation i (columns suffixed with _imp{i})
    cont_stats = summarise_continuous(df_imp, continuous_mice).rename(
        columns=lambda c: f"{c}_imp{i}"
    )
    cont_post_list.append(cont_stats)

    # 2b) Categorical stats (mode + mode proportion) for imputation i.
    #     Reuse summarise_categorical so pre- and post-imputation numbers are
    #     computed by the same code path, the mode is computed once per column,
    #     and a column without a mode yields NaN instead of an IndexError.
    cat_info = summarise_categorical(df_imp, categorical_mice)
    cat_stats = pd.DataFrame(
        [
            {
                'variable': var,
                'mode_imp': info['mode'],
                'mode_prop_imp': info['mode_proportion']
            }
            for var, info in cat_info.items()
        ],
        columns=['variable', 'mode_imp', 'mode_prop_imp']
    ).set_index('variable').rename(columns=lambda c: f"{c}_{i}")
    cat_post_list.append(cat_stats)

# 2c) Combine continuous stats across imputations (side by side)
cont_post_summary = pd.concat(cont_post_list, axis=1)

# 2d) Combine categorical stats across imputations (side by side)
cat_post_summary = pd.concat(cat_post_list, axis=1)

# -----------------------------------------------------------------------------
# 3) Display pre- vs post-imputation summaries for continuous
# -----------------------------------------------------------------------------
print("=== Continuous variables pre-imputation (TRAIN_FOR_IMPUTE) ===")
display(cont_pre_summary)

print("=== Continuous variables post-imputation (TRAIN_FOR_IMPUTE) ===")
display(cont_post_summary)

# -----------------------------------------------------------------------------
# 4) Display pre- vs post-imputation summaries for categorical
# -----------------------------------------------------------------------------
print("=== Categorical variables pre-imputation (TRAIN_FOR_IMPUTE) ===")
for var, info in cat_pre_summary.items():
    print(
        f"{var}: mode={info['mode']}, "
        f"mode_proportion={info['mode_proportion']:.3f}, "
        f"level_proportions={info['levels_proportions']}"
    )

print("\n=== Categorical variables post-imputation (TRAIN_FOR_IMPUTE) ===")
display(cat_post_summary)


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Sanity‐check A: For each imputed TRAIN/CAL/TEST, confirm categorical dtypes & cardinalities
# ──────────────────────────────────────────────────────────────────────────────

# Each split is reported the same way, so describe the splits once and loop.
imputed_splits = (
    ("TRAIN_FOR_IMPUTE", imputed_X_train_list),
    ("CALIBRATION", imputed_X_cal_list),
    ("TEST", imputed_X_test_list),
)

for i in range(len(imputed_X_train_list)):
    for split_label, split_frames in imputed_splits:
        print(f"\n--- Imputation {i} ({split_label}) ---")
        cat_view = split_frames[i][categorical_mice]

        # Number of distinct levels per imputed categorical column, largest first
        print(
            f"  {split_label} cardinalities:\n",
            cat_view.nunique().sort_values(ascending=False)
        )

        # Stored dtype of every imputed categorical column
        print(f"  {split_label} dtypes:\n", cat_view.dtypes)

    print("─" * 60)


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Sanity‐check B: Ensure zero missing values in TRAIN_FOR_IMPUTE / CAL / TEST
# ──────────────────────────────────────────────────────────────────────────────

for i, (df_train, df_cal, df_test) in enumerate(
    zip(imputed_X_train_list, imputed_X_cal_list, imputed_X_test_list)
):
    # Total count of NaN cells across every column of each split
    n_missing_train, n_missing_cal, n_missing_test = (
        frame.isna().sum().sum() for frame in (df_train, df_cal, df_test)
    )

    print(f"Imputation {i}:")
    print(f"  TRAIN_FOR_IMPUTE missing count = {n_missing_train}")
    print(f"  CALIBRATION missing count   = {n_missing_cal}")
    print(f"  TEST missing count          = {n_missing_test}")
    print("─" * 60)


In [None]:
##now for one-hot encoding

encoded_X_train_list = []
encoded_X_cal_list   = []
encoded_X_test_list  = []

for i in range(len(imputed_X_train_list)):
    df_train_imp = imputed_X_train_list[i].copy()
    df_cal_imp   = imputed_X_cal_list[i].copy()
    df_test_imp  = imputed_X_test_list[i].copy()

    # 1) Collect all categorical columns present in train_i, cal_i, or test_i
    categorical_columns = sorted(
        set(df_train_imp.select_dtypes(include=["category", "object"]).columns.tolist())
        | set(df_cal_imp.select_dtypes(include=["category", "object"]).columns.tolist())
        | set(df_test_imp.select_dtypes(include=["category", "object"]).columns.tolist())
    )
    print(f"Imputation {i}: found {len(categorical_columns)} categorical columns to encode.")

    # 2) Concatenate train_i + cal_i + test_i so every split gets the same dummy
    #    columns. Remember the row counts: the split back in step 4 is positional.
    n_train_rows = len(df_train_imp)
    n_cal_rows   = len(df_cal_imp)
    df_combined = pd.concat([df_train_imp, df_cal_imp, df_test_imp], axis=0)

    # 3) Replace each categorical column by its dummy columns
    df_combined_enc = df_combined.copy()
    for col in categorical_columns:
        # Determine the actual levels (ignoring NaN)
        if isinstance(df_combined_enc[col].dtype, pd.CategoricalDtype):
            levels = list(df_combined_enc[col].cat.categories)
        else:
            levels = sorted(df_combined_enc[col].dropna().unique())
        levels = [lvl for lvl in levels if pd.notna(lvl)]

        # A column with at most one level carries no information: drop it
        if len(levels) <= 1:
            df_combined_enc = df_combined_enc.drop(columns=[col])
            continue

        # Binary columns keep a single dummy (drop_first=True);
        # multi-level columns keep the full set of dummies.
        dummies = pd.get_dummies(
            df_combined_enc[col],
            prefix=col,
            drop_first=(len(levels) == 2)
        )
        df_combined_enc = pd.concat(
            [df_combined_enc.drop(columns=[col]), dummies],
            axis=1
        )

    # 4) Split the encoded frame back into train/cal/test BY POSITION.
    #    Label-based .loc[...] would return duplicated or wrong rows if index
    #    labels are not unique across the three splits (e.g. after reset_index).
    df_train_enc_i = df_combined_enc.iloc[:n_train_rows].copy()
    df_cal_enc_i   = df_combined_enc.iloc[n_train_rows:n_train_rows + n_cal_rows].copy()
    df_test_enc_i  = df_combined_enc.iloc[n_train_rows + n_cal_rows:].copy()

    # 5) Verify that train_enc_i, cal_enc_i, and test_enc_i share the exact same columns:
    cols_train = set(df_train_enc_i.columns)
    cols_cal   = set(df_cal_enc_i.columns)
    cols_test  = set(df_test_enc_i.columns)
    if not (cols_train == cols_cal == cols_test):
        raise ValueError(f"Column mismatch in imputation {i} after dummy encoding!")
    else:
        print(f"Imputation {i}: train/cal/test columns match (n_cols={len(cols_train)})")

    encoded_X_train_list.append(df_train_enc_i)
    encoded_X_cal_list.append(df_cal_enc_i)
    encoded_X_test_list.append(df_test_enc_i)

print("Completed one‐hot encoding for all 3 imputed sets.")


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Sanity‐check C (revised): Check no missing values and preview encoded_X_train_list / encoded_X_test_list
# ──────────────────────────────────────────────────────────────────────────────

# pandas is already imported as `pd` in the first cell; no re-import needed here.
# Show every column when previewing the (wide) one-hot encoded frames.
pd.set_option("display.max_columns", None)

# The calibration split feeds the downstream models as well, so it is checked
# for NaNs alongside the train and test splits.
encoded_splits = (
    ("TRAIN_FOR_IMPUTE", encoded_X_train_list),
    ("CALIBRATION", encoded_X_cal_list),
    ("TEST", encoded_X_test_list),
)

for split_no, (split_label, encoded_list) in enumerate(encoded_splits):
    leading_newline = "" if split_no == 0 else "\n"
    print(f"{leading_newline}Checking for missing values in each one‐hot–encoded {split_label}:")
    for i, X_enc in enumerate(encoded_list):
        total_na = X_enc.isnull().sum().sum()
        status = "No missing" if total_na == 0 else f"{total_na} missing"
        print(f"  Encoded {split_label} {i}: {status}")

# ──────────────────────────────────────────────────────────────────────────────
# Preview a few rows from each encoded DataFrame
# ──────────────────────────────────────────────────────────────────────────────

for i, X_enc in enumerate(encoded_X_train_list):
    print(f"\nEncoded TRAIN_FOR_IMPUTE {i} (preview):")
    display(X_enc.head())
    print("─" * 80)

for i, X_enc in enumerate(encoded_X_test_list):
    print(f"\nEncoded TEST {i} (preview):")
    display(X_enc.head())
    print("─" * 80)


In [None]:
##check indices match

for i, (Xtr, Xcal, Xte) in enumerate(
    zip(encoded_X_train_list, encoded_X_cal_list, encoded_X_test_list)
):
    # (feature-frame label, feature frame, label-vector name, label vector)
    index_checks = (
        ("X_train_enc", Xtr, "Y_train_for_impute", Y_train_for_impute),
        ("X_cal_enc", Xcal, "Y_cal", Y_cal),
        ("X_test_enc", Xte, "Y_test", Y_test),
    )

    # Features and labels must share the same index, in the same order
    for x_name, x_frame, y_name, y_values in index_checks:
        if x_frame.index.equals(y_values.index):
            print(f"Imputation {i}: {x_name} index matches {y_name}.")
        else:
            print(f"Imputation {i}: index mismatch between {x_name} and {y_name}!")

    print(f"   Shapes: X_train_enc_{i}={Xtr.shape}, X_cal_enc_{i}={Xcal.shape}, X_test_enc_{i}={Xte.shape}")
    print(f"           Y_train_for_impute={Y_train_for_impute.shape}, Y_cal={Y_cal.shape}, Y_test={Y_test.shape}\n")


In [None]:
##now scale continuous variables

X_train_s_list, X_cal_s_list, X_test_s_list = [], [], []

for i, (enc_train, enc_cal, enc_test) in enumerate(
    zip(encoded_X_train_list, encoded_X_cal_list, encoded_X_test_list)
):
    # Work on copies so the encoded frames stay untouched
    Xtr, Xcal, Xte = enc_train.copy(), enc_cal.copy(), enc_test.copy()

    # Continuous columns = members of continuous_vars that survived encoding
    continuous_cols = [c for c in continuous_vars if c in Xtr.columns]

    if continuous_cols:
        # a) Make the columns float64 everywhere so scaled values can be
        #    written back without a dtype conflict
        for frame in (Xtr, Xcal, Xte):
            frame[continuous_cols] = frame[continuous_cols].astype(np.float64)

        # b) Learn mean/variance from the TRAINING split only
        scaler = StandardScaler()
        scaler.fit(Xtr[continuous_cols])

        # c) + d) Apply the training-derived scaling to all three splits
        for frame in (Xtr, Xcal, Xte):
            frame.loc[:, continuous_cols] = scaler.transform(frame[continuous_cols])

    X_train_s_list.append(Xtr)
    X_cal_s_list.append(Xcal)
    X_test_s_list.append(Xte)
    print(f"Imputation {i}: scaled continuous cols.   (X_train_s_{i}.shape={Xtr.shape})")

print("All imputed train/cal/test sets are now scaled.")


In [None]:
##verify shape of each df is as expected

# One report per imputation: feature-matrix shape next to its label-vector shape
for i, (X_tr_s, X_cal_s, X_te_s) in enumerate(
    zip(X_train_s_list, X_cal_s_list, X_test_s_list)
):
    shape_report = [
        f"--- Imputation {i} ---",
        f"X_train_s_{i}.shape = {X_tr_s.shape}   (Y_train_for_impute.shape = {Y_train_for_impute.shape})",
        f"X_cal_s_{i}.shape   = {X_cal_s.shape}   (Y_cal.shape = {Y_cal.shape})",
        f"X_test_s_{i}.shape  = {X_te_s.shape}   (Y_test.shape = {Y_test.shape})\n",
    ]
    print("\n".join(shape_report))


In [None]:
##create a dictionary of model hyper-parameter(s)

##for KNN
# KNN: odd neighbourhood sizes 1, 3, ..., 501
n_list = np.arange(1, 503, step=2)
param_grid_knc = dict(n_neighbors=n_list)

# Random forest
param_grid_rf = dict(
    n_estimators=[100, 200, 400],        # trees in the forest
    max_depth=[None, 10, 20, 30],        # depth cap per tree (None = unlimited)
    min_samples_split=[2, 5, 10],        # samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],          # samples needed at a leaf
    max_features=['sqrt'],               # features considered at each split
)

# Logistic regression (L1 / L2)
param_grid_lr = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],    # inverse regularisation strength
    penalty=['l1', 'l2'],                # regularisation type
    solver=['liblinear', 'saga'],        # optimisation algorithm
    max_iter=[100, 200, 300],            # iteration cap
)

# XGBoost
param_grid_gb = dict(
    learning_rate=[0.01, 0.05, 0.1],     # shrinkage per boosting round
    max_depth=[3, 5, 7],                 # depth cap per tree
    subsample=[0.6, 0.8, 1.0],           # row subsample ratio per tree
    colsample_bytree=[0.6, 0.8, 1.0],    # column subsample ratio per tree
    n_estimators=[100, 150, 200],        # number of boosting rounds
)


In [None]:
##XGBoost Hyperparameter optimization
# ──────────────────────────────────────────────────────────────
# Step 1: Make a safe copy of your training data
# ──────────────────────────────────────────────────────────────
# Copies of the scaled training frames with columns renamed to positional
# placeholders feat_0, feat_1, ...; the originals are left untouched.
X_train_s_list_xgb = [
    df.copy().set_axis(
        [f"feat_{j}" for j, _ in enumerate(df.columns)],
        axis=1
    )
    for df in X_train_s_list
]

# One label array per imputation, so pooled_cv_auc can zip features with labels
Y_train_list = [Y_train_for_impute.values for _ in X_train_s_list_xgb]

# ──────────────────────────────────────────────────────────────
# Step 2: Define pooled CV‐AUC function
# ──────────────────────────────────────────────────────────────
def pooled_cv_auc(params_dict, X_list, Y_list, cv_folds=5, random_state=0):
    """Mean cross-validated ROC-AUC of an XGBoost classifier, pooled over imputations.

    Parameters
    ----------
    params_dict : dict
        Keyword arguments forwarded to ``xgb.XGBClassifier``.
    X_list, Y_list : sequences
        Feature matrices and matching label arrays, one pair per imputation.
    cv_folds : int, default 5
        Number of stratified, shuffled CV folds.
    random_state : int, default 0
        Seed used for both the fold shuffling and the classifier.

    Returns
    -------
    float
        Average over imputations of the mean per-fold ROC-AUC.
    """
    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    per_imputation_scores = []

    for X_imp, y_imp in zip(X_list, Y_list):
        # `use_label_encoder` is intentionally not passed: it is not an
        # XGBClassifier parameter in the pinned xgboost 2.0.2, where it is only
        # forwarded to the booster and triggers an "are not used" warning.
        model = xgb.XGBClassifier(
            **params_dict,
            eval_metric="logloss",
            random_state=random_state
        )
        scores = cross_val_score(
            model,
            X_imp,
            y_imp,
            cv=skf,
            scoring="roc_auc",
            n_jobs=-1
        )
        per_imputation_scores.append(np.mean(scores))

    return float(np.mean(per_imputation_scores))

# ──────────────────────────────────────────────────────────────
# Step 3: Run grid search on the safe XGBoost copy
# ──────────────────────────────────────────────────────────────
results = []

for candidate_params in ParameterGrid(param_grid_gb):
    pooled_auc = pooled_cv_auc(
        candidate_params,
        X_train_s_list_xgb,
        Y_train_list,
        cv_folds=5,
        random_state=0
    )
    results.append((candidate_params, pooled_auc))

# Rank candidates; the index still points at each candidate's position in `results`
df_results = pd.DataFrame([
    {**params, "pooled_cv_auc": auc_score}
    for (params, auc_score) in results
]).sort_values("pooled_cv_auc", ascending=False)

# Take the winner from the ORIGINAL parameter dict, not from the DataFrame row:
# a row of this all-numeric frame is upcast to float64, which would turn
# max_depth / n_estimators into floats (e.g. 5.0, 200.0).
best_params = dict(results[df_results.index[0]][0])
df_results = df_results.reset_index(drop=True)

print("\nTop 5 hyperparameter combinations (by pooled CV‐AUC):")
print(df_results.head(5).to_string(index=False, float_format="{:.4f}".format))

print(f"\n>>> Best parameters (pooled across imputations):\n{best_params}")


In [None]:
##L1/L2 LR hyperparameter optimization
# ───────────────────────────────────────────────────────────────────────────────
# Helper: compute pooled CV‐AUC for LogisticRegression
# ───────────────────────────────────────────────────────────────────────────────
def pooled_cv_auc_lr(params_dict, X_list, Y_list, cv_folds=5, random_state=0):
    """Mean cross-validated ROC-AUC of a LogisticRegression, pooled over imputations.

    Parameters
    ----------
    params_dict : dict
        Keyword arguments forwarded to ``LogisticRegression``.
    X_list, Y_list : sequences
        Feature matrices and matching label arrays, one pair per imputation.
    cv_folds : int, default 5
        Number of stratified, shuffled CV folds.
    random_state : int, default 0
        Seed used for both the fold shuffling and the model. Previously the
        model seed was hard-coded to 0 and this argument only affected the
        folds; the default keeps the old behaviour.

    Returns
    -------
    float
        Average over imputations of the mean per-fold ROC-AUC.
    """
    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    per_imputation_scores = []

    for X_imp, y_imp in zip(X_list, Y_list):
        model = LogisticRegression(
            **params_dict,
            random_state=random_state,
            verbose=2  # solver progress output, kept as in the original
        )
        scores = cross_val_score(
            model,
            X_imp,
            y_imp,
            cv=skf,
            scoring="roc_auc",
            n_jobs=-1
        )
        per_imputation_scores.append(np.mean(scores))

    return float(np.mean(per_imputation_scores))

# ──────────────────────────────────────────────────────────────
# Step 1: Make a safe copy of your training data
# ──────────────────────────────────────────────────────────────
X_train_s_list_lr = [
    df.copy().set_axis([f"feat_{j}" for j in range(df.shape[1])], axis=1)
    for df in X_train_s_list]

# ───────────────────────────────────────────────────────────────────────────────
# Step 2: Build Y_train_list (one label array per imputation)
# ───────────────────────────────────────────────────────────────────────────────
# (Same construction as in the XGBoost cell.)
Y_train_list = [Y_train_for_impute.values for _ in range(len(X_train_s_list_lr))]

# ───────────────────────────────────────────────────────────────────────────────
# Step 3: Grid search for Logistic Regression
# ───────────────────────────────────────────────────────────────────────────────
results_lr = []

for candidate_params in ParameterGrid(param_grid_lr):
    pooled_auc = pooled_cv_auc_lr(
        candidate_params,
        X_train_s_list_lr,
        Y_train_list,
        cv_folds=5,
        random_state=0
    )
    results_lr.append((candidate_params, pooled_auc))

# Convert to DataFrame and sort; the index still points into `results_lr`
df_results_lr = pd.DataFrame([
    {**params, "pooled_cv_auc": auc_score}
    for (params, auc_score) in results_lr
]).sort_values("pooled_cv_auc", ascending=False)

# Read the winner from the original parameter dict so value types (int
# max_iter, str penalty/solver) never depend on pandas row dtype coercion.
best_params_lr = dict(results_lr[df_results_lr.index[0]][0])
df_results_lr = df_results_lr.reset_index(drop=True)

print("\nTop 5 Logistic Regression hyperparameter combinations (by pooled CV‐AUC):")
print(df_results_lr.head(5).to_string(index=False, float_format="{:.4f}".format))

print(f"\n>>> Best Logistic Regression parameters (pooled across imputations):\n{best_params_lr}")


In [None]:
##RF hyperparameter optimization
# ──────────────────────────────────────────────────────────────
# Step 1: Make a safe copy of your training data
# ──────────────────────────────────────────────────────────────
# Copies of the scaled training frames with positional column names
# (feat_0, feat_1, ...); the originals are left untouched.
X_train_s_list_rf = [
    df.copy().set_axis(
        [f"feat_{j}" for j, _ in enumerate(df.columns)],
        axis=1
    )
    for df in X_train_s_list
]
# ───────────────────────────────────────────────────────────────────────────────
# 1) Label arrays, one per imputation (the labels are identical across imputations)
# ───────────────────────────────────────────────────────────────────────────────
Y_train_list = [Y_train_for_impute.values for _ in X_train_s_list_rf]

# ───────────────────────────────────────────────────────────────────────────────
# 2) Define pooled CV‐AUC function for Random Forest
# ───────────────────────────────────────────────────────────────────────────────
def pooled_cv_auc_rf(params_dict, X_list, Y_list, cv_folds=5, random_state=0):
    """Mean cross-validated ROC-AUC of a RandomForestClassifier, pooled over imputations.

    Parameters
    ----------
    params_dict : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.
    X_list, Y_list : sequences
        Feature matrices and matching label arrays, one pair per imputation.
    cv_folds : int, default 5
        Number of stratified, shuffled CV folds.
    random_state : int, default 0
        Seed used for both the fold shuffling and the forest. Previously the
        forest seed was hard-coded to 0 and this argument only affected the
        folds; the default keeps the old behaviour.

    Returns
    -------
    float
        Average over imputations of the mean per-fold ROC-AUC.
    """
    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    per_imputation_scores = []

    for X_imp, y_imp in zip(X_list, Y_list):
        model = RandomForestClassifier(
            **params_dict,
            random_state=random_state,
            n_jobs=-1
        )
        scores = cross_val_score(
            model,
            X_imp,
            y_imp,
            cv=skf,
            scoring="roc_auc",
            n_jobs=-1
        )
        per_imputation_scores.append(np.mean(scores))

    return float(np.mean(per_imputation_scores))

# ───────────────────────────────────────────────────────────────────────────────
# 3) Run pooled grid search across imputations (5-fold CV on the training splits)
# ───────────────────────────────────────────────────────────────────────────────
results_rf = []

for candidate_params in ParameterGrid(param_grid_rf):
    pooled_auc = pooled_cv_auc_rf(
        candidate_params,
        X_train_s_list_rf,
        Y_train_list,
        cv_folds=5,
        random_state=0
    )
    results_rf.append((candidate_params, pooled_auc))

# ───────────────────────────────────────────────────────────────────────────────
# 4) Store results in DataFrame and identify best parameters
# ───────────────────────────────────────────────────────────────────────────────
# The index of the sorted frame still points at positions in `results_rf`.
df_results_rf = pd.DataFrame([
    {**params, "pooled_cv_auc": auc_score}
    for (params, auc_score) in results_rf
]).sort_values("pooled_cv_auc", ascending=False)

# Take the winner from the ORIGINAL parameter dict, not from the DataFrame row:
# in the frame, max_depth=None is stored as NaN and integer depths become
# floats, neither of which RandomForestClassifier accepts.
best_params_rf = dict(results_rf[df_results_rf.index[0]][0])
df_results_rf = df_results_rf.reset_index(drop=True)

print("\nTop 5 RF hyperparameter combinations (by pooled CV‐AUC):")
print(df_results_rf.head(5).to_string(index=False, float_format="{:.4f}".format))

print(f"\n>>> Best RF parameters (pooled across imputations):\n{best_params_rf}")


In [None]:
##KNN hyperparameter optimization
# ──────────────────────────────────────────────────────────────
# Step 1: Make a safe copy of your training data
# ──────────────────────────────────────────────────────────────
# Copies of the scaled training frames with positional column names
# (feat_0, feat_1, ...); the originals are left untouched.
X_train_s_list_knn = [
    df.copy().set_axis(
        [f"feat_{j}" for j, _ in enumerate(df.columns)],
        axis=1
    )
    for df in X_train_s_list
]
# ───────────────────────────────────────────────────────────────────────────────
# Label arrays, one per imputation (the labels are identical across imputations)
# ───────────────────────────────────────────────────────────────────────────────
Y_train_list = [Y_train_for_impute.values for _ in X_train_s_list_knn]

# ───────────────────────────────────────────────────────────────────────────────
# 1) Define pooled CV‐AUC function for KNN
# ───────────────────────────────────────────────────────────────────────────────
def pooled_cv_auc_knn(params_dict, X_list, Y_list, cv_folds=5):
    """Mean cross-validated ROC-AUC of a KNeighborsClassifier, pooled over imputations.

    ``params_dict`` is forwarded to ``KNeighborsClassifier``. ``X_list`` and
    ``Y_list`` hold one feature matrix / label array pair per imputation. Folds
    are stratified and shuffled with a fixed seed of 0. Returns the average,
    over imputations, of the mean per-fold ROC-AUC as a float.
    """
    fold_splitter = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=0)

    # Mean fold AUC for each imputed dataset
    imputation_means = [
        np.mean(
            cross_val_score(
                KNeighborsClassifier(**params_dict),
                X_imp,
                y_imp,
                cv=fold_splitter,
                scoring="roc_auc",
                n_jobs=-1
            )
        )
        for X_imp, y_imp in zip(X_list, Y_list)
    ]

    return float(np.mean(imputation_means))

# ───────────────────────────────────────────────────────────────────────────────
# 2) Run pooled grid search across imputations (5-fold CV on the training splits)
# ───────────────────────────────────────────────────────────────────────────────
results_knn = []

for candidate_params in ParameterGrid(param_grid_knc):
    pooled_auc = pooled_cv_auc_knn(
        candidate_params,
        X_train_s_list_knn,
        Y_train_list,
        cv_folds=5
    )
    results_knn.append((candidate_params, pooled_auc))

# ───────────────────────────────────────────────────────────────────────────────
# 3) Store results in DataFrame and identify best parameters
# ───────────────────────────────────────────────────────────────────────────────
# The index of the sorted frame still points at positions in `results_knn`.
df_results_knn = pd.DataFrame([
    {**params, "pooled_cv_auc": auc_score}
    for (params, auc_score) in results_knn
]).sort_values("pooled_cv_auc", ascending=False)

# Take the winner from the ORIGINAL parameter dict, not from the DataFrame row:
# a row of this frame is float64, so n_neighbors would come back as e.g. 5.0,
# which KNeighborsClassifier rejects (it requires an integer).
best_params_knn = dict(results_knn[df_results_knn.index[0]][0])
df_results_knn = df_results_knn.reset_index(drop=True)

print("\nTop 5 KNN hyperparameter combinations (by pooled CV‐AUC):")
print(df_results_knn.head(5).to_string(index=False, float_format="{:.4f}".format))

print(f"\n>>> Best KNN parameters (pooled across imputations):\n{best_params_knn}")


In [None]:
##prepare to run neural networks to test which DeepTables architecture performs best

!pip install deeptables
##revert to sklearn 1.5 to resolve dependency issues
!pip install scikit-learn==1.5
import deeptables
print("dt version:", deeptables.__version__)
from deeptables.models.deeptable import DeepTable, ModelConfig
from deeptables.models.deepnets import DeepFM, WideDeep, DCN

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# Build label lists (one per imputation) for calibration and test sets
# ───────────────────────────────────────────────────────────────────────────────
# The labels do not change between imputations, so each list simply repeats
# the same label array once per imputed calibration / test frame.
Y_cal_list = [Y_cal.values for _ in X_cal_s_list]
Y_test_list = [Y_test.values for _ in X_test_s_list]

# ──────────────────────────────────────────────────────────────
# Helper to clean and align DataFrames
# ──────────────────────────────────────────────────────────────
def clean_df(df, ref_columns=None):
    """Return a copy of ``df`` with boolean columns cast to int.

    When ``ref_columns`` is given, the copy is additionally aligned to it:
    columns missing from ``df`` are added and filled with 0, and the result
    contains exactly ``ref_columns`` in that order (other columns are dropped).
    The input frame is not modified.
    """
    cleaned = df.copy()

    # Cast bool columns to int
    bool_columns = [name for name in cleaned.columns if cleaned[name].dtype == bool]
    for name in bool_columns:
        cleaned[name] = cleaned[name].astype(int)

    if ref_columns is None:
        return cleaned

    # Add any reference column the frame lacks, then select in reference order
    for name in ref_columns:
        if name not in cleaned.columns:
            cleaned[name] = 0
    return cleaned[ref_columns]

# ──────────────────────────────────────────────────────────────
# Define architectures to test
# ──────────────────────────────────────────────────────────────
architectures = {"DeepFM": DeepFM, "DCN": DCN, "WideDeep": WideDeep}
results = {}

# ──────────────────────────────────────────────────────────────
# Fit every architecture on every imputation
# (architectures in the outer loop, imputations in the inner loop)
# ──────────────────────────────────────────────────────────────
# NOTE(review): architectures are ranked by TEST-set AUC below, so the test
# split influences model selection — confirm this is intended.
for arch_name, arch_class in architectures.items():
    aucs = []

    for X_train_raw, X_cal_raw, X_test_raw, Y_val_s, Y_test_s in zip(
        X_train_s_list, X_cal_s_list, X_test_s_list, Y_cal_list, Y_test_list
    ):
        # 1) Training features; the labels are the same for every imputation
        X_train_df = clean_df(X_train_raw)
        Y_train_s = Y_train_for_impute.values

        # 2) + 3) Calibration (used as validation data) and test features,
        #         aligned to the training columns
        X_val_df = clean_df(X_cal_raw, ref_columns=X_train_df.columns)
        X_test_df = clean_df(X_test_raw, ref_columns=X_train_df.columns)

        # 4) Model configuration for this architecture
        conf = ModelConfig(
            nets=arch_class,
            metrics=["AUC", "accuracy"],
            auto_discrete=True,
            auto_imputation=False,
            earlystopping_patience=5
        )

        # 5) Fit, passing the calibration split as validation data
        dt = DeepTable(config=conf)
        model, history = dt.fit(
            X_train_df,
            Y_train_s,
            epochs=100,
            validation_data=(X_val_df, Y_val_s)
        )

        # 6) Test AUC from the second column of predict_proba
        y_pred_prob = dt.predict_proba(X_test_df)[:, 1]
        auc_score = roc_auc_score(Y_test_s, y_pred_prob)
        aucs.append(auc_score)

    results[arch_name] = np.mean(aucs)

# ──────────────────────────────────────────────────────────────
# Report the average AUC per architecture and pick the highest
# ──────────────────────────────────────────────────────────────
print("\nArchitecture Comparison (Average AUC across imputations):")
for name, score in results.items():
    print(f"{name:<10}: {score:.4f}")

best_model = max(results.items(), key=lambda item: item[1])[0]
print(f"\nBest architecture: {best_model}")
