# In [13]:  (Jupyter cell prompt left over from the notebook export)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
import warnings

# Install a global filter that ignores every warning of category UserWarning
# raised after this point.
warnings.simplefilter("ignore", category=UserWarning)

###############################################################################
# 1. GLOBAL PARAMETERS & SETTINGS
###############################################################################
# Cost constants read by simulate_policy and used as the default cost
# arguments of train_data_driven_dp / dp_param_search.
FP_COST = 10    # Penalty for false positive (treating a healthy patient)
FN_COST = 50    # Penalty for false negative (never treating a sick patient)
D_COST  = 1     # Penalty per time-step of delay in treating a sick patient
GAMMA   = 0.99  # Default discount factor for DP (may be overridden in DP tuning)
T_MAX   = 20    # Time horizon (discrete steps 0..T_MAX-1 for each patient)

# Columns fed to the classifiers; default `feature_cols` of the ML helpers.
FEATURE_COLS = ["time", "EIT", "NIRS", "EIS"]

###############################################################################
# 2. HELPER FUNCTIONS FOR DATA SPLITTING & FILTERING
###############################################################################
def split_patients_kfold(df, n_splits=4, seed=0):
    """
    Randomly partition the unique patient IDs of `df` into n_splits+1 groups.

    The unique values of df['patient_id'] are shuffled with a RandomState
    seeded by `seed` and then cut into contiguous, roughly equal slices.

    Returns a dict mapping "G1", ..., "G{n_splits+1}" to the set of patient
    IDs in the corresponding slice.
    """
    shuffler = np.random.RandomState(seed)
    patient_ids = df['patient_id'].unique()
    shuffler.shuffle(patient_ids)

    n_groups = n_splits + 1
    total = len(patient_ids)

    # Slice boundaries: cut k sits at int(k * total / n_groups), k = 0..n_groups.
    cuts = [int(k * total / n_groups) for k in range(n_groups + 1)]

    return {
        f"G{k + 1}": set(patient_ids[lo:hi])
        for k, (lo, hi) in enumerate(zip(cuts[:-1], cuts[1:]))
    }

def filter_by_group(df, pid_set):
    """Return an independent copy of the rows of df whose patient_id is in pid_set."""
    in_group = df['patient_id'].isin(pid_set)
    return df.loc[in_group].copy()


###############################################################################
# 3. ML TRAINING & RISK-SCORE PREDICTIONS
###############################################################################
def train_and_predict_model(
    model_type,
    hyperparams,
    df_train,
    df_val,
    feature_cols=FEATURE_COLS):
    """
    Fit one classifier on df_train and score df_val.

    model_type   : "catboost", "rf" or "gb"; anything else raises ValueError.
    hyperparams  : dict of keyword arguments forwarded to the model constructor.
    df_train     : frame holding the `feature_cols` columns and a 'label' column.
    df_val       : frame holding the `feature_cols` columns.

    Returns the second column of predict_proba on df_val (probability of label=1).
    """
    train_features = df_train[feature_cols]
    train_labels = df_train['label']

    # Pick the estimator first; all three are fitted the same way afterwards.
    if model_type == "catboost":
        classifier = CatBoostClassifier(**hyperparams, verbose=False)
    elif model_type == "rf":
        classifier = RandomForestClassifier(**hyperparams, random_state=42)
    elif model_type == "gb":
        classifier = GradientBoostingClassifier(**hyperparams, random_state=42)
    else:
        raise ValueError(f"Unknown model_type={model_type}")
    classifier.fit(train_features, train_labels)

    class_proba = classifier.predict_proba(df_val[feature_cols])
    return class_proba[:, 1]  # Probability label=1


def select_best_ml_hyperparams_by_auc(
    df_train_splits,
    val_split_name,
    model_list,
    param_grid_dict,
    feature_cols=FEATURE_COLS):
    """
    Grid-search (model_type, hyperparams) for the highest validation AUC.

    df_train_splits : dict mapping split name -> DataFrame.
    val_split_name  : key of the split used for validation.
    model_list      : iterable of model-type names to try.
    param_grid_dict : dict mapping model-type name -> iterable of hyperparam dicts.
    feature_cols    : feature columns passed on to train_and_predict_model.

    NOTE: *every* entry of df_train_splits other than `val_split_name` is
    concatenated into the training set, so the caller must not leave a
    held-out test split in the dict unless it is meant to be trained on.

    Returns:
       best_model_type, best_hyperparams, best_auc, val_preds
       (val_preds = predicted risk of the winning candidate on the val set).
       If no candidate is evaluated the result is (None, None, -999, None).
    """
    from sklearn.metrics import roc_auc_score

    df_val = df_train_splits[val_split_name].copy()

    # Combine all other splits for training
    df_train_full = pd.concat(
        [part for name, part in df_train_splits.items() if name != val_split_name],
        ignore_index=True,
    )

    y_val = df_val['label'].values

    best_model_type = None
    best_hparams    = None
    best_auc        = -999  # sentinel below any attainable AUC (AUC lies in [0, 1])
    best_preds      = None

    # Grid search across all candidate (model_type, hyperparam) pairs.
    # Ties keep the earliest candidate because the comparison is strict.
    for model_type in model_list:
        for hyperparams in param_grid_dict[model_type]:
            preds = train_and_predict_model(
                model_type=model_type,
                hyperparams=hyperparams,
                df_train=df_train_full,
                df_val=df_val,
                feature_cols=feature_cols
            )
            auc_val = roc_auc_score(y_val, preds)
            if auc_val > best_auc:
                best_auc        = auc_val
                best_model_type = model_type
                best_hparams    = hyperparams
                best_preds      = preds

    return best_model_type, best_hparams, best_auc, best_preds


###############################################################################
# 4. POLICY SIMULATION (Compute cost, precision, recall, etc.)
###############################################################################
def simulate_policy(df, policy_func):
    """
    Apply a treatment policy to every patient in df and aggregate the outcome.

    df has columns: [patient_id, time, label, ...] plus whatever columns
    policy_func reads (e.g. predicted_risk / risk_bucket).
    policy_func(patient_rows) receives one patient's rows sorted by 'time' and
    returns the time step of treatment, or None if the patient is never treated.

    Per-patient cost (the patient's label is taken from their first row):
      - sick (label == 1), treated at t : D_COST * t
      - sick, never treated             : FN_COST
      - not sick, treated               : FP_COST
      - not sick, never treated         : 0

    Returns a dict with keys:
      cost               - sum of per-patient costs
      precision          - treated sick patients / treated patients (0.0 if none treated)
      recall             - treated sick patients / sick patients (0.0 if none sick)
      avg_treatment_time - mean treatment time over treated patients (0.0 if none)

    An empty df yields zeros for every key instead of raising.
    """
    total_cost = 0
    sick_count = 0
    true_pos = 0
    treat_times = []  # one entry per treated patient

    for _, patient_rows in df.groupby('patient_id'):
        patient_rows = patient_rows.sort_values('time')
        is_sick = patient_rows['label'].iloc[0] == 1
        treat_time = policy_func(patient_rows)

        if is_sick:
            sick_count += 1

        if treat_time is None:
            # Never treated: only a sick patient incurs a cost.
            if is_sick:
                total_cost += FN_COST
            continue

        treat_times.append(treat_time)
        if is_sick:
            total_cost += D_COST * treat_time  # delay cost
            true_pos += 1
        else:
            total_cost += FP_COST

    treated_count = len(treat_times)
    precision = true_pos / treated_count if treated_count else 0.0
    recall = true_pos / sick_count if sick_count else 0.0
    avg_tt = sum(treat_times) / treated_count if treated_count else 0.0

    return {
        'cost': total_cost,
        'precision': precision,
        'recall': recall,
        'avg_treatment_time': avg_tt
    }


###############################################################################
# 5. BENCHMARK STRATEGIES (Constant Threshold, Dynamic, Linear, Wait-Till-End)
###############################################################################
def constant_threshold_search(df, thresholds=None):
    """
    Evaluate a single risk threshold held constant over the whole horizon.

    For each candidate threshold the patient is treated at the first row whose
    predicted_risk reaches the threshold (never, if none does). The candidate
    with the strictly lowest simulated cost on df wins; `thresholds` defaults
    to 21 evenly spaced values in [0, 1].

    Returns (best_threshold, best_stats).
    """
    if thresholds is None:
        thresholds = np.linspace(0, 1, 21)

    winner = (None, None)
    lowest_cost = float('inf')

    for cutoff in thresholds:
        def policy_func(patient_rows):
            # First time step at which the risk reaches the cutoff.
            for when, risk in zip(patient_rows['time'], patient_rows['predicted_risk']):
                if risk >= cutoff:
                    return int(when)
            return None

        outcome = simulate_policy(df, policy_func)
        if outcome['cost'] < lowest_cost:
            lowest_cost = outcome['cost']
            winner = (cutoff, outcome)

    return winner


def dynamic_threshold_random_search(df,
                                    time_steps=T_MAX,
                                    threshold_candidates=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
                                    n_samples=100,
                                    seed=0):
    """
    Random search over time-varying thresholds (one threshold per time step).

    df                   : frame with patient_id, time, label, predicted_risk.
    time_steps           : length of each sampled threshold vector; rows whose
                           time falls outside 0..time_steps-1 are ignored.
    threshold_candidates : values each per-step threshold is drawn from
                           (immutable default, so it is not shared mutable state).
    n_samples            : number of threshold vectors to sample.
    seed                 : seed of the RandomState used for sampling.

    Returns (best_vec, best_stats): the sampled vector with the strictly lowest
    simulated cost and its stats, or (None, None) if no sample improved on inf.
    """
    rng = np.random.RandomState(seed)
    best_vec, best_cost, best_stats = None, float('inf'), None

    for _ in range(n_samples):
        thr_vec = rng.choice(threshold_candidates, size=time_steps)

        def policy_func(patient_rows):
            for raw_t, risk in zip(patient_rows['time'], patient_rows['predicted_risk']):
                t = int(raw_t)
                # Lower bound stops a negative time from indexing thr_vec from the end.
                if 0 <= t < time_steps and risk >= thr_vec[t]:
                    return t
            return None

        stats = simulate_policy(df, policy_func)
        if stats['cost'] < best_cost:
            best_cost = stats['cost']
            best_vec = thr_vec  # a fresh array is drawn each iteration; no copy needed
            best_stats = stats

    return best_vec, best_stats

def linear_threshold_search(df, A_candidates=None, B_candidates=None):
    """
    Grid-search a threshold that is linear in time: threshold(t) = A*t + B,
    clipped to [0, 1].

    Every (A, B) pair from A_candidates x B_candidates is simulated on df; the
    patient is treated at the first row whose predicted_risk reaches the
    clipped threshold. Defaults: 11 slopes in [-0.05, 0.05] and 11 intercepts
    in [0, 1].

    Returns ((best_A, best_B), best_stats) for the strictly lowest cost.
    """
    slopes = np.linspace(-0.05, 0.05, 11) if A_candidates is None else A_candidates
    intercepts = np.linspace(0, 1, 11) if B_candidates is None else B_candidates

    chosen_line = (None, None)
    chosen_stats = None
    lowest_cost = float('inf')

    for A in slopes:
        for B in intercepts:
            def policy_func(patient_rows):
                for t, risk in zip(patient_rows['time'], patient_rows['predicted_risk']):
                    cutoff = max(0, min(1, A * t + B))  # clip to [0,1]
                    if risk >= cutoff:
                        return int(t)
                return None

            outcome = simulate_policy(df, policy_func)
            if outcome['cost'] < lowest_cost:
                lowest_cost = outcome['cost']
                chosen_line = (A, B)
                chosen_stats = outcome

    return chosen_line, chosen_stats


def wait_till_end_search(df, thresholds=None):
    """
    Treat (if at all) only at the final time step, with a single threshold.

    For each candidate threshold the patient's row with the largest 'time' is
    inspected: the patient is treated at that time if its predicted_risk
    reaches the threshold, otherwise never. `thresholds` defaults to 21 evenly
    spaced values in [0, 1].

    Returns (best_threshold, best_stats) for the strictly lowest simulated cost.
    """
    if thresholds is None:
        thresholds = np.linspace(0, 1, 21)
    best_thr, best_cost, best_stats = None, float('inf'), None

    for thr in thresholds:
        def policy_func(patient_rows):
            # Select the final row by position, not by index label: a label
            # lookup returns several rows when df's index labels repeat,
            # which would make the comparison below raise.
            last_pos = patient_rows['time'].argmax()
            final_row = patient_rows.iloc[last_pos]
            if final_row['predicted_risk'] >= thr:
                return int(final_row['time'])
            return None

        stats = simulate_policy(df, policy_func)
        if stats['cost'] < best_cost:
            best_cost = stats['cost']
            best_thr = thr
            best_stats = stats
    return best_thr, best_stats


###############################################################################
# 6. DATA-DRIVEN DP (Bucketed)
###############################################################################
def assign_buckets(prob, n_buckets=5):
    """
    Convert a predicted probability into a discrete bucket 0..(n_buckets-1).

    Buckets are the half-open intervals between n_buckets+1 evenly spaced
    edges on [0, 1], e.g. edges for 5 buckets = [0, 0.2, 0.4, 0.6, 0.8, 1.0].
    prob = 1.0 (and anything above) goes to the top bucket; values below 0 are
    clamped to bucket 0 rather than falling through to the top bucket. NaN
    sorts past the last edge and therefore lands in the top bucket.

    Returns a plain Python int.
    """
    edges = np.linspace(0, 1, n_buckets + 1)
    # Number of edges <= prob, minus one, is the interval that contains prob.
    idx = int(np.searchsorted(edges, prob, side="right")) - 1
    return min(max(idx, 0), n_buckets - 1)

def estimate_transition_and_sick_probs(df_train, T=T_MAX, n_buckets=5):
    """
    Estimate bucket dynamics from df_train, which must carry the columns
    'patient_id', 'time', 'risk_bucket' and 'label'.

    Returns (p_trans, p_sick):
      p_trans[t, b, b'] - empirical P(bucket_{t+1}=b' | bucket_t=b), shape
                          (T-1, n_buckets, n_buckets). Only consecutive rows of
                          one patient whose times differ by exactly 1 count.
                          A (t, b) with no observed transition stays in b
                          with probability 1.
      p_sick[t, b]      - mean label of the rows observed in (t, b), shape
                          (T, n_buckets); 0.0 where nothing was observed.
    """
    move_counts  = np.zeros((T-1, n_buckets, n_buckets))
    visit_counts = np.zeros((T, n_buckets))
    sick_counts  = np.zeros((T, n_buckets))

    ordered = df_train.sort_values(['patient_id', 'time'])

    for _, visits in ordered.groupby('patient_id'):
        steps = visits.to_dict('records')
        last_pos = len(steps) - 1
        for pos, current in enumerate(steps):
            t   = int(current['time'])
            b   = int(current['risk_bucket'])
            lbl = int(current['label'])
            if t < T:
                visit_counts[t, b] += 1
                sick_counts[t, b]  += lbl

            if pos >= last_pos:
                continue  # no successor row for this patient
            following = steps[pos + 1]
            t_next = int(following['time'])
            b_next = int(following['risk_bucket'])
            if (t_next == t + 1) and (t < T - 1):
                move_counts[t, b, b_next] += 1

    # Row-normalise the transition counts wherever at least one move was seen.
    move_totals = move_counts.sum(axis=2, keepdims=True)
    p_trans = np.divide(
        move_counts, move_totals,
        out=np.zeros_like(move_counts),
        where=move_totals > 0,
    )
    # if no data, remain in the same bucket with prob=1
    t_idx, b_idx = np.nonzero(move_totals[..., 0] == 0)
    p_trans[t_idx, b_idx, b_idx] = 1.0

    p_sick = np.divide(
        sick_counts, visit_counts,
        out=np.zeros_like(sick_counts),
        where=visit_counts > 0,
    )

    return p_trans, p_sick


def train_data_driven_dp(p_trans, p_sick,
                         FP=FP_COST, FN=FN_COST, D=D_COST,
                         gamma=GAMMA, T=T_MAX):
    """
    Backward DP for the bucket-based approach:
      V[t,b] = min( cost of treating now, gamma * expected cost of waiting )

    p_trans : array (T-1, n_buckets, n_buckets), p_trans[t,b,b'] =
              P(bucket_{t+1}=b' | bucket_t=b).
    p_sick  : array (T, n_buckets), probability of being sick in (t,b).
    FP, FN, D : false-positive cost, false-negative cost, per-step delay cost.
    gamma   : discount applied to the value of waiting.
    T       : number of decision steps (0..T-1).

    Terminal condition: a patient still untreated after step T-1 is never
    treated, so V[T,b] = p_sick[T-1,b] * FN. (No treatment action exists at
    t=T in make_data_driven_dp_policy. Letting the terminal value include a
    "treat" option made waiting at T-1 cost gamma*min(treat, skip), which is
    below the treat cost whenever that is positive, so the policy would
    never treat at the last step.)

    Return V, pi_ (value function of shape (T+1, n_buckets) and policy of
    shape (T, n_buckets) with 1=treat, 0=wait). Ties are resolved as "treat".
    """
    n_buckets = p_sick.shape[1]
    V = np.zeros((T+1, n_buckets))
    pi_ = np.zeros((T, n_buckets), dtype=int)

    # Terminal cost at t=T: expected cost of never having treated.
    V[T] = p_sick[T-1] * FN

    # Backward recursion, all buckets of one time step at once.
    for t in reversed(range(T)):
        # Option A: Treat now
        cost_treat = p_sick[t] * (D * t) + (1 - p_sick[t]) * FP

        # Option B: Wait => expected cost of next state
        if t == T-1:
            # next step is the terminal state; the bucket no longer changes
            exp_future = V[T]
        else:
            exp_future = p_trans[t] @ V[t+1]
        cost_wait = gamma * exp_future

        treat_now = cost_treat <= cost_wait
        pi_[t] = treat_now
        V[t] = np.where(treat_now, cost_treat, cost_wait)

    return V, pi_

def make_data_driven_dp_policy(V, pi_, T=T_MAX):
    """
    Build a policy function from the DP action table pi_.

    The returned policy_func(patient_rows) walks the rows in order and returns
    the time of the first row with time < T whose (time, risk_bucket) entry of
    pi_ equals 1 (treat); it returns None if no such row exists. V is accepted
    for signature symmetry with train_data_driven_dp but is not read.
    """
    def policy_func(patient_rows):
        for _, visit in patient_rows.iterrows():
            step = int(visit['time'])
            if step >= T:
                continue  # outside the DP horizon
            bucket = int(visit['risk_bucket'])
            if pi_[step, bucket] == 1:  # 0=wait, 1=treat
                return step
        return None

    return policy_func


###############################################################################
# 7. DP Hyper-Parameter Search for Algorithm 3
###############################################################################
def dp_param_search(df_train_fold, df_val_fold,
                    dp_param_grid,  # list of dicts, e.g. [{'gamma':0.95}, {'gamma':0.99}]
                    T=T_MAX):
    """
    Pick the DP parameter set with the lowest simulated cost on a validation fold.

    Both folds must already carry a 'predicted_risk' column. The training fold
    is bucketed and used once to estimate transition / sickness probabilities;
    then, for every dict in dp_param_grid (keys 'gamma', 'D', 'FP', 'FN', each
    falling back to the module default), a DP policy is solved and simulated
    on the bucketed validation fold.

    Returns (best_params, best_stats): the winning dict and the stats dict
    produced by simulate_policy for it, or (None, None) if no candidate
    achieved a cost below infinity. The inputs are not modified.
    """
    # Bucket the training fold and estimate the dynamics once for all candidates.
    train_bucketed = df_train_fold.copy()
    train_bucketed["risk_bucket"] = train_bucketed["predicted_risk"].apply(assign_buckets)
    p_trans, p_sick = estimate_transition_and_sick_probs(train_bucketed, T=T)

    # The validation fold needs buckets too, because the policy reads them.
    val_bucketed = df_val_fold.copy()
    val_bucketed["risk_bucket"] = val_bucketed["predicted_risk"].apply(assign_buckets)

    winner_params, winner_stats = None, None
    lowest_cost = float('inf')

    for candidate in dp_param_grid:
        V, pi_ = train_data_driven_dp(
            p_trans, p_sick,
            FP=candidate.get("FP", FP_COST),
            FN=candidate.get("FN", FN_COST),
            D=candidate.get("D", D_COST),
            gamma=candidate.get("gamma", GAMMA),
            T=T,
        )
        candidate_policy = make_data_driven_dp_policy(V, pi_, T=T)

        outcome = simulate_policy(val_bucketed, candidate_policy)
        if outcome['cost'] < lowest_cost:
            lowest_cost = outcome['cost']
            winner_params = candidate
            winner_stats = outcome

    return winner_params, winner_stats


###############################################################################
# 8. ALGORITHM 3: SEQUENTIAL OPTIMIZATION
###############################################################################
def run_experiment_algorithm3(
    df_all,
    n_splits=4,
    seed=42,
    model_list=("catboost","rf","gb"),
    ml_param_grid=None,
    dp_param_grid=None,
    verbose=True
):
    """
    Implement Algorithm 3 (Sequential Optimization):
      1) Cross-validate ML hyperparams (AUC-based).
      2) For each fold's chosen ML model, cross-validate DP hyperparams (cost-based).
      3) Summarize the sets of (mu) found in each fold => define mu_{all}^*.
      4) Use all folds again to pick final ML hyperparams (by AUC).
      5) Then, with that ML fixed, pick final DP hyperparams from mu_{all}^* by cost.
      6) Retrain on G1..G_n with chosen ML, produce predicted_risk, run final DP,
         evaluate on G_{n+1}.
    
    Because there are multiple ways to interpret the text-block pseudo-code,
    this function follows the step-by-step logic common in "sequential" 
    (non-decision-aware => then DP decision) style. 
    """
    from sklearn.metrics import roc_auc_score
    
    if ml_param_grid is None:
        # A small default grid for demonstration
        ml_param_grid = {
            "catboost": [
                {"iterations":50, "depth":3, "learning_rate":0.1},
                {"iterations":50, "depth":4, "learning_rate":0.05},
            ],
            "rf": [
                {"n_estimators":50, "max_depth":3},
                {"n_estimators":100,"max_depth":5},
            ],
            "gb": [
                {"n_estimators":50, "max_depth":3, "learning_rate":0.1},
                {"n_estimators":100,"max_depth":3, "learning_rate":0.05},
            ]
        }
    if dp_param_grid is None:
        # Example DP param grid: vary gamma, or vary others
        dp_param_grid = [
            {"gamma": 0.95},
            {"gamma": 0.99},
        ]
    
    if verbose:
        print(f"\nRunning Algorithm 3 (Sequential Optimization) with {n_splits} folds...")
    
    # 1) Split data => G1..G_{n_splits}, G_{n_splits+1}
    splits = split_patients_kfold(df_all, n_splits=n_splits, seed=seed)
    group_dfs = {}
    for group_name, pid_set in splits.items():
        sub_df = filter_by_group(df_all, pid_set)
        group_dfs[group_name] = sub_df
    
    test_name = f"G{n_splits+1}"
    df_test   = group_dfs[test_name]
    
    # -------------------------------------------------------------------------
    # (A) CROSS-VALIDATE ML => pick best ML hyperparams for each fold
    # -------------------------------------------------------------------------
    ml_cv_details = []
    
    for i_val in range(1, n_splits+1):
        val_name = f"G{i_val}"
        
        # (A1) Find best ML hyperparams by AUC
        best_model_type, best_hparams, best_auc, val_preds = select_best_ml_hyperparams_by_auc(
            df_train_splits=group_dfs,
            val_split_name=val_name,
            model_list=model_list,
            param_grid_dict=ml_param_grid,
            feature_cols=FEATURE_COLS
        )
        
        # (A2) Store predicted_risk for that validation set
        df_val = group_dfs[val_name].copy()
        df_val["predicted_risk"] = val_preds
        
        # Save it back
        group_dfs[val_name] = df_val
        
        ml_cv_details.append({
            "fold": i_val,
            "best_model_type": best_model_type,
            "best_hparams": best_hparams,
            "AUC_val": best_auc
        })
    
    df_ml_cv_details = pd.DataFrame(ml_cv_details)
    # Summarize which ML hyperparams got chosen by each fold ...
    # (We will re-check them in the next step.)
    
    # -------------------------------------------------------------------------
    # (B) For each fold's chosen ML, do a DP hyper-param search => pick DP param
    # -------------------------------------------------------------------------
    dp_cv_details = []
    
    for i_val in range(1, n_splits+1):
        val_name  = f"G{i_val}"
        best_rec  = df_ml_cv_details[df_ml_cv_details['fold'] == i_val].iloc[0]
        ml_model_type = best_rec["best_model_type"]
        ml_hparams    = best_rec["best_hparams"]
        
        # 1) Retrain that ML on "training folds except G_i_val" => get predicted_risk
        #    for the union (train_folds) = G\G_i
        train_folds = []
        for j in range(1, n_splits+1):
            if j != i_val:
                train_folds.append(group_dfs[f"G{j}"])
        df_train_fold = pd.concat(train_folds, ignore_index=True).copy()
        
        # Train & predict on df_train_fold itself for DP transitions
        from sklearn.metrics import roc_auc_score
        
        X_train_f = df_train_fold[FEATURE_COLS]
        y_train_f = df_train_fold['label']
        
        # Rebuild the model
        if ml_model_type == "catboost":
            final_model = CatBoostClassifier(**ml_hparams, verbose=False)
            final_model.fit(X_train_f, y_train_f)
        elif ml_model_type == "rf":
            final_model = RandomForestClassifier(**ml_hparams, random_state=42)
            final_model.fit(X_train_f, y_train_f)
        else:
            final_model = GradientBoostingClassifier(**ml_hparams, random_state=42)
            final_model.fit(X_train_f, y_train_f)
        
        # Store predictions in df_train_fold
        df_train_fold["predicted_risk"] = final_model.predict_proba(X_train_f)[:,1]
        
        # 2) DP hyper-param search on this fold, using the same "train => val" logic
        #    Validation set is group_dfs[val_name], which already has *some* predicted risk 
        #    but that risk was from the *best model for i_val.* We should unify it carefully.
        
        # Actually, to be consistent: The DP sees the same final model that we have for i_val.
        # So let's do a fresh predicted risk for df_val as well. (Because we want consistent 
        # train->val usage for DP.)
        
        df_val_fold = group_dfs[val_name].copy()
        X_val_fold  = df_val_fold[FEATURE_COLS]
        df_val_fold["predicted_risk"] = final_model.predict_proba(X_val_fold)[:,1]
        
        best_dp_params, best_dp_stats = dp_param_search(
            df_train_fold=df_train_fold,
            df_val_fold=df_val_fold,
            dp_param_grid=dp_param_grid,
            T=T_MAX
        )
        
        dp_cv_details.append({
            "fold": i_val,
            "chosen_ML_model": ml_model_type,
            "chosen_ML_hparams": ml_hparams,
            "chosen_DP_params": best_dp_params,
            "dp_val_cost": best_dp_stats["cost"],
            "dp_val_prec": best_dp_stats["precision"],
            "dp_val_rec":  best_dp_stats["recall"],
            "dp_val_avgTT": best_dp_stats["avg_treatment_time"]
        })
    
    df_dp_cv_details = pd.DataFrame(dp_cv_details)
    
    # Collect all DP param sets that got chosen: mu(j) for j=1..n
    mu_all_star = []
    for _, row_ in df_dp_cv_details.iterrows():
        # each fold might have chosen a dictionary like {"gamma":0.95}
        mu_all_star.append(row_["chosen_DP_params"])
    
    # -------------------------------------------------------------------------
    # (C) Now do a second pass to pick the final ML hyperparams \lambda^*
    #     across all folds (by AUC).
    # -------------------------------------------------------------------------
    # The simplest approach: we do a standard cross-validation again for ML 
    # but ignoring DP for the moment, because this is "sequential" approach.
    # => Essentially the same method we used in step (A), but summarizing now 
    #    across all folds. We'll pick the single best (model_type, hyperparams)
    #    that leads to highest average AUC across G1..G_n.
    
    # We'll accumulate fold-level AUC for each candidate, then pick the best overall.
    candidate_list = []
    for model_type in model_list:
        for hyperparams in ml_param_grid[model_type]:
            candidate_list.append((model_type, hyperparams))
    
    results_auc_cv = []
    for (mtype, mhp) in candidate_list:
        fold_aucs = []
        for i_val in range(1, n_splits+1):
            val_name = f"G{i_val}"
            # Train on G\G_i
            train_folds = []
            for j in range(1, n_splits+1):
                if j != i_val:
                    train_folds.append(group_dfs[f"G{j}"])
            df_train_fold = pd.concat(train_folds, ignore_index=True)
            
            # Train model
            X_train_f = df_train_fold[FEATURE_COLS]
            y_train_f = df_train_fold['label']
            
            if mtype == "catboost":
                tmp_model = CatBoostClassifier(**mhp, verbose=False)
                tmp_model.fit(X_train_f, y_train_f)
            elif mtype == "rf":
                tmp_model = RandomForestClassifier(**mhp, random_state=42)
                tmp_model.fit(X_train_f, y_train_f)
            else:
                tmp_model = GradientBoostingClassifier(**mhp, random_state=42)
                tmp_model.fit(X_train_f, y_train_f)
            
            # Predict on validation G_i
            df_val_fold = group_dfs[val_name]
            X_val_fold  = df_val_fold[FEATURE_COLS]
            val_preds   = tmp_model.predict_proba(X_val_fold)[:,1]
            
            auc_val = roc_auc_score(df_val_fold['label'], val_preds)
            fold_aucs.append(auc_val)
        
        avg_auc = np.mean(fold_aucs)
        results_auc_cv.append({
            "model_type": mtype,
            "hyperparams": mhp,
            "avg_auc": avg_auc
        })
    
    df_results_auc_cv = pd.DataFrame(results_auc_cv)
    # pick best by avg_auc
    best_row = df_results_auc_cv.loc[df_results_auc_cv['avg_auc'].idxmax()]
    final_ml_type   = best_row["model_type"]
    final_ml_params = best_row["hyperparams"]
    final_ml_auc    = best_row["avg_auc"]
    
    # -------------------------------------------------------------------------
    # (D) Next, with that final ML type/params fixed, we pick the best DP hyperparams 
    #     from the union mu_all_star we collected above.
    #     We'll evaluate each candidate in mu_all_star with a new cross-validation 
    #     pass for cost, but with the final ML in place.
    # -------------------------------------------------------------------------
    
    # Because multiple folds might produce duplicates in mu_all_star, we can deduplicate:
    import json
    unique_mu = []
    seen_strs = set()
    for mu_dict in mu_all_star:
        s = json.dumps(mu_dict, sort_keys=True)
        if s not in seen_strs:
            seen_strs.add(s)
            unique_mu.append(mu_dict)
    
    dp_candidates = unique_mu
    
    # Evaluate each dp_candidates in cross-validation with final ML
    #  => for each fold i_val, we do: train final ML on G\G_i => predict => 
    #     run the DP with param from dp_candidates => measure cost => average across folds
    results_dp_cv = []
    
    for dp_params in dp_candidates:
        fold_costs = []
        for i_val in range(1, n_splits+1):
            val_name = f"G{i_val}"
            # Train final ML on G\G_i
            train_folds = []
            for j in range(1, n_splits+1):
                if j != i_val:
                    train_folds.append(group_dfs[f"G{j}"])
            df_train_fold = pd.concat(train_folds, ignore_index=True).copy()
            
            X_train_f = df_train_fold[FEATURE_COLS]
            y_train_f = df_train_fold['label']
            
            if final_ml_type == "catboost":
                tmp_model = CatBoostClassifier(**final_ml_params, verbose=False)
                tmp_model.fit(X_train_f, y_train_f)
            elif final_ml_type == "rf":
                tmp_model = RandomForestClassifier(**final_ml_params, random_state=42)
                tmp_model.fit(X_train_f, y_train_f)
            else:
                tmp_model = GradientBoostingClassifier(**final_ml_params, random_state=42)
                tmp_model.fit(X_train_f, y_train_f)
            
            df_train_fold["predicted_risk"] = tmp_model.predict_proba(X_train_f)[:,1]
            
            # Build DP for dp_params
            df_val_fold = group_dfs[val_name].copy()
            X_val_fold  = df_val_fold[FEATURE_COLS]
            df_val_fold["predicted_risk"] = tmp_model.predict_proba(X_val_fold)[:,1]
            
            # train DP on df_train_fold
            df_train_fold["risk_bucket"] = df_train_fold["predicted_risk"].apply(assign_buckets)
            p_trans, p_sick = estimate_transition_and_sick_probs(df_train_fold, T=T_MAX)
            
            gamma_ = dp_params.get("gamma", GAMMA)
            D_  = dp_params.get("D", D_COST)
            FP_ = dp_params.get("FP", FP_COST)
            FN_ = dp_params.get("FN", FN_COST)
            
            V, pi_ = train_data_driven_dp(
                p_trans, p_sick,
                FP=FP_, FN=FN_, D=D_, gamma=gamma_, T=T_MAX
            )
            policy_func = make_data_driven_dp_policy(V, pi_, T=T_MAX)
            
            # evaluate cost on df_val_fold
            df_val_fold["risk_bucket"] = df_val_fold["predicted_risk"].apply(assign_buckets)
            stats = simulate_policy(df_val_fold, policy_func)
            fold_costs.append(stats['cost'])
        
        avg_cost = np.mean(fold_costs)
        results_dp_cv.append({
            "dp_params": dp_params,
            "avg_cost": avg_cost
        })
    
    df_results_dp_cv = pd.DataFrame(results_dp_cv)
    best_dp_idx = df_results_dp_cv['avg_cost'].idxmin()
    final_dp_params = df_results_dp_cv.loc[best_dp_idx, "dp_params"]
    final_dp_cost   = df_results_dp_cv.loc[best_dp_idx, "avg_cost"]
    
    if verbose:
        print(f"Final chosen ML: {final_ml_type} {final_ml_params}, avg AUC={final_ml_auc:.3f}")
        print(f"Final chosen DP params: {final_dp_params}, avg cost={final_dp_cost:.3f}")
    
    # -------------------------------------------------------------------------
    # (E) Retrain on G1..G_n with final ML => evaluate on G_{n+1}
    # -------------------------------------------------------------------------
    train_all = []
    for i in range(1, n_splits+1):
        train_all.append(group_dfs[f"G{i}"])
    df_train_all = pd.concat(train_all, ignore_index=True).copy()
    
    X_train_all = df_train_all[FEATURE_COLS]
    y_train_all = df_train_all['label']
    
    if final_ml_type == "catboost":
        final_model = CatBoostClassifier(**final_ml_params, verbose=False)
        final_model.fit(X_train_all, y_train_all)
    elif final_ml_type == "rf":
        final_model = RandomForestClassifier(**final_ml_params, random_state=42)
        final_model.fit(X_train_all, y_train_all)
    else:
        final_model = GradientBoostingClassifier(**final_ml_params, random_state=42)
        final_model.fit(X_train_all, y_train_all)
    
    # Predict risk for train to build DP transitions
    df_train_all["predicted_risk"] = final_model.predict_proba(X_train_all)[:,1]
    df_train_all["risk_bucket"]    = df_train_all["predicted_risk"].apply(assign_buckets)
    
    # Build DP with final_dp_params
    gamma_ = final_dp_params.get("gamma", GAMMA)
    D_  = final_dp_params.get("D", D_COST)
    FP_ = final_dp_params.get("FP", FP_COST)
    FN_ = final_dp_params.get("FN", FN_COST)
    
    p_trans, p_sick = estimate_transition_and_sick_probs(df_train_all, T=T_MAX)
    V, pi_ = train_data_driven_dp(
        p_trans, p_sick,
        FP=FP_, FN=FN_, D=D_, gamma=gamma_, T=T_MAX
    )
    dp_final_policy = make_data_driven_dp_policy(V, pi_, T=T_MAX)
    
    # Evaluate on G_{n+1}
    df_test_eval = df_test.copy()
    X_test_eval  = df_test_eval[FEATURE_COLS]
    df_test_eval["predicted_risk"] = final_model.predict_proba(X_test_eval)[:,1]
    
    # Benchmark methods:
    best_thr_const, stats_const = constant_threshold_search(df_test_eval)
    best_dyn_vec, stats_dyn     = dynamic_threshold_random_search(df_test_eval)
    (bestA,bestB), stats_lin    = linear_threshold_search(df_test_eval)
    best_thr_wte, stats_wte     = wait_till_end_search(df_test_eval)
    
    df_test_eval["risk_bucket"] = df_test_eval["predicted_risk"].apply(assign_buckets)
    stats_dp = simulate_policy(df_test_eval, dp_final_policy)
    
    final_table = pd.DataFrame({
        "Method": [
            "Constant Threshold",
            "Dynamic Threshold-R",
            "Linear Threshold",
            "Wait Till End",
            "Dynamic Threshold-DP"
        ],
        "Precision (%)": [
            100*stats_const['precision'],
            100*stats_dyn['precision'],
            100*stats_lin['precision'],
            100*stats_wte['precision'],
            100*stats_dp['precision']
        ],
        "Cost": [
            stats_const['cost'],
            stats_dyn['cost'],
            stats_lin['cost'],
            stats_wte['cost'],
            stats_dp['cost']
        ],
        "Recall (%)": [
            100*stats_const['recall'],
            100*stats_dyn['recall'],
            100*stats_lin['recall'],
            100*stats_wte['recall'],
            100*stats_dp['recall']
        ],
        "Treatment Time": [
            stats_const['avg_treatment_time'],
            stats_dyn['avg_treatment_time'],
            stats_lin['avg_treatment_time'],
            stats_wte['avg_treatment_time'],
            stats_dp['avg_treatment_time']
        ]
    })
    
    return {
        "ml_cv_details": df_ml_cv_details,
        "dp_cv_details": df_dp_cv_details,
        "ml_final_choice": (final_ml_type, final_ml_params, final_ml_auc),
        "dp_final_choice": (final_dp_params, final_dp_cost),
        "test_results_table": final_table
    }


###############################################################################
# 9. RUN MULTIPLE REPLICATIONS
###############################################################################
def run_multiple_replications(df_all, n_replications=30, n_splits=4):
    """
    Repeat Algorithm 3 once per seed and summarise the test-set metrics.

    Replication ``i`` (0-based) is run with ``seed=i`` and ``verbose=False``.
    From each run's ``"test_results_table"`` the rows whose ``Method`` is one
    of the five reported methods are collected.

    Parameters
    ----------
    df_all : DataFrame forwarded unchanged to ``run_experiment_algorithm3``.
    n_replications : number of runs (seeds ``0 .. n_replications-1``).
    n_splits : forwarded unchanged to ``run_experiment_algorithm3``.

    Returns
    -------
    pandas.DataFrame with one row per method that produced data and the
    columns ``Method``, ``Precision (%)``, ``Cost``, ``Recall (%)`` and
    ``Treatment Time``; each metric cell is the string ``"mean ± std"`` with
    two decimals (``np.std`` defaults, i.e. population standard deviation).
    """
    # Fixed reporting order for the methods and for the metric columns.
    method_names = (
        'Constant Threshold',
        'Dynamic Threshold-R',
        'Linear Threshold',
        'Wait Till End',
        'Dynamic Threshold-DP',
    )
    metric_columns = ('Precision (%)', 'Cost', 'Recall (%)', 'Treatment Time')

    # samples[column][method] -> list of values, one per replication
    samples = {
        column: {name: [] for name in method_names}
        for column in metric_columns
    }

    for rep_idx in range(n_replications):
        seed = rep_idx  # each replication gets its own seed
        print(f"\nRunning replication {rep_idx+1}/{n_replications} with seed={seed}")

        # Quiet run of Algorithm 3 for this seed
        outcome = run_experiment_algorithm3(
            df_all=df_all,
            n_splits=n_splits,
            seed=seed,
            verbose=False
        )
        table = outcome["test_results_table"]

        # Harvest every metric for the methods we report on
        for _, record in table.iterrows():
            name = record['Method']
            if name not in method_names:
                continue
            for column in metric_columns:
                samples[column][name].append(record[column])

    # Collapse the collected values into "mean ± std" strings
    summary_rows = []
    for name in method_names:
        if not samples['Precision (%)'][name]:
            # No data was gathered for this method; leave it out of the table
            continue
        entry = {'Method': name}
        for column in metric_columns:
            observed = samples[column][name]
            entry[column] = f"{np.mean(observed):.2f} ± {np.std(observed):.2f}"
        summary_rows.append(entry)

    return pd.DataFrame(summary_rows)


###############################################################################
# 10. MAIN SCRIPT 
###############################################################################
def main():
    """Load the patient CSV, run the replicated experiment, and print the summary table."""
    # 1) Load the data and report what it contains
    df_all = pd.read_csv("synthetic_patients_with_features.csv")
    print(f"Total patients: {df_all['patient_id'].nunique()}")
    print(f"Columns: {list(df_all.columns)}")

    # 2) Experiment settings, then run the replications
    n_splits = 4
    n_replications = 1
    summary = run_multiple_replications(df_all, n_replications, n_splits)

    # 3) Print the aggregated results without the DataFrame index
    banner = f"\n=== FINAL RESULTS (Mean ± Std Dev over {n_replications} Replications, Algorithm 3) ==="
    print(banner)
    print(summary.to_string(index=False))


# Run the experiment only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Total patients: 600
Columns: ['patient_id', 'time', 'risk_bucket', 'risk_score', 'EIT', 'NIRS', 'EIS', 'label']

Running replication 1/30 with seed=0

Running replication 2/30 with seed=1

Running replication 3/30 with seed=2

Running replication 4/30 with seed=3

Running replication 5/30 with seed=4

Running replication 6/30 with seed=5

Running replication 7/30 with seed=6

Running replication 8/30 with seed=7

Running replication 9/30 with seed=8

Running replication 10/30 with seed=9

Running replication 11/30 with seed=10

Running replication 12/30 with seed=11

Running replication 13/30 with seed=12

Running replication 14/30 with seed=13

Running replication 15/30 with seed=14

Running replication 16/30 with seed=15

Running replication 17/30 with seed=16

Running replication 18/30 with seed=17

Running replication 19/30 with seed=18

Running replication 20/30 with seed=19

Running replication 21/30 with seed=20

Running replication 22/30 with seed=21

Running replication 23/30 