<a href="https://colab.research.google.com/github/osun24/nsclc-adj-chemo/blob/main/TorchSurv_DeepSurv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install necessary packages
!pip install torchsurv scikit-survival

# Import required packages
import os
import time
import datetime
import itertools
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sksurv.metrics import concordance_index_censored

# (Optional) Mount Google Drive if you plan to load/save files there
from google.colab import drive
drive.mount('/content/drive')


Collecting torchsurv
  Downloading torchsurv-0.1.5-py3-none-any.whl.metadata (15 kB)
Collecting scikit-survival
  Downloading scikit_survival-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.2 kB)
Collecting torchmetrics (from torchsurv)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting ecos (from scikit-survival)
  Downloading ecos-2.0.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.0 kB)
Collecting osqp<1.0.0,>=0.6.3 (from scikit-survival)
  Downloading osqp-0.6.7.post3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting qdldl (from osqp<1.0.0,>=0.6.3->scikit-survival)
  Downloading qdldl-0.1.7.post5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics->torchsurv)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloadin

In [4]:
import os
import sys
import math
import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Sampler
from torchsurv.loss.cox import neg_partial_log_likelihood
from sksurv.metrics import concordance_index_censored
import warnings
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import copy
import random

warnings.filterwarnings("ignore", message="Ties in event time detected; using efron's method to handle ties.")
torch.manual_seed(0); np.random.seed(0); random.seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# ---------------- utils ----------------
class Tee:
    def __init__(self, *files): self.files = files
    def write(self, data):
        for f in self.files: f.write(data)
    def flush(self):
        for f in self.files: f.flush()

def loguniform(rng, lo, hi):
    return float(np.exp(rng.uniform(np.log(lo), np.log(hi))))

# Anchors so we can occasionally pick exact values (includes 1e-3 for L1)
wd_anchors = np.array([5e-2, 1e-1, 1.5e-1, 2e-1, 3e-1, 5e-1])
l1_anchors = np.array([5e-4, 7e-4, 1e-3, 2e-3, 5e-3, 1e-2])

def sample_hparams(rng):
    # Architectures: keep the same small winners in play
    hidden_options = [[256], [512], [512, 512], [256, 256], [32]]
    layers = hidden_options[rng.integers(len(hidden_options))]

    # Moderately high dropout
    dropout = float(rng.uniform(0.30, 0.65))

    # Slightly wider LR band (strong reg can handle a touch more LR)
    lr = loguniform(rng, 2e-5, 1.5e-4)

    # Mix anchors + ranges
    r = rng.random()
    if r < 0.20:
        wd = float(rng.choice(wd_anchors))
        l1 = float(rng.choice(l1_anchors))
    elif r < 0.80:
        # heavy
        wd = loguniform(rng, 5e-2, 3e-1)    # 0.05–0.30
        l1 = loguniform(rng, 7e-4, 5e-3)    # 7e-4–5e-3
    else:
        # extreme
        wd = loguniform(rng, 3e-1, 6e-1)    # 0.30–0.60
        l1 = loguniform(rng, 5e-3, 2e-2)    # 5e-3–2e-2

    return {'layers': layers, 'dropout': dropout, 'lr': lr, 'wd': wd, 'l1': l1}

# ---------------- model & data ----------------
class DeepSurvMLP(nn.Module):
    def __init__(self, in_features, hidden_layers, dropout=0.0, activation=nn.ReLU()):
        super().__init__()
        layers, d = [], in_features
        for units in hidden_layers:
            layers += [nn.Linear(d, units), activation]
            if dropout > 0: layers.append(nn.Dropout(dropout))
            d = units
        layers.append(nn.Linear(d, 1))
        self.model = nn.Sequential(*layers)
    def forward(self, x): return self.model(x)

class SurvivalDataset(Dataset):
    def __init__(self, features, time_vals, events):
        self.x = torch.tensor(features, dtype=torch.float32)
        self.time = torch.tensor(time_vals, dtype=torch.float32)
        self.event = torch.tensor(events, dtype=torch.bool)
    def __len__(self): return len(self.x)
    def __getitem__(self, idx): return self.x[idx], self.time[idx], self.event[idx]

# ---- (1) Event-balanced batch sampler: guarantee ≥1 event per batch ----
class EventBalancedBatchSampler(Sampler):
    def __init__(self, events_numpy, batch_size, seed=0):
        events = np.asarray(events_numpy).astype(bool)
        self.pos_idx = np.where(events)[0]
        self.neg_idx = np.where(~events)[0]
        assert len(self.pos_idx) > 0, "No events in training set — cannot balance batches."
        self.bs = int(batch_size)
        self.rng = np.random.default_rng(seed)

    def __iter__(self):
        # one finite pass == one epoch
        pos = self.rng.permutation(self.pos_idx)
        neg = self.rng.permutation(self.neg_idx)
        n_total = len(pos) + len(neg)
        n_batches = math.ceil(n_total / self.bs)

        pi = ni = 0
        for _ in range(n_batches):
            take_pos = 1 if pi < len(pos) else 0
            # fill the rest with negatives if we can
            avail_neg = max(0, len(neg) - ni)
            take_neg = min(self.bs - take_pos, avail_neg)

            # if we ran out of negatives, top up with extra positives
            need = self.bs - (take_pos + take_neg)
            extra_pos = min(need, max(0, len(pos) - (pi + take_pos)))
            take_pos += extra_pos

            batch = np.concatenate([
                pos[pi:pi+take_pos],
                neg[ni:ni+take_neg]
            ])
            pi += take_pos
            ni += take_neg

            if batch.size == 0:
                break
            self.rng.shuffle(batch)
            yield batch.tolist()

    def __len__(self):
        return math.ceil((len(self.pos_idx) + len(self.neg_idx)) / self.bs)

# ---- Param groups: L2 only on non-bias, non-final weights ----
def make_optimizer(model, lr, wd):
    # find last Linear
    linears = [m for m in model.modules() if isinstance(m, nn.Linear)]
    last_linear = linears[-1] if len(linears) > 0 else None

    decay, no_decay = [], []
    for name, p in model.named_parameters():
        if not p.requires_grad:
            continue
        if name.endswith('bias'):
            no_decay.append(p); continue
        if (last_linear is not None) and (p is last_linear.weight):
            no_decay.append(p); continue
        decay.append(p)

    param_groups = [
        {'params': decay, 'weight_decay': wd},
        {'params': no_decay, 'weight_decay': 0.0},
    ]
    return optim.AdamW(param_groups, lr=lr)

# ---- (2) Reg warm-up helpers (dropout + WD) ----
def set_dropout_p(model, p):
    for m in model.modules():
        if isinstance(m, nn.Dropout):
            m.p = float(p)

def set_weight_decay(optimizer, wd):
    for g in optimizer.param_groups:
        g['weight_decay'] = float(wd)

# L1 ONLY on the first (input) Linear layer (already warmed up in train loop)
def l1_penalty_first_layer(model):
    for m in model.modules():
        if isinstance(m, nn.Linear):
            return m.weight.abs().sum()
    return torch.tensor(0.0, device=next(model.parameters()).device)

def train_one_epoch(model, optimizer, dataloader, device, l1_lambda=0.0, epoch=0, warmup_epochs=20):
    model.train()
    warm = min(1.0, (epoch + 1) / float(warmup_epochs))  # linear warmup of L1
    loss_sum, n_seen = 0.0, 0
    skipped, total_batches = 0, 0
    for x, t, e in dataloader:
        total_batches += 1
        # with balanced sampler, this should almost never trigger, but keep it safe:
        if e.sum().item() == 0:
            skipped += 1
            continue
        x, t, e = x.to(device), t.to(device), e.to(device)
        optimizer.zero_grad(set_to_none=True)
        out = torch.clamp(model(x), -20, 20)
        loss = neg_partial_log_likelihood(out, e, t, reduction='mean')
        if l1_lambda > 0:
            loss = loss + (l1_lambda * warm) * l1_penalty_first_layer(model)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        loss_sum += loss.item() * x.size(0)
        n_seen += x.size(0)
    avg_loss = loss_sum / max(n_seen, 1)
    skip_frac = skipped / max(total_batches, 1)
    return {'avg_loss': avg_loss, 'skip_frac': skip_frac, 'warm': warm}

# ---- (3) Full-risk-set correction step (1x per epoch) ----
def full_risk_set_step(model, optimizer, train_ds, device, l1_lambda=0.0, warm=1.0):
    model.train()
    X_all = train_ds.x.to(device)
    t_all = train_ds.time.to(device)
    e_all = train_ds.event.to(device)
    optimizer.zero_grad(set_to_none=True)
    out_all = torch.clamp(model(X_all), -20, 20)
    loss_full = neg_partial_log_likelihood(out_all, e_all, t_all, reduction='mean')
    if l1_lambda > 0:
        loss_full = loss_full + (l1_lambda * warm) * l1_penalty_first_layer(model)
    loss_full.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
    optimizer.step()
    return float(loss_full.detach().cpu().item())

def evaluate_ci(model, dataloader, device):
    model.eval()
    preds, times, events = [], [], []
    with torch.no_grad():
        for x, t, e in dataloader:
            x = x.to(device)
            y = torch.clamp(model(x), -20, 20)
            preds.append(y.cpu().numpy().ravel())
            times.append(t.numpy()); events.append(e.numpy())
    preds = np.concatenate(preds)
    if np.isnan(preds).any():
        print("Warning: NaN predictions detected, returning -inf for concordance index")
        return -np.inf
    times = np.concatenate(times); events = np.concatenate(events)
    return concordance_index_censored(events.astype(bool), times, preds)[0]

def count_params(in_dim, hidden_layers):
    params, d = 0, in_dim
    for h in hidden_layers:
        params += d*h + h  # weights + bias
        d = h
    params += d*1 + 1     # final layer
    return params

# --- Bootstrap SE on validation C-index for a trial's best checkpoint ---
def _bootstrap_val_ci_for_trial(tr, X_val, t_val, e_val, in_dim, device, B=150, seed=123):
    if tr['best_state'] is None:
        return np.nan, np.nan, np.nan
    hp = tr['hp']
    # rebuild model at best checkpoint
    m = DeepSurvMLP(in_dim, hp['layers'], dropout=hp['dropout']).to(device)
    m.load_state_dict(tr['best_state'])
    m.eval()
    with torch.no_grad():
        preds = m(torch.tensor(X_val, dtype=torch.float32, device=device)).cpu().numpy().ravel()
    n = len(t_val)
    rng = np.random.default_rng(seed)
    boot = []
    for _ in range(B):
        idx = rng.integers(0, n, size=n)
        boot.append(concordance_index_censored(e_val[idx].astype(bool), t_val[idx], preds[idx])[0])
    boot = np.asarray(boot, dtype=float)
    return float(np.nanmean(boot)), float(np.nanstd(boot, ddof=1)), float(np.nanpercentile(boot, 5))

# ---------------- random search with successive halving ----------------
def main():
    # logging
    original_stdout = sys.stdout
    log_path = "/content/drive/MyDrive/deepsurv_9-16-25_training_log_fRMA.txt"
    with open(log_path, "w") as log_file:
        sys.stdout = Tee(original_stdout, log_file)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        current_date = datetime.datetime.now().strftime("%Y%m%d")
        output_dir = "/content/drive/MyDrive/deepsurv_results"
        os.makedirs(output_dir, exist_ok=True)

        # ----- load & prep data -----
        train_df = pd.read_csv("/content/drive/MyDrive/affyTrainFrozen.csv")
        valid_df = pd.read_csv("/content/drive/MyDrive/affyValidationFrozen.csv")
        for df in (train_df, valid_df):
            if 'Adjuvant Chemo' in df.columns:
                df['Adjuvant Chemo'] = df['Adjuvant Chemo'].replace({'OBS':0,'ACT':1})
        binary_columns = ['Adjuvant Chemo','IS_MALE']
        for col in binary_columns:
            if col in train_df.columns: train_df[col] = train_df[col].astype(int)
            if col in valid_df.columns: valid_df[col] = valid_df[col].astype(int)

        survival_cols = ['OS_STATUS','OS_MONTHS']
        feature_cols = [c for c in train_df.columns if c not in survival_cols]

        X_train = train_df[feature_cols].values.astype(np.float32)
        X_valid = valid_df[feature_cols].values.astype(np.float32)
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train).astype(np.float32)
        X_valid = scaler.transform(X_valid).astype(np.float32)

        y_train_time = train_df['OS_MONTHS'].values.astype(np.float32)
        y_train_event = train_df['OS_STATUS'].values.astype(np.float32)
        y_valid_time = valid_df['OS_MONTHS'].values.astype(np.float32)
        y_valid_event = valid_df['OS_STATUS'].values.astype(np.float32)

        train_ds = SurvivalDataset(X_train, y_train_time, y_train_event)
        valid_ds = SurvivalDataset(X_valid, y_valid_time, y_valid_event)

        BATCH_SIZE = 64

        # (1) swap in the event-balanced sampler
        train_sampler = EventBalancedBatchSampler(y_train_event, BATCH_SIZE, seed=42)
        train_loader  = DataLoader(train_ds, batch_sampler=train_sampler)
        # evaluation loaders remain standard
        train_eval_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=False)
        valid_loader      = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False)

        # ----- search budget & rungs -----
        rng = np.random.default_rng(42)
        num_trials = 100
        rungs = [16, 64, 160, 320]  # keep your longer rungs
        eta = 3
        min_delta = 1e-4
        PRINT_EVERY = 1

        # Warm-ups
        L1_WARMUP_EPOCHS = 30
        WD_WARMUP_EPOCHS = 30
        DROPOUT_WARMUP_EPOCHS = 30
        DROPOUT_START = 0.15
        WD_START = 0.0

        # ΔVal-CI moving-average window
        DELTA_CI_MA_K = 10

        # ----- initialize trials -----
        trials = []
        for tid in range(num_trials):
            hp = sample_hparams(rng)
            model = DeepSurvMLP(X_train.shape[1], hp['layers'], dropout=hp['dropout']).to(device)
            optimizer = make_optimizer(model, lr=hp['lr'], wd=hp['wd'])
            trials.append({
                'id': tid, 'hp': hp, 'model': model, 'opt': optimizer, 'sched': None, 'sched_target': 0,
                'best_ci': -np.inf, 'best_state': copy.deepcopy(model.state_dict()),
                'best_epoch': 0, 'best_gap': np.inf, 'best_gap_abs': np.inf,
                'train_ci_at_best': np.nan, 'val_ci_at_best': np.nan,
                'hist_train_ci': [], 'hist_val_ci': [], 'hist_gap': [],
                'hist_loss': [], 'hist_skip': [],
                'hist_val_ci_delta': [], 'hist_val_ci_delta_ma': [],
                'epochs_done': 0, 'alive': True, 'no_improve': 0
            })

        # ----- successive halving loop -----
        prev_target = 0
        for rung_idx, rung_ep in enumerate(rungs, start=1):
            print(f"\n=== Rung {rung_idx}/{len(rungs)} → target {rung_ep} epochs ===")

            span = rung_ep - prev_target
            es_patience = float('inf') if rung_idx == 1 else max(5, int(0.25 * span))

            for tr in trials:
                if not tr['alive']: continue
                steps_remaining = rung_ep - tr['epochs_done']
                if steps_remaining <= 0:
                    continue

                tr['sched'] = torch.optim.lr_scheduler.CosineAnnealingLR(tr['opt'], T_max=steps_remaining)
                tr['sched_target'] = rung_ep

                while tr['epochs_done'] < rung_ep:
                    # (2) apply dropout + weight-decay warm-up schedules
                    frac_d = min(1.0, tr['epochs_done'] / float(DROPOUT_WARMUP_EPOCHS))
                    frac_w = min(1.0, tr['epochs_done'] / float(WD_WARMUP_EPOCHS))
                    p_t = DROPOUT_START + (tr['hp']['dropout'] - DROPOUT_START) * frac_d
                    wd_t = WD_START + (tr['hp']['wd'] - WD_START) * frac_w
                    set_dropout_p(tr['model'], p_t)
                    set_weight_decay(tr['opt'], wd_t)

                    # normal mini-batch training epoch
                    stats = train_one_epoch(
                        tr['model'], tr['opt'], train_loader, device,
                        l1_lambda=tr['hp']['l1'], epoch=tr['epochs_done'], warmup_epochs=L1_WARMUP_EPOCHS
                    )
                    # (3) one full-risk-set correction step per epoch
                    full_loss = full_risk_set_step(
                        tr['model'], tr['opt'], train_ds, device,
                        l1_lambda=tr['hp']['l1'], warm=stats['warm']
                    )

                    tr['epochs_done'] += 1

                    # diagnostics each epoch
                    tr_ci = evaluate_ci(tr['model'], train_eval_loader, device)
                    va_ci = evaluate_ci(tr['model'], valid_loader, device)
                    gap = tr_ci - va_ci  # overfitting gap (Train − Val)

                    tr['hist_train_ci'].append(tr_ci)
                    tr['hist_val_ci'].append(va_ci)
                    tr['hist_gap'].append(gap)
                    tr['hist_loss'].append(stats['avg_loss'])
                    tr['hist_skip'].append(stats['skip_frac'])

                    # ΔVal-CI and its moving average
                    if len(tr['hist_val_ci']) >= 2:
                        delta = tr['hist_val_ci'][-1] - tr['hist_val_ci'][-2]
                    else:
                        delta = 0.0
                    tr['hist_val_ci_delta'].append(delta)
                    ma = float(np.mean(tr['hist_val_ci_delta'][-DELTA_CI_MA_K:]))
                    tr['hist_val_ci_delta_ma'].append(ma)

                    if tr['sched'] is not None:
                        tr['sched'].step()

                    if tr['epochs_done'] % PRINT_EVERY == 0:
                        print(f"[Trial {tr['id']:02d} | Rung {rung_idx}/{len(rungs)} | "
                              f"Epoch {tr['epochs_done']:3d}] "
                              f"Loss {stats['avg_loss']:.4f} | FullStepLoss {full_loss:.4f} | "
                              f"Skip% {100*stats['skip_frac']:.1f} | "
                              f"Train CI {tr_ci:.4f} | Val CI {va_ci:.4f} | "
                              f"Gap(T−V) {gap:+.4f} | "
                              f"ΔVal-CI MA({DELTA_CI_MA_K}) {ma:+.4f} | Best {tr['best_ci']:.4f}")

                    # track best with tie/close-call handling using smallest |gap|
                    if va_ci > tr['best_ci'] + min_delta:
                        tr['best_ci'] = va_ci
                        tr['best_state'] = copy.deepcopy(tr['model'].state_dict())
                        tr['best_epoch'] = tr['epochs_done']
                        tr['best_gap'] = gap
                        tr['best_gap_abs'] = abs(gap)
                        tr['train_ci_at_best'] = tr_ci
                        tr['val_ci_at_best'] = va_ci
                        tr['no_improve'] = 0
                    else:
                        # if tied/close (within min_delta), prefer epoch with smaller |gap|
                        if abs(va_ci - tr['best_ci']) <= min_delta and abs(gap) < tr['best_gap_abs'] - 1e-12:
                            tr['best_state'] = copy.deepcopy(tr['model'].state_dict())
                            tr['best_epoch'] = tr['epochs_done']
                            tr['best_gap'] = gap
                            tr['best_gap_abs'] = abs(gap)
                            tr['train_ci_at_best'] = tr_ci
                            tr['val_ci_at_best'] = va_ci
                        tr['no_improve'] += 1
                        if tr['no_improve'] >= es_patience:
                            print(f"Trial {tr['id']} early-stopped at epoch {tr['epochs_done']} "
                                  f"(best Val CI={tr['best_ci']:.4f})")
                            break

            # prune: keep top 1/eta among alive
            alive = [tr for tr in trials if tr['alive']]
            alive.sort(key=lambda z: z['best_ci'], reverse=True)
            keep_n = max(1, math.ceil(len(alive) / eta))
            survivors = set(tr['id'] for tr in alive[:keep_n])

            print(f"Alive before prune: {len(alive)}; keeping top {keep_n}")
            for tr in alive:
                if tr['id'] not in survivors:
                    tr['alive'] = False
                    # free memory
                    del tr['model']; tr['model'] = None
                    tr['opt'] = None; tr['sched'] = None
                    if torch.cuda.is_available(): torch.cuda.empty_cache()

            prev_target = rung_ep

        # ----- save plots per trial -----
        out_date = current_date
        for tr in trials:
            cfg = (f"trial{tr['id']}_layers-{'-'.join(map(str, tr['hp']['layers']))}"
                   f"_drop{tr['hp']['dropout']:.2f}_lr{tr['hp']['lr']:.2e}"
                   f"_wd{tr['hp']['wd']:.2e}_l1{tr['hp']['l1']:.2e}")
            epochs = range(1, len(tr['hist_train_ci'])+1)
            plt.figure()
            plt.plot(epochs, tr['hist_train_ci'], label='Train CI')
            plt.plot(epochs, tr['hist_val_ci'], label='Val CI')
            # Optional: plot ΔVal-CI MA on a secondary axis as trend
            ax = plt.gca()
            ax2 = ax.twinx()
            ax2.plot(epochs, tr['hist_val_ci_delta_ma'], linestyle='--', alpha=0.5, label=f'ΔVal-CI MA')
            ax.set_xlabel('Epoch'); ax.set_ylabel('Concordance Index')
            ax2.set_ylabel('ΔVal-CI MA')
            lines, labels = ax.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax.legend(lines + lines2, labels + labels2, loc='lower right')
            plt.grid(True, alpha=0.3); plt.title(cfg)
            ax.set_ylim(0.4, 1.0)  # constant y-scale across plots
            plot_path = os.path.join(output_dir, f"{out_date}_ci_{cfg}.png")
            plt.savefig(plot_path, dpi=150, bbox_inches='tight'); plt.close()
            print(f"Saved CI plot to {plot_path}")

        # ----- build results table (needed before 1-SE selection) -----
        results = []
        for tr in trials:
            row = {
                'trial_id': tr['id'],
                'val_ci': tr['best_ci'],
                'best_epoch': tr.get('best_epoch', np.nan),
                'overfit_gap_at_best': tr.get('best_gap', np.nan),     # Train−Val at chosen epoch
                'train_ci_at_best': tr.get('train_ci_at_best', np.nan),
                'val_ci_at_best': tr.get('val_ci_at_best', np.nan),
                'epochs_trained': len(tr['hist_val_ci']),
                'alive_final': tr['alive'],
                'avg_loss_last': tr['hist_loss'][-1] if tr['hist_loss'] else np.nan,
                'skip_frac_last': tr['hist_skip'][-1] if tr['hist_skip'] else np.nan,
                'val_ci_ma10_last': tr['hist_val_ci_delta_ma'][-1] if tr['hist_val_ci_delta_ma'] else np.nan
            }
            row.update({
                'layers': '-'.join(map(str, tr['hp']['layers'])),
                'dropout': tr['hp']['dropout'], 'lr': tr['hp']['lr'],
                'weight_decay(L2)': tr['hp']['wd'], 'l1_lambda': tr['hp']['l1'],
                'param_count': count_params(X_train.shape[1], tr['hp']['layers'])
            })
            results.append(row)

        df = pd.DataFrame(results).sort_values('val_ci', ascending=False)

        # --- 1-SE + simplicity selection on the validation set (no nested CV) ---
        boot_rows = []
        for tr in trials:
            mu, se, p05 = _bootstrap_val_ci_for_trial(
                tr, X_valid, y_valid_time, y_valid_event, X_train.shape[1], device, B=150, seed=123
            )
            boot_rows.append({'trial_id': tr['id'], 'boot_mean_val_ci': mu, 'boot_se_val_ci': se, 'boot_p05_val_ci': p05})

        boot_df = pd.DataFrame(boot_rows)
        df = df.merge(boot_df, on='trial_id', how='left')

        # 1-SE threshold
        best_mu_idx = df['boot_mean_val_ci'].idxmax()
        best_mu = df.loc[best_mu_idx, 'boot_mean_val_ci']
        best_se = df.loc[best_mu_idx, 'boot_se_val_ci']
        one_se_threshold = best_mu - best_se

        # Candidates within 1-SE; prefer simpler (param_count), then smaller |gap|, then higher pessimistic bound
        df['abs_gap'] = df['overfit_gap_at_best'].abs()
        cands = df[df['boot_mean_val_ci'] >= one_se_threshold].copy()
        one_se_pick = cands.sort_values(['param_count', 'abs_gap', 'boot_p05_val_ci'],
                                        ascending=[True, True, False]).iloc[0]

        print("\n[Selection] 1-SE threshold:", one_se_threshold)
        print("[Selection] 1-SE pick:",
              dict(one_se_pick[['trial_id','boot_mean_val_ci','boot_se_val_ci','param_count','overfit_gap_at_best','boot_p05_val_ci']]))

        # Use the chosen trial as the final model
        selected_id = int(one_se_pick['trial_id'])
        best_trial = next(tr for tr in trials if tr['id'] == selected_id)
        best_hp = best_trial['hp']

        # (optional) annotate CSV
        df['selected_via'] = (df['trial_id'] == selected_id).astype(int)
        csv_path = os.path.join(output_dir, f"{current_date}_deepsurv_randomSH_results.csv")
        df.to_csv(csv_path, index=False)
        print(f"Hyperparameter search results saved to {csv_path}")

        # ----- save best model -----
        best_model_path = os.path.join(output_dir, f"{out_date}_best_deepsurv_model.pth")
        torch.save(best_trial['best_state'], best_model_path)
        print(f"Best model saved to {best_model_path}")

        # ----- test evaluation -----
        test_df = pd.read_csv("/content/drive/MyDrive/affyTestFrozen.csv")
        if 'Adjuvant Chemo' in test_df.columns:
            test_df['Adjuvant Chemo'] = test_df['Adjuvant Chemo'].replace({'OBS':0,'ACT':1})
        for col in binary_columns:
            if col in test_df.columns: test_df[col] = test_df[col].astype(int)
        X_test = scaler.transform(test_df[feature_cols].values.astype(np.float32)).astype(np.float32)
        y_test_time = test_df['OS_MONTHS'].values.astype(np.float32)
        y_test_event = test_df['OS_STATUS'].values.astype(np.float32)

        test_loader = DataLoader(SurvivalDataset(X_test, y_test_time, y_test_event),
                                 batch_size=BATCH_SIZE, shuffle=False)

        # rebuild & load best
        final_model = DeepSurvMLP(X_train.shape[1], best_hp['layers'], dropout=best_hp['dropout']).to(device)
        final_model.load_state_dict(torch.load(best_model_path, map_location=device))
        test_ci = evaluate_ci(final_model, test_loader, device)
        print(f"Test CI: {test_ci:.4f}")

        sys.stdout.flush()

    sys.stdout = original_stdout
    print("Training completed. Check your log file at:", log_path)

if __name__ == "__main__":
    main()

  df['Adjuvant Chemo'] = df['Adjuvant Chemo'].replace({'OBS':0,'ACT':1})



=== Rung 1/4 → target 16 epochs ===
[Trial 00 | Rung 1/4 | Epoch   1] Loss 7.0758 | FullStepLoss 8.8698 | Skip% 0.0 | Train CI 0.6122 | Val CI 0.5766 | Gap(T−V) +0.0356 | ΔVal-CI MA(10) +0.0000 | Best -inf
[Trial 00 | Rung 1/4 | Epoch   2] Loss 8.6257 | FullStepLoss 10.9897 | Skip% 0.0 | Train CI 0.7110 | Val CI 0.6087 | Gap(T−V) +0.1023 | ΔVal-CI MA(10) +0.0160 | Best 0.5766
[Trial 00 | Rung 1/4 | Epoch   3] Loss 10.2127 | FullStepLoss 12.9538 | Skip% 0.0 | Train CI 0.7290 | Val CI 0.6088 | Gap(T−V) +0.1202 | ΔVal-CI MA(10) +0.0107 | Best 0.6087
[Trial 00 | Rung 1/4 | Epoch   4] Loss 12.1492 | FullStepLoss 14.6535 | Skip% 0.0 | Train CI 0.7579 | Val CI 0.6136 | Gap(T−V) +0.1443 | ΔVal-CI MA(10) +0.0092 | Best 0.6087
[Trial 00 | Rung 1/4 | Epoch   5] Loss 14.5674 | FullStepLoss 16.3424 | Skip% 0.0 | Train CI 0.7264 | Val CI 0.5977 | Gap(T−V) +0.1288 | ΔVal-CI MA(10) +0.0042 | Best 0.6136
[Trial 00 | Rung 1/4 | Epoch   6] Loss 16.2351 | FullStepLoss 18.2650 | Skip% 0.0 | Train CI 0.637

  test_df['Adjuvant Chemo'] = test_df['Adjuvant Chemo'].replace({'OBS':0,'ACT':1})


Test CI: 0.6338
Training completed. Check your log file at: /content/drive/MyDrive/deepsurv_9-16-25_training_log_fRMA.txt


**SINGLE MODEL**

In [None]:
import os
import sys
import math
import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Sampler
from torchsurv.loss.cox import neg_partial_log_likelihood
from sksurv.metrics import concordance_index_censored
import warnings
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import copy
import random

warnings.filterwarnings("ignore", message="Ties in event time detected; using efron's method to handle ties.")
torch.manual_seed(0); np.random.seed(0); random.seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# ---------------- utils ----------------
class Tee:
    def __init__(self, *files): self.files = files
    def write(self, data):
        for f in self.files: f.write(data)
    def flush(self):
        for f in self.files: f.flush()

def loguniform(rng, lo, hi):
    return float(np.exp(rng.uniform(np.log(lo), np.log(hi))))

# ---------------- model & data ----------------
class DeepSurvMLP(nn.Module):
    def __init__(self, in_features, hidden_layers, dropout=0.0, activation=nn.ReLU()):
        super().__init__()
        layers, d = [], in_features
        for units in hidden_layers:
            layers += [nn.Linear(d, units), activation]
            if dropout > 0: layers.append(nn.Dropout(dropout))
            d = units
        layers.append(nn.Linear(d, 1))
        self.model = nn.Sequential(*layers)
    def forward(self, x): return self.model(x)

class SurvivalDataset(Dataset):
    def __init__(self, features, time_vals, events):
        self.x = torch.tensor(features, dtype=torch.float32)
        self.time = torch.tensor(time_vals, dtype=torch.float32)
        self.event = torch.tensor(events, dtype=torch.bool)
    def __len__(self): return len(self.x)
    def __getitem__(self, idx): return self.x[idx], self.time[idx], self.event[idx]

# ---- (1) Event-balanced batch sampler: guarantee ≥1 event per batch ----
class EventBalancedBatchSampler(Sampler):
    def __init__(self, events_numpy, batch_size, seed=0):
        events = np.asarray(events_numpy).astype(bool)
        self.pos_idx = np.where(events)[0]
        self.neg_idx = np.where(~events)[0]
        assert len(self.pos_idx) > 0, "No events in training set — cannot balance batches."
        self.bs = int(batch_size)
        self.rng = np.random.default_rng(seed)

    def __iter__(self):
        # one finite pass == one epoch
        pos = self.rng.permutation(self.pos_idx)
        neg = self.rng.permutation(self.neg_idx)
        n_total = len(pos) + len(neg)
        n_batches = math.ceil(n_total / self.bs)

        pi = ni = 0
        for _ in range(n_batches):
            take_pos = 1 if pi < len(pos) else 0
            # fill the rest with negatives if we can
            avail_neg = max(0, len(neg) - ni)
            take_neg = min(self.bs - take_pos, avail_neg)

            # if we ran out of negatives, top up with extra positives
            need = self.bs - (take_pos + take_neg)
            extra_pos = min(need, max(0, len(pos) - (pi + take_pos)))
            take_pos += extra_pos

            batch = np.concatenate([
                pos[pi:pi+take_pos],
                neg[ni:ni+take_neg]
            ])
            pi += take_pos
            ni += take_neg

            if batch.size == 0:
                break
            self.rng.shuffle(batch)
            yield batch.tolist()

    def __len__(self):
        return math.ceil((len(self.pos_idx) + len(self.neg_idx)) / self.bs)

# ---- Param groups: L2 only on non-bias, non-final weights ----
def make_optimizer(model, lr, wd):
    # find last Linear
    linears = [m for m in model.modules() if isinstance(m, nn.Linear)]
    last_linear = linears[-1] if len(linears) > 0 else None

    decay, no_decay = [], []
    for name, p in model.named_parameters():
        if not p.requires_grad:
            continue
        if name.endswith('bias'):
            no_decay.append(p); continue
        if (last_linear is not None) and (p is last_linear.weight):
            no_decay.append(p); continue
        decay.append(p)

    param_groups = [
        {'params': decay, 'weight_decay': wd},
        {'params': no_decay, 'weight_decay': 0.0},
    ]
    return optim.AdamW(param_groups, lr=lr)

# ---- (2) Reg warm-up helpers (dropout + WD) ----
def set_dropout_p(model, p):
    for m in model.modules():
        if isinstance(m, nn.Dropout):
            m.p = float(p)

def set_weight_decay(optimizer, wd):
    for g in optimizer.param_groups:
        g['weight_decay'] = float(wd)

# L1 ONLY on the first (input) Linear layer (already warmed up in train loop)
def l1_penalty_first_layer(model):
    for m in model.modules():
        if isinstance(m, nn.Linear):
            return m.weight.abs().sum()
    return torch.tensor(0.0, device=next(model.parameters()).device)

def train_one_epoch(model, optimizer, dataloader, device, l1_lambda=0.0, epoch=0, warmup_epochs=20):
    model.train()
    warm = min(1.0, (epoch + 1) / float(warmup_epochs))  # linear warmup of L1
    loss_sum, n_seen = 0.0, 0
    skipped, total_batches = 0, 0
    for x, t, e in dataloader:
        total_batches += 1
        # with balanced sampler, this should almost never trigger, but keep it safe:
        if e.sum().item() == 0:
            skipped += 1
            continue
        x, t, e = x.to(device), t.to(device), e.to(device)
        optimizer.zero_grad(set_to_none=True)
        out = torch.clamp(model(x), -20, 20)
        loss = neg_partial_log_likelihood(out, e, t, reduction='mean')
        if l1_lambda > 0:
            loss = loss + (l1_lambda * warm) * l1_penalty_first_layer(model)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        loss_sum += loss.item() * x.size(0)
        n_seen += x.size(0)
    avg_loss = loss_sum / max(n_seen, 1)
    skip_frac = skipped / max(total_batches, 1)
    return {'avg_loss': avg_loss, 'skip_frac': skip_frac, 'warm': warm}

# ---- (3) Full-risk-set correction step (1x per epoch) ----
def full_risk_set_step(model, optimizer, train_ds, device, l1_lambda=0.0, warm=1.0):
    model.train()
    X_all = train_ds.x.to(device)
    t_all = train_ds.time.to(device)
    e_all = train_ds.event.to(device)
    optimizer.zero_grad(set_to_none=True)
    out_all = torch.clamp(model(X_all), -20, 20)
    loss_full = neg_partial_log_likelihood(out_all, e_all, t_all, reduction='mean')
    if l1_lambda > 0:
        loss_full = loss_full + (l1_lambda * warm) * l1_penalty_first_layer(model)
    loss_full.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
    optimizer.step()
    return float(loss_full.detach().cpu().item())

def evaluate_ci(model, dataloader, device):
    model.eval()
    preds, times, events = [], [], []
    with torch.no_grad():
        for x, t, e in dataloader:
            x = x.to(device)
            y = torch.clamp(model(x), -20, 20)
            preds.append(y.cpu().numpy().ravel())
            times.append(t.numpy()); events.append(e.numpy())
    preds = np.concatenate(preds)
    if np.isnan(preds).any():
        print("Warning: NaN predictions detected, returning -inf for concordance index")
        return -np.inf
    times = np.concatenate(times); events = np.concatenate(events)
    return concordance_index_censored(events.astype(bool), times, preds)[0]

def count_params(in_dim, hidden_layers):
    params, d = 0, in_dim
    for h in hidden_layers:
        params += d*h + h  # weights + bias
        d = h
    params += d*1 + 1     # final layer
    return params

# single model - trial 71
def main():
    # ----- fixed hyperparameters (single run) -----
    HIDDEN_LAYERS = [512]
    DROPOUT = 0.355164077
    LR = 2.69E-05
    WD = 0.4063904607
    L1 = 0.008506512284

    MAX_EPOCHS = 30
    BATCH_SIZE = 64
    L1_WARMUP_EPOCHS = 30
    WD_WARMUP_EPOCHS = 30
    DROPOUT_WARMUP_EPOCHS = 30
    DROPOUT_START = 0.15
    WD_START = 0.0
    PRINT_EVERY = 1
    MIN_DELTA = 1e-4
    DELTA_CI_MA_K = 10

    # logging
    original_stdout = sys.stdout
    log_path = "/content/drive/MyDrive/deepsurv_single_run_log_71.txt"
    with open(log_path, "w") as log_file:
        sys.stdout = Tee(original_stdout, log_file)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        current_date = datetime.datetime.now().strftime("%Y%m%d")
        output_dir = "/content/drive/MyDrive/deepsurv_results_single"
        os.makedirs(output_dir, exist_ok=True)

        # ----- load & prep data -----
        train_df = pd.read_csv("/content/drive/MyDrive/affyTrainFrozen.csv")
        valid_df = pd.read_csv("/content/drive/MyDrive/affyValidationFrozen.csv")
        for df in (train_df, valid_df):
            if 'Adjuvant Chemo' in df.columns:
                df['Adjuvant Chemo'] = df['Adjuvant Chemo'].replace({'OBS':0,'ACT':1})
        binary_columns = ['Adjuvant Chemo','IS_MALE']
        for col in binary_columns:
            if col in train_df.columns: train_df[col] = train_df[col].astype(int)
            if col in valid_df.columns: valid_df[col] = valid_df[col].astype(int)

        survival_cols = ['OS_STATUS','OS_MONTHS']
        feature_cols = [c for c in train_df.columns if c not in survival_cols]

        X_train = train_df[feature_cols].values.astype(np.float32)
        X_valid = valid_df[feature_cols].values.astype(np.float32)
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train).astype(np.float32)
        X_valid = scaler.transform(X_valid).astype(np.float32)

        y_train_time = train_df['OS_MONTHS'].values.astype(np.float32)
        y_train_event = train_df['OS_STATUS'].values.astype(np.float32)
        y_valid_time = valid_df['OS_MONTHS'].values.astype(np.float32)
        y_valid_event = valid_df['OS_STATUS'].values.astype(np.float32)

        train_ds = SurvivalDataset(X_train, y_train_time, y_train_event)
        valid_ds = SurvivalDataset(X_valid, y_valid_time, y_valid_event)

        # loaders (event-balanced batches for training)
        train_sampler = EventBalancedBatchSampler(y_train_event, BATCH_SIZE, seed=42)
        train_loader  = DataLoader(train_ds, batch_sampler=train_sampler)
        train_eval_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=False)
        valid_loader      = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False)

        # ----- build single model -----
        model = DeepSurvMLP(X_train.shape[1], HIDDEN_LAYERS, dropout=DROPOUT).to(device)
        optimizer = make_optimizer(model, lr=LR, wd=WD)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=MAX_EPOCHS)

        # tracking
        best_state = copy.deepcopy(model.state_dict())
        best_val_ci = -np.inf
        best_epoch = 0
        best_gap = np.inf
        best_gap_abs = np.inf
        train_ci_at_best = np.nan
        val_ci_at_best = np.nan

        hist_train_ci, hist_val_ci, hist_gap = [], [], []
        hist_loss, hist_skip, hist_val_ci_delta, hist_val_ci_delta_ma = [], [], [], []

        # ----- train loop -----
        for epoch in range(1, MAX_EPOCHS + 1):
            # warm up dropout & weight decay schedules
            frac_d = min(1.0, (epoch-1) / float(DROPOUT_WARMUP_EPOCHS))
            frac_w = min(1.0, (epoch-1) / float(WD_WARMUP_EPOCHS))
            p_t = DROPOUT_START + (DROPOUT - DROPOUT_START) * frac_d
            wd_t = WD_START + (WD - WD_START) * frac_w
            set_dropout_p(model, p_t)
            set_weight_decay(optimizer, wd_t)

            stats = train_one_epoch(
                model, optimizer, train_loader, device,
                l1_lambda=L1, epoch=epoch-1, warmup_epochs=L1_WARMUP_EPOCHS
            )
            full_loss = full_risk_set_step(
                model, optimizer, train_ds, device,
                l1_lambda=L1, warm=stats['warm']
            )

            # eval
            tr_ci = evaluate_ci(model, train_eval_loader, device)
            va_ci = evaluate_ci(model, valid_loader, device)
            gap = tr_ci - va_ci

            hist_train_ci.append(tr_ci)
            hist_val_ci.append(va_ci)
            hist_gap.append(gap)
            hist_loss.append(stats['avg_loss'])
            hist_skip.append(stats['skip_frac'])

            if len(hist_val_ci) >= 2:
                delta = hist_val_ci[-1] - hist_val_ci[-2]
            else:
                delta = 0.0
            hist_val_ci_delta.append(delta)
            ma = float(np.mean(hist_val_ci_delta[-DELTA_CI_MA_K:]))
            hist_val_ci_delta_ma.append(ma)

            scheduler.step()

            if epoch % PRINT_EVERY == 0:
                print(f"[Epoch {epoch:3d}] "
                      f"Loss {stats['avg_loss']:.4f} | FullStepLoss {full_loss:.4f} | "
                      f"Skip% {100*stats['skip_frac']:.1f} | "
                      f"Train CI {tr_ci:.4f} | Val CI {va_ci:.4f} | "
                      f"Gap(T−V) {gap:+.4f} | ΔVal-CI MA({DELTA_CI_MA_K}) {ma:+.4f} | "
                      f"Best {best_val_ci:.4f}")

            # best-by-val-ci; break ties by smaller |gap|
            if va_ci > best_val_ci + MIN_DELTA:
                best_val_ci = va_ci
                best_state = copy.deepcopy(model.state_dict())
                best_epoch = epoch
                best_gap = gap
                best_gap_abs = abs(gap)
                train_ci_at_best = tr_ci
                val_ci_at_best = va_ci
            else:
                if abs(va_ci - best_val_ci) <= MIN_DELTA and abs(gap) < best_gap_abs - 1e-12:
                    best_state = copy.deepcopy(model.state_dict())
                    best_epoch = epoch
                    best_gap = gap
                    best_gap_abs = abs(gap)
                    train_ci_at_best = tr_ci
                    val_ci_at_best = va_ci

        print("\n=== Training complete ===")
        print(f"Best Val CI: {best_val_ci:.4f} at epoch {best_epoch} (Train CI {train_ci_at_best:.4f}, Gap {best_gap:+.4f})")

        # ----- save artifacts -----
        # plot (optional, small)
        epochs = range(1, len(hist_train_ci)+1)
        plt.figure()
        plt.plot(epochs, hist_train_ci, label='Train CI')
        plt.plot(epochs, hist_val_ci, label='Val CI')
        ax = plt.gca()
        ax2 = ax.twinx()
        ax2.plot(epochs, hist_val_ci_delta_ma, linestyle='--', alpha=0.5, label='ΔVal-CI MA')
        ax.set_xlabel('Epoch'); ax.set_ylabel('Concordance Index'); ax2.set_ylabel('ΔVal-CI MA')
        lines, labels = ax.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax.legend(lines + lines2, labels + labels2, loc='lower right')
        ax.set_ylim(0.4, 1.0)
        plt.grid(True, alpha=0.3)
        cfg = f"layers-{'-'.join(map(str,HIDDEN_LAYERS))}_drop{DROPOUT:.2f}_lr{LR:.2e}_wd{WD:.2e}_l1{L1:.2e}"
        plt.title(cfg)
        plot_path = os.path.join(output_dir, f"{current_date}_ci_single_{cfg}.png")
        plt.savefig(plot_path, dpi=150, bbox_inches='tight'); plt.close()
        print(f"Saved CI plot to {plot_path}")

        # save best model
        best_model_path = os.path.join(output_dir, f"{current_date}_single_deepsurv_model.pth")
        torch.save(best_state, best_model_path)
        print(f"Best model saved to {best_model_path}")

        # save brief results CSV
        results = pd.DataFrame([{
            'layers': '-'.join(map(str, HIDDEN_LAYERS)),
            'dropout': DROPOUT, 'lr': LR,
            'weight_decay(L2)': WD, 'l1_lambda': L1,
            'param_count': count_params(X_train.shape[1], HIDDEN_LAYERS),
            'best_epoch': best_epoch,
            'train_ci_at_best': train_ci_at_best,
            'val_ci_at_best': val_ci_at_best,
            'overfit_gap_at_best': best_gap,
            'epochs_trained': len(hist_val_ci),
            'avg_loss_last': hist_loss[-1] if hist_loss else np.nan,
            'skip_frac_last': hist_skip[-1] if hist_skip else np.nan,
            'val_ci_ma10_last': hist_val_ci_delta_ma[-1] if hist_val_ci_delta_ma else np.nan
        }])
        csv_path = os.path.join(output_dir, f"{current_date}_single_run_results.csv")
        results.to_csv(csv_path, index=False)
        print(f"Single-run results saved to {csv_path}")

        # ----- test evaluation -----
        test_df = pd.read_csv("/content/drive/MyDrive/affyTestFrozen.csv")
        if 'Adjuvant Chemo' in test_df.columns:
            test_df['Adjuvant Chemo'] = test_df['Adjuvant Chemo'].replace({'OBS':0,'ACT':1})
        for col in binary_columns:
            if col in test_df.columns: test_df[col] = test_df[col].astype(int)

        X_test = scaler.transform(test_df[feature_cols].values.astype(np.float32)).astype(np.float32)
        y_test_time = test_df['OS_MONTHS'].values.astype(np.float32)
        y_test_event = test_df['OS_STATUS'].values.astype(np.float32)

        test_loader = DataLoader(SurvivalDataset(X_test, y_test_time, y_test_event),
                                 batch_size=BATCH_SIZE, shuffle=False)

        # rebuild & load best
        final_model = DeepSurvMLP(X_train.shape[1], HIDDEN_LAYERS, dropout=DROPOUT).to(device)
        final_model.load_state_dict(torch.load(best_model_path, map_location=device))
        test_ci = evaluate_ci(final_model, test_loader, device)
        print(f"Test CI: {test_ci:.4f}")

        sys.stdout.flush()

    sys.stdout = original_stdout
    print("Single-model training completed. Check your log file at:", log_path)

if __name__ == "__main__":
    main()

  df['Adjuvant Chemo'] = df['Adjuvant Chemo'].replace({'OBS':0,'ACT':1})


[Epoch   1] Loss 12.1418 | FullStepLoss 15.2923 | Skip% 0.0 | Train CI 0.5215 | Val CI 0.5354 | Gap(T−V) -0.0139 | ΔVal-CI MA(10) +0.0000 | Best -inf
[Epoch   2] Loss 20.1175 | FullStepLoss 22.7214 | Skip% 0.0 | Train CI 0.6804 | Val CI 0.6259 | Gap(T−V) +0.0546 | ΔVal-CI MA(10) +0.0452 | Best 0.5354
[Epoch   3] Loss 27.5403 | FullStepLoss 30.2976 | Skip% 0.0 | Train CI 0.6819 | Val CI 0.6455 | Gap(T−V) +0.0365 | ΔVal-CI MA(10) +0.0367 | Best 0.6259
[Epoch   4] Loss 34.7130 | FullStepLoss 37.2511 | Skip% 0.0 | Train CI 0.7156 | Val CI 0.6512 | Gap(T−V) +0.0644 | ΔVal-CI MA(10) +0.0289 | Best 0.6455
[Epoch   5] Loss 41.1263 | FullStepLoss 43.5886 | Skip% 0.0 | Train CI 0.7696 | Val CI 0.6482 | Gap(T−V) +0.1214 | ΔVal-CI MA(10) +0.0226 | Best 0.6512
[Epoch   6] Loss 47.3751 | FullStepLoss 49.5353 | Skip% 0.0 | Train CI 0.6963 | Val CI 0.6253 | Gap(T−V) +0.0710 | ΔVal-CI MA(10) +0.0150 | Best 0.6512
[Epoch   7] Loss 52.5724 | FullStepLoss 54.6075 | Skip% 0.0 | Train CI 0.7819 | Val CI 0.6

  test_df['Adjuvant Chemo'] = test_df['Adjuvant Chemo'].replace({'OBS':0,'ACT':1})


Test CI: 0.6184
Single-model training completed. Check your log file at: /content/drive/MyDrive/deepsurv_single_run_log_71.txt
