# RSF - VIMP

In [25]:
################################################################################
# SET UP
################################################################################

import os
import sys
import time
import numpy as np
import pandas as pd
import joblib
import pickle
from collections import Counter
#from sksurv.util import Surv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import OneHotEncoder

# Set working directory
os.chdir(os.path.expanduser("~/PhD_Workspace/PredictRecurrence/"))

# Import custom RSF functions
sys.path.append("/Users/le7524ho/PhD_Workspace/PredictRecurrence/src/")
from src.utils import (
    log,
    load_training_data,
    beta2m,
    subset_methylation)
from src.annotation_functions import (
    run_univariate_cox_for_cpgs
)

In [33]:

################################################################################
# PARAMS
################################################################################

# Output directory and files
output_dir = "output/RSF/VIMP_analysis/" # ⚠️ ADAPT
os.makedirs(output_dir, exist_ok=True)
#outfile_univariate_cox = os.path.join(output_dir, "testset_univariate_cox.csv")


In [26]:
################################################################################
# INPUT FILES
################################################################################

infile_map = {
    "ERpHER2n_Clinical" : "./output/RSF/ERpHER2n/Clinical/None/outer_cv_models.pkl", 
    "ERpHER2n_Combined" : "./output/RSF/ERpHER2n/Combined/Unadjusted/outer_cv_models.pkl",
    "ERpHER2n_Methylation" : "./output/RSF/ERpHER2n/Methylation/Unadjusted/outer_cv_models.pkl",

    "TNBC_Clinical" : "./output/RSF/TNBC/Clinical/None/outer_cv_models.pkl",
    "TNBC_Combined" : "./output/RSF/TNBC/Combined/Unadjusted/outer_cv_models.pkl", 
    "TNBC_Methylation" : "./output/RSF/TNBC/Methylation/Unadjusted/outer_cv_models.pkl", 

    "All_Clinical" : "./output/RSF/All/Clinical/None/outer_cv_models.pkl", 
    "All_Combined" : "./output/RSF/All/Combined/Unadjusted/outer_cv_models.pkl", 
    "All_Methylation" : "./output/RSF/All/Methylation/Unadjusted/outer_cv_models.pkl" 
}

In [27]:
# Input files
infile_betavalues = "./data/train/train_methylation_unadjusted.csv" # ⚠️ ADAPT
infile_clinical = "./data/train/train_clinical.csv"
infile_train_ids = "./data/train/train_subcohorts/ERpHER2n_train_ids.csv" # sample ids of training cohort

## Load Models and Data

In [20]:
outerfolds = {}
for key, filepath in infile_map.items():
    outerfolds[key] = joblib.load(filepath)

In [28]:
# Load and prepare data
train_ids = pd.read_csv(infile_train_ids, header=None).iloc[:, 0].tolist()
beta_matrix, clinical_data = load_training_data(train_ids, infile_betavalues, infile_clinical)

# convert to M-values
mvals = beta2m(beta_matrix, beta_threshold=0.001)
infile_cpg_ids = "./data/set_definitions/CpG_prefiltered_sets/cpg_ids_atac_overlap.txt"

# admin censoring for tnbc
# subset methylation atac overlap (needed to match ids)
mvals = subset_methylation(mvals,infile_cpg_ids)
X = mvals.copy()

Loaded training data.
Successfully loaded 205799 CpG IDs for pre-filtering.
Successfully subsetted methylation data to 193246 pre-filtered CpGs.


In [29]:
# onehot encode cat clinvars
# subset clinical data aligned to X
clin = clinical_data[["Age", "Size.mm", "NHG", "LN"]].loc[X.index]
# one-hot encode the categorical clinical variables
encoder = OneHotEncoder(drop=None, dtype=float, sparse_output=False)
encoded = encoder.fit_transform(clin[["NHG", "LN"]])

encoded_cols = encoder.get_feature_names_out(["NHG", "LN"]).tolist()

# make a DataFrame for the encoded columns
encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=X.index)
# build the encoded clinical DataFrame (drop original categorical cols)
clin_encoded = pd.concat([clin.drop(columns=["NHG", "LN"]), encoded_df], axis=1)
# concatenate encoded clinical back into X
X = pd.concat([X, clin_encoded], axis=1).copy()

# build clinvars_included_encoded: replace original categorical names with encoded column names
clinvars_included_encoded = [c for c in ["Age", "Size.mm", "NHG", "LN"] if c not in ["NHG", "LN"]] + encoded_cols
log(f"Added {clinvars_included_encoded} clinical variables. New X shape: {X.shape}")


=== Added ['Age', 'Size.mm', 'NHG_1', 'NHG_2', 'NHG_3', 'LN_N+', 'LN_N0'] clinical variables. New X shape: (1008, 193253) ===



In [30]:
from sksurv.util import Surv

y = Surv.from_dataframe("RFi_event", "RFi_years", clinical_data)

## Test example: ERpHER2n_Combined models

In [31]:
def save_checkpoint(results, checkpoint_file='permutation_checkpoint.pkl'):
    """Save results after each fold."""
    with open(checkpoint_file, 'wb') as f:
        pickle.dump(results, f)
    print(f"Checkpoint saved to {checkpoint_file}")

def load_checkpoint(checkpoint_file='permutation_checkpoint.pkl'):
    """Load existing results if available."""
    try:
        with open(checkpoint_file, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return {}

In [46]:
small_dict = {k: v[0:1] for k, v in outerfolds.items()}


In [47]:
for k, v in small_dict.items():
    print(k)
    print(v[0])

ERpHER2n_Clinical
{'fold': 0, 'model': Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  ['Age', 'Size.mm']),
                                                 ('passthrough_encoded',
                                                  'passthrough',
                                                  ['NHG_1', 'NHG_2', 'NHG_3',
                                                   'LN_N+', 'LN_N0'])],
                                   verbose_feature_names_out=False)),
                ('randomsurvivalforest',
                 RandomSurvivalForest(max_features=0.15, min_samples_leaf=8,
                                      min_samples_split=16,
                                      n_estimators=1454))]), 'train_idx': array([   0,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
         12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   23,
         24,   25, 

In [None]:
for subcohort_modeltype in small_dict.keys():#outerfolds.keys():
    print(f"Current run for : {subcohort_modeltype}")
    subcohort_modeltype_dict = small_dict[subcohort_modeltype]#outerfolds[subcohort_modeltype]
    for entry in subcohort_modeltype_dict:
        fold = entry["fold"]
                print(fold)


Current run for : ERpHER2n_Clinical
{'fold': 0, 'model': Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  ['Age', 'Size.mm']),
                                                 ('passthrough_encoded',
                                                  'passthrough',
                                                  ['NHG_1', 'NHG_2', 'NHG_3',
                                                   'LN_N+', 'LN_N0'])],
                                   verbose_feature_names_out=False)),
                ('randomsurvivalforest',
                 RandomSurvivalForest(max_features=0.15, min_samples_leaf=8,
                                      min_samples_split=16,
                                      n_estimators=1454))]), 'train_idx': array([   0,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
         12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   23,
 

In [34]:
from sklearn.inspection import permutation_importance
import pickle
import pandas as pd
import numpy as np
from datetime import datetime

# Main execution
results = load_checkpoint()  # Resume if interrupted

print(f"\n{'='*80}")
print(f"PERMUTATION IMPORTANCE ANALYSIS")
print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Configuration:")
print(f"  - Folds: 10")
print(f"  - Features per fold: ~5,000")
print(f"  - Trees per model: 800")
print(f"  - n_repeats: 3")
print(f"  - max_samples: 0.5")
print(f"Estimated total time: 8-12 hours")
print(f"{'='*80}\n")

for subcohort_modeltype in small_dict.keys():#outerfolds.keys():
    print(f"Current run for : {subcohort_modeltype}")
    subcohort_modeltype_dict = small_dict[subcohort_modeltype]#outerfolds[subcohort_modeltype]
    for entry in subcohort_modeltype_dict:
        fold = entry["fold"]
        
        # Skip if already computed
        if fold in results:
            print(f"Fold {fold}: Already computed, skipping...")
            continue
        
        if entry["model"] is None:
            print(f"Fold {fold}: No model, skipping...")
            continue
        
        model = entry["model"]
        test_idx = entry["test_idx"]
        train_idx = entry["train_idx"]
        
        features_to_use = entry.get("features_after_filter2") or entry.get("features_after_filter1")
        X_test = X.iloc[test_idx][features_to_use]
        y_test = y[test_idx]
        
        print(f"\n{'─'*80}")
        print(f"   Fold {fold}/{len(subcohort_modeltype_dict)-1}")
        print(f"   Features: {len(features_to_use):,}")
        print(f"   Test samples: {len(X_test)}")
        print(f"   Started: {datetime.now().strftime('%H:%M:%S')}")
        
        # Estimate time
        est_time_min = (len(features_to_use) * 3 * 0.4) / 60  # Conservative estimate
        print(f"   Estimated duration: {est_time_min:.0f}-{est_time_min*1.5:.0f} minutes")
        print(f"{'─'*80}")
        
        try:
            # Compute permutation importance
            start_time = datetime.now()
            perm_result = permutation_importance(
                model, 
                X_test, 
                y_test, 
                n_repeats=3,
                random_state=42, 
                n_jobs=-1,
                max_samples=0.5
            )
            
            elapsed = (datetime.now() - start_time).total_seconds() / 60
            
            # Store results
            results[fold] = {
                'importances_mean': perm_result.importances_mean,
                'importances_std': perm_result.importances_std,
                'features': features_to_use,
                'elapsed_minutes': elapsed,
                'timestamp': datetime.now().isoformat()
            }
            
            # Save checkpoint after each fold
            save_checkpoint(results)
            
            print(f"   Completed in {elapsed:.1f} minutes")
            
            # Show top 5 features
            top_idx = np.argsort(perm_result.importances_mean)[-5:][::-1]
            print(f"   Top 5 features:")
            for idx in top_idx:
                print(f"      {features_to_use[idx]}: {perm_result.importances_mean[idx]:.4f}")
            
            # Progress update
            completed = len(results)
            remaining = 10 - completed
            avg_time = np.mean([results[f]['elapsed_minutes'] for f in results])
            est_remaining = remaining * avg_time
            
            print(f"\n   Progress: {completed}/10 folds complete")
            print(f"   Average time per fold: {avg_time:.1f} minutes")
            print(f"   Estimated time remaining: {est_remaining:.0f} minutes ({est_remaining/60:.1f} hours)")
            
        except Exception as e:
            print(f"   Error in fold {fold}: {e}")
            import traceback
            traceback.print_exc()
            continue

# Final save
print(f"\n{'='*80}")
print(f"ALL FOLDS COMPLETE!")
print(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"{'='*80}\n")

# Save final results
with open('permutation_importance_final.pkl', 'wb') as f:
    pickle.dump(results, f)

# Quick summary
print("Summary of computation times:")
for fold in sorted(results.keys()):
    print(f"  Fold {fold}: {results[fold]['elapsed_minutes']:.1f} minutes")

total_time = sum(results[fold]['elapsed_minutes'] for fold in results)
print(f"\nTotal computation time: {total_time:.1f} minutes ({total_time/60:.1f} hours)")


PERMUTATION IMPORTANCE ANALYSIS
Started: 2025-12-08 13:32:39
Configuration:
  - Folds: 10
  - Features per fold: ~5,000
  - Trees per model: 800
  - n_repeats: 3
  - max_samples: 0.5
Estimated total time: 8-12 hours

Current run for : ERpHER2n_Clinical


TypeError: string indices must be integers

In [None]:
for subcohort_modeltype in outerfolds.keys():
    print(subcohort_modeltype)
    print(len(outerfolds[subcohort_modeltype]))
    
    

ERpHER2n_Clinical
10
ERpHER2n_Combined
2
ERpHER2n_Methylation
2
TNBC_Clinical
2
TNBC_Combined
2
TNBC_Methylation
2
All_Clinical
2
All_Combined
2
All_Methylation
2


In [21]:
ERpHER2n_Combined_OuterDicts = outerfolds['ERpHER2n_Combined']

In [55]:
# Quick test on one fold
fold_0 = ERpHER2n_Combined_OuterDicts[0]
features = fold_0.get("features_after_filter2") or fold_0.get("features_after_filter1")
X_test = X.iloc[fold_0["test_idx"]][features]
y_test = y[fold_0["test_idx"]]

# Test different tree counts
from sksurv.ensemble import RandomSurvivalForest

for n_trees in [400, 800, 1000, 1500]:
    # Get best params
    best_params = {
        k.replace("estimator__randomsurvivalforest__", ""): v 
        for k, v in fold_0["cv_results"]["params"][fold_0["cv_results"]["rank_test_score"].argmin()].items()
        if k.startswith("estimator__")
    }
    best_params['n_estimators'] = n_trees
    best_params['n_jobs'] = -1
    
    # Train and test
    rsf = RandomSurvivalForest(**best_params)
    X_train = X.iloc[fold_0["train_idx"]][features]
    y_train = y[fold_0["train_idx"]]
    
    # Transform with preprocessing
    X_train_trans = fold_0["model"].named_steps[list(fold_0["model"].named_steps.keys())[0]].transform(X_train)
    X_test_trans = fold_0["model"].named_steps[list(fold_0["model"].named_steps.keys())[0]].transform(X_test)
    
    rsf.fit(X_train_trans, y_train)
    score = rsf.score(X_test_trans, y_test)
    
    print(f"{n_trees} trees: C-index = {score:.4f}")

400 trees: C-index = 0.6398
800 trees: C-index = 0.6186
1000 trees: C-index = 0.6294
1500 trees: C-index = 0.6304


In [None]:
# Check actual feature usage in your current models
fold_0 = ERpHER2n_Combined_OuterDicts[0]
rsf = fold_0["model"].named_steps['randomsurvivalforest']

n_trees = rsf.n_estimators
n_features_total = len(fold_0.get("features_after_filter2") or fold_0.get("features_after_filter1"))
max_features = rsf.max_features

if max_features == "sqrt":
    max_features_per_tree = int(np.sqrt(n_features_total))
elif isinstance(max_features, float):
    max_features_per_tree = int(max_features * n_features_total)
else:
    max_features_per_tree = max_features

expected_appearances = n_trees * (max_features_per_tree / n_features_total)

print(f"Total features: {n_features_total}")
print(f"Features per tree: {max_features_per_tree}")
print(f"Number of trees: {n_trees}")
print(f"Expected appearances per feature: {expected_appearances:.1f}")


NameError: name 'ERpHER2n_Combined_OuterDicts' is not defined

In [None]:
result = permutation_importance(outerfolds['ERpHER2n_Combined'][0]['model'], X_test, y_test, n_repeats=1, random_state=42)