In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
file_path = './simulated_survival_data'
train_df_control = pd.read_csv(f'{file_path}/control_train.csv', index_col=0)
X_train = train_df_control.iloc[:, :20] #gene exp
Y_train = train_df_control.iloc[:, 20:22] #survival time and event

print(X_train.shape, Y_train.shape)

(640, 20) (640, 2)


In [17]:
# clean data

invalid_count = (Y_train['time'] <= 0).sum()
valid_mask = Y_train['time'] > 0

X_filtered = X_train.loc[valid_mask].copy()
y_filtered = Y_train.loc[valid_mask].copy()

X = X_filtered
y = y_filtered

print(X.shape, y.shape)

(636, 20) (636, 2)


In [18]:
## 1. prepare scikit-survival format

from sksurv.util import Surv

y_structured = Surv.from_arrays(
    event=y_filtered['event'].values.astype(bool),
    time=y_filtered['time'].values
)
# y_structured = Surv.from_arrays(event=y['event'].values.astype(bool), time=y['time'].values)
lifelines_df = pd.concat([X, y], axis=1)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

print(X_scaled.head)

<bound method NDFrame.head of             0         1         2         3         4         5         6  \
0    0.156414 -1.846155  0.558700  0.333909 -1.161833 -0.370504 -0.689271   
1   -0.515374 -0.046981  0.626424 -0.594277  0.850448  0.228842 -0.268030   
2   -1.379102  1.281639 -0.551970  0.118678  1.417668  0.075521  0.289096   
3   -0.213755  0.215975 -0.104993 -0.674988 -1.404927 -0.900159 -0.295207   
4   -0.268595 -0.669772 -0.253986  0.253197  0.539827 -1.192863 -0.200088   
..        ...       ...       ...       ...       ...       ...       ...   
631  0.252384 -1.555519 -1.161485 -1.374491  0.202196  1.218460 -0.390326   
632  0.197544 -0.655932 -0.308165 -0.957480  0.634364  2.096572 -0.064203   
633 -0.309725 -1.057286 -1.215664 -0.917124  2.457572  0.368225  1.403348   
634  1.760480 -1.472480 -0.741597  1.033412 -1.310391  1.594794 -1.124101   
635 -1.200872  0.645008  0.802506  1.329355 -0.148940 -0.969850  0.683161   

            7         8         9        10  

In [None]:
X_scaled.to_csv(f'{file_path}/X_scaled.csv')


#### 1. Random Forest (RF)
RF is often used for classification or regression, where we use it as a classification model to predict whether an event will occur (event).

`n_estimators`: The number of trees

`random_state`: to ensure that the result can be reproduced


In [19]:
from sksurv.ensemble import RandomSurvivalForest

rsf_model = RandomSurvivalForest(n_estimators=100, random_state=42)
rsf_model.fit(X_scaled, y_structured)

risk_scores_rsf = rsf_model.predict(X_scaled)

print(risk_scores_rsf[:5])

[216.79984524 402.33692749 443.34127128 195.58181746 143.87525722]


#### 2. Cox Regression (CoxPH)

Cox Proportional risk model, which is the standard survival analysis model

Note: The model of the lifelines library usually requires a single DataFrame that contains all the data (features, time, and events).

instead of separate X and y, and it is insensitive to data scaling

In [20]:
from lifelines import CoxPHFitter

cph = CoxPHFitter()
cph.fit(lifelines_df, duration_col='time', event_col='event')

cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
baseline estimation,breslow
number of observations,636
number of events observed,415
partial log-likelihood,-2015.26
time fit was run,2025-08-08 15:08:51 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,-0.7,0.5,0.05,-0.81,-0.59,0.45,0.55,0.0,-12.82,<0.005,122.63
1,0.41,1.51,0.05,0.31,0.52,1.36,1.68,0.0,7.81,<0.005,47.3
2,-0.64,0.53,0.05,-0.74,-0.53,0.48,0.59,0.0,-11.61,<0.005,101.13
3,-0.31,0.73,0.05,-0.41,-0.2,0.66,0.82,0.0,-5.73,<0.005,26.56
4,0.41,1.5,0.05,0.3,0.51,1.36,1.66,0.0,7.75,<0.005,46.68
5,-0.49,0.61,0.05,-0.59,-0.39,0.55,0.68,0.0,-9.29,<0.005,65.76
6,0.49,1.64,0.05,0.39,0.6,1.47,1.82,0.0,9.01,<0.005,62.05
7,0.4,1.5,0.05,0.3,0.51,1.35,1.67,0.0,7.56,<0.005,44.53
8,-0.51,0.6,0.05,-0.61,-0.41,0.54,0.66,0.0,-9.89,<0.005,74.19
9,0.42,1.52,0.05,0.32,0.52,1.37,1.69,0.0,7.87,<0.005,48.0

0,1
Concordance,0.85
Partial AIC,4070.52
log-likelihood ratio test,743.36 on 20 df
-log2(p) of ll-ratio test,477.81


#### 3.Aalen's Additive Model

Unlike the Cox model, it assumes that the effects of covariates are additive rather than multipliative.

In [21]:
from lifelines import AalenAdditiveFitter

aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(lifelines_df, duration_col='time', event_col='event')

aaf.print_summary()

0,1
model,lifelines.AalenAdditiveFitter
duration col,'time'
event col,'event'
number of subjects,636
number of events observed,415
time fit was run,2025-08-08 15:08:57 UTC

Unnamed: 0,slope(coef),se(slope(coef))
0,-0.03,0.02
1,0.02,0.02
2,-0.03,0.02
3,-0.01,0.01
4,0.02,0.02
5,-0.03,0.02
6,0.02,0.02
7,0.02,0.02
8,-0.02,0.02
9,0.02,0.02

0,1
Concordance,0.56


#### 4.Survival Support Vector Model

Apply the principles of support vector machines (SVMs) to survival data. It aims to find a hyperplane that maximizes the difference between the time of occurrence of an event and the time of review.

In [22]:
from sksurv.svm import FastSurvivalSVM

svm_model = FastSurvivalSVM(random_state=42)
svm_model.fit(X_scaled, y_structured)
risk_scores_svm = svm_model.predict(X_scaled)

risk_scores_svm[:5]

array([-0.55663606,  1.20539524,  1.61091243, -0.00165148,  0.16768141])

#### 5.XGBoost for survival version

Consists of a decision tree and is an extension of Random Forest in survival analysis. It predicts an individual's survival function by averaging the survival tree on multiple subsamples.

In [24]:
from sksurv.util import Surv
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)
xgb_clf.fit(X_scaled, y['event'])

event_probabilities = xgb_clf.predict_proba(X_scaled)[:, 1]

print(event_probabilities[:5])

[0.02376696 0.99750537 0.99965906 0.9850684  0.98297656]


In [None]:
!conda install scipy -y

Channels:
 - conda-forge
 - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
 - defaults
 - pytorch
 - pyg
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/yuzimeng/anaconda3/envs/pyreadr_env

  added / updated specs:
    - scipy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.8.3   |       hbd8a1cb_0         151 KB  conda-forge
    certifi-2025.8.3           |     pyhd8ed1ab_0         155 KB  conda-forge
    conda-25.7.0               |   py39h2804cbe_0         939 KB  conda-forge
    conda-libmamba-solver-25.4.0|     pyhd8ed1ab_0          41 KB  conda-forge
    cpp-expected-1.1.0         |       h177bc72_1          24 KB  conda-forge
    fmt-11.0.2                 |       h440487c_1         176 KB  conda-forge
    libmamba-2.0.2             |       h66a2e1b_0

In [6]:
!conda update numpy scipy scikit-learn -y

Channels:
 - conda-forge
 - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [4]:
!conda install -c anaconda gfortran -y

Channels:
 - anaconda
 - conda-forge
 - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/yuzimeng/anaconda3/envs/pyreadr_env

  added / updated specs:
    - gfortran


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    cctools-986                |       h0dbccd1_3         1.9 MB  anaconda
    clang-17.0.6               |       hf41426e_4         133 KB  anaconda
    clang-17-17.0.6            |default_h892e17a_4         819 KB  anaconda
    clang_impl_osx-arm64-17.0.6|       hcb56dc5_4          19 KB  anaconda
    clang_osx-arm64-17.0.6     |       hcb56dc5_4          20 KB  anaconda
    clangxx-17.0.6             |default_h892e17a_4         133 KB  anaconda
    compiler-rt-17.0.6         |       h6a6761b_1          91 KB  ana