In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Build the train/test feature tables and export them as CSV files for the
# modelling cell.

# Fixed seed for the split so the exported CSVs are the same on every
# Restart & Run All.
SPLIT_SEED = 1234

data = pd.read_csv("/kaggle/input/breast-test/res_data1.csv")
train, test = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)

# Work on explicit copies: assigning a new column straight onto the frames
# returned by train_test_split can trigger pandas' SettingWithCopyWarning.
train = train.copy()
test = test.copy()

# Tag every row with its partition so both partitions can be encoded together
# (guaranteeing identical dummy columns) and separated again afterwards.
train["flag"] = 1
test["flag"] = 0

data = pd.concat([train, test])

# Columns treated as categorical / continuous predictors.
features_cat = ["hualiao","xiluoda","fangliao","neifenmi","baxiang","fufa","yuanchuzhuanyi","zuzhixue_leixing",
                "T_fenqi","fenhua_xuhao","linbajie_zhuangtai","N_fenqi","TNM_fenqi","012/3","HR","HER-2",
                "Ki67","CK56","EGFR","fenxing_xuhao","shoushu_leibie","OP4"]
features_con = ["mm","LN","CCI_score","age_score","age-CCI_score"]

# pd.get_dummies only expands object/category columns; numeric columns are
# passed through under their original name.
df_dummy = pd.get_dummies(data[features_cat])

# Drop the raw categorical columns before attaching the encoded frame.
# Otherwise every passed-through column exists twice under the same label,
# the selection below returns it twice, and the CSV round-trip renames the
# copy to "<name>.1".
data = pd.concat([data.drop(columns=features_cat), df_dummy], axis = 1)

train = data[data["flag"] == 1]
test = data[data["flag"] == 0]

# Final layout: the two outcome columns first, then the encoded categorical
# features, then the continuous features.
features = df_dummy.columns.to_list() + features_con
train_sel = train[["OS_month", "siwang"] + features]
test_sel = test[["OS_month", "siwang"] + features]
train_sel.to_csv("/kaggle/working/data_train.csv", index = False)
test_sel.to_csv("/kaggle/working/data_test.csv", index = False)

In [7]:
pip install pycox

Collecting pycox
  Downloading pycox-0.2.3-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.7/73.7 kB[0m [31m723.4 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting py7zr>=0.11.3
  Downloading py7zr-0.20.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting torchtuples>=0.2.0
  Downloading torchtuples-0.2.2-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting multivolumefile>=0.2.3
  Downloading multivolumefile-0.2.3-py3-none-any.whl (17 kB)
Collecting pyzstd>=0.14.4
  Downloading pyzstd-0.15.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (379 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m379.2/379.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pybcj>=0.6.0
  Do

In [9]:
import ast

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# For preprocessing
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper 

import torch # For building the networks 
import torchtuples as tt # Some useful functions

from pycox.datasets import metabric
from pycox.models import MTLR
from pycox.evaluation import EvalSurv


# Fix the NumPy and PyTorch global seeds for repeatability.
np.random.seed(1234)
_ = torch.manual_seed(123)


# Feature tables exported by the data-preparation cell.
df_train = pd.read_csv("/kaggle/working/data_train.csv")
df_test = pd.read_csv("/kaggle/working/data_test.csv")


# Only these two columns are z-scored; every remaining predictor is passed
# through unchanged. The two outcome columns are excluded from the inputs.
cols_standardize = ['age_score', "linbajie_zhuangtai"]
excluded_cols = {"siwang", "OS_month", *cols_standardize}
cols_leave = [x for x in df_train.columns.to_list() if x not in excluded_cols]

standardize = [([col], StandardScaler()) for col in cols_standardize]
leave = [(col, None) for col in cols_leave]

x_mapper = DataFrameMapper(standardize + leave)

# Scaling statistics are learned on the training rows only and re-applied to
# the test rows.
x_train = x_mapper.fit_transform(df_train).astype('float32')
x_test = x_mapper.transform(df_test).astype('float32')


# Size of the discretisation grid handed to the MTLR label transform.
num_durations = 108
labtrans = MTLR.label_transform(num_durations)

# NOTE(review): pycox label transforms take (durations, events) in that order,
# but this returns ("siwang", "OS_month"). If "siwang" is the event indicator
# and "OS_month" the follow-up time, the two are swapped here and wherever the
# columns are passed positionally -- confirm against the dataset.
get_target = lambda df: (df['siwang'].values, df['OS_month'].values)
y_train = labtrans.fit_transform(*get_target(df_train))

# Use transform(), not fit_transform(): the grid must be the one fitted on the
# training targets. Refitting on the test targets would overwrite
# labtrans.cuts, which the models below use as their duration_index.
y_test = labtrans.transform(*get_target(df_test))
train = (x_train, y_train)

durations_test, events_test = get_target(df_test)



# Network input width follows the mapped feature matrix; output width follows
# the label transform.
in_features = x_train.shape[1]
out_features = labtrans.out_features



# Candidate values for each hyper-parameter of the grid search.
list_num_nodes = [[32, 32], [32, 16], [32, 8], [16, 8], [16, 4], [8, 4]]
list_batch_norm = [False, True]
list_dropout = [0.1, 0.2, 0.3, 0.4, 0.5]
list_batch_size = [64, 128, 256, 512, 1024]
list_lr = [0.1, 0.01, 0.001, 0.0001]

# Full Cartesian product, one row per configuration. The left-most `for`
# varies slowest, which reproduces the ordering of the equivalent nested loops.
parameters = [
    [num_nodes, batch_norm, dropout, batch_size, lr]
    for num_nodes in list_num_nodes
    for batch_norm in list_batch_norm
    for dropout in list_dropout
    for batch_size in list_batch_size
    for lr in list_lr
]


# Columns 0-4 hold the configuration; "cindex" (column 5) receives the
# cross-validated score. It is created as float because it will hold means;
# an int column would need an implicit dtype upcast on the first assignment.
mtlr_cv_results = pd.DataFrame(parameters)
mtlr_cv_results["cindex"] = 0.0



# 5-fold cross-validation over the training rows for every configuration in
# mtlr_cv_results; the mean fold score is stored in the "cindex" column.
kf = KFold(n_splits = 5)

# Outcome columns as plain arrays. KFold yields positional indices, so index
# NumPy arrays rather than doing a label-based Series lookup, which is only
# correct while df_train carries a default RangeIndex.
cv_siwang = df_train["siwang"].to_numpy()
cv_os_month = df_train["OS_month"].to_numpy()

# The score column receives float means; make sure it is float up front.
mtlr_cv_results["cindex"] = mtlr_cv_results["cindex"].astype(float)


for index in range(mtlr_cv_results.shape[0]):
    num_nodes = mtlr_cv_results.iloc[index, 0]
    # When the table was reloaded from its CSV checkpoint the layer sizes are
    # text such as "[32, 16]"; parse them without executing arbitrary code.
    if isinstance(num_nodes, str):
        num_nodes = ast.literal_eval(num_nodes)
    batch_norm = mtlr_cv_results.iloc[index, 1]
    dropout = mtlr_cv_results.iloc[index, 2]
    batch_size = mtlr_cv_results.iloc[index, 3]
    lr = mtlr_cv_results.iloc[index, 4]

    cindexes = []
    for train_index, test_index in kf.split(df_train):
        # Slice the mapped features and both label arrays for this fold.
        X_tr = x_train[train_index, ]
        X_val = x_train[test_index, ]
        Y_tr = (y_train[0][train_index, ], y_train[1][train_index, ])
        Y_val = (y_train[0][test_index, ], y_train[1][test_index, ])

        # A fresh network and optimizer for every fold.
        net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm, dropout)
        model = MTLR(net, tt.optim.Adam, duration_index=labtrans.cuts)
        model.optimizer.set_lr(lr)

        epochs = 100
        callbacks = [tt.callbacks.EarlyStopping(patience = 3)]
        log = model.fit(X_tr, Y_tr, int(batch_size), epochs, callbacks,
                        val_data = (X_val, Y_val))

        surv = model.predict_surv_df(X_val)
        # NOTE(review): EvalSurv expects (surv, durations, events). "siwang" is
        # passed as durations and "OS_month" as events -- confirm that this
        # matches the meaning of the two columns.
        ev = EvalSurv(surv, cv_siwang[test_index],
                      cv_os_month[test_index], censor_surv='km')
        c_index = ev.concordance_td('antolini')
        cindexes.append(c_index)

    mtlr_cv_results.iloc[index, 5] = np.mean(cindexes)
    # Checkpoint after every configuration so an interrupted search keeps the
    # scores computed so far.
    mtlr_cv_results.to_csv('/kaggle/working/cv.results.mtlr.csv', index = False)
    print(mtlr_cv_results.iloc[index, ])


# Reload the grid-search table from its checkpoint and pick the configuration
# with the highest cross-validated score.
mtlr_cv_results = pd.read_csv("/kaggle/working/cv.results.mtlr.csv")
print(mtlr_cv_results["cindex"].values.max())
ind_best = mtlr_cv_results["cindex"].values.argmax()
num_nodes = mtlr_cv_results.iloc[ind_best, 0]
batch_norm = mtlr_cv_results.iloc[ind_best, 1]
dropout = mtlr_cv_results.iloc[ind_best, 2]
batch_size = mtlr_cv_results.iloc[ind_best, 3]
lr = mtlr_cv_results.iloc[ind_best, 4]

# After the CSV round-trip the layer sizes are a string; literal_eval parses
# the list literal without executing arbitrary code the way eval() would.
# `num_nodes` itself is left as read from the file.
best_num_nodes = ast.literal_eval(num_nodes)


net = tt.practical.MLPVanilla(in_features, best_num_nodes, out_features, batch_norm, dropout)
model = MTLR(net, tt.optim.Adam, duration_index=labtrans.cuts)
model.optimizer.set_lr(lr)

epochs = 100
callbacks = [tt.callbacks.EarlyStopping(patience = 3)]
# NOTE(review): early stopping is monitored on the test set, so the test
# metrics reported below are not from fully held-out data. Consider carving a
# validation split out of the training rows instead.
log = model.fit(x_train, y_train, int(batch_size), epochs, callbacks,
                        val_data = (x_test, y_test))

surv = model.predict_surv_df(x_test)
# NOTE(review): EvalSurv expects (surv, durations, events). "siwang" is passed
# as durations and "OS_month" as events -- confirm that this matches the
# meaning of the two columns.
ev = EvalSurv(surv, df_test["siwang"].values, df_test["OS_month"].values, censor_surv='km')
c_index = ev.concordance_td('antolini')
print('C-index: {:.4f}'.format(c_index))


# 100 evenly spaced points spanning the range of the column that was passed to
# EvalSurv as durations.
time_grid = np.linspace(df_test["siwang"].values.min(), df_test["siwang"].values.max(), 100)
ibs = ev.integrated_brier_score(time_grid) 
print('IBS: {:.4f}'.format(ibs))




def bootstrap_replicate_1d(data):
    """Return one bootstrap resample of a 1-D array.

    Draws ``len(data)`` elements from ``data`` with replacement using NumPy's
    global random state, so the result depends on ``np.random.seed``.

    Parameters
    ----------
    data : 1-D array-like
        Values to resample.

    Returns
    -------
    numpy.ndarray
        Array of the same length as ``data`` containing the drawn elements.
    """
    sample_size = len(data)
    return np.random.choice(data, size=sample_size, replace=True)


# Bootstrap the training rows: refit the selected configuration on each
# resample, score it on the test set, and summarise the spread of the scores.
bootstrap_R = 100
c_indexes = []
ibss = []

# Layer sizes arrive as text when read back from the CSV checkpoint; parse
# them once, without eval(), and accept an already-parsed list as well.
boot_num_nodes = ast.literal_eval(num_nodes) if isinstance(num_nodes, str) else num_nodes

# The evaluation grid does not change between replicates, so build it once.
time_grid = np.linspace(df_test["siwang"].values.min(), df_test["siwang"].values.max(), 100)


for i in range(bootstrap_R):
    print(i)
    train_bs_idx = bootstrap_replicate_1d(np.array(range(df_train.shape[0])))
    # Resampled features and both label arrays for this replicate.
    X_train = x_train[train_bs_idx, ]
    T_train = y_train[0][train_bs_idx]
    E_train = y_train[1][train_bs_idx]
    Y_train = (T_train, E_train)
    net = tt.practical.MLPVanilla(in_features, boot_num_nodes, out_features, batch_norm, dropout)
    model = MTLR(net, tt.optim.Adam, duration_index=labtrans.cuts)
    model.optimizer.set_lr(lr)
    epochs = 100
    callbacks = [tt.callbacks.EarlyStopping(patience = 3)]
    # Fit on the bootstrap sample. Fitting on the full x_train / y_train would
    # leave the resampling unused, so replicates would differ only through
    # training randomness.
    # NOTE(review): early stopping is monitored on the test set here as well.
    log = model.fit(X_train, Y_train, int(batch_size), epochs, callbacks,
                        val_data = (x_test, y_test))
    surv = model.predict_surv_df(x_test)
    # NOTE(review): EvalSurv expects (surv, durations, events); confirm the
    # order of "siwang" and "OS_month" against the meaning of the columns.
    ev = EvalSurv(surv, df_test["siwang"].values, df_test["OS_month"].values, censor_surv='km')
    c_index = ev.concordance_td('antolini')
    ibs = ev.integrated_brier_score(time_grid) 
    c_indexes.append(np.round(c_index, 4))
    ibss.append(np.round(ibs, 4))

pd.DataFrame(data = {"cindex": c_indexes, "ibs": ibss}).to_csv("/kaggle/working/results.ci.mtlr.csv", index=False)


# Mean of each metric over the bootstrap replicates
mean_cindex = np.mean(c_indexes)
mean_ibs = np.mean(ibss)


# Print the mean
print('mean cindex =', mean_cindex)
print('mean ibs =', mean_ibs)



# Percentile 95% confidence intervals (2.5th and 97.5th percentiles)
ci_cindex = np.percentile(c_indexes, [2.5, 97.5])
ci_ibs = np.percentile(ibss, [2.5, 97.5])
 
# Print the confidence interval (C-index first, then IBS)
print('confidence interval =', ci_cindex)
print('confidence interval =', ci_ibs)

0:	[0s / 0s],		train_loss: 26.2969,	val_loss: 2.2453
1:	[0s / 0s],		train_loss: 7.0657,	val_loss: 3.2279
2:	[0s / 0s],		train_loss: 1.7830,	val_loss: 0.6327
3:	[0s / 0s],		train_loss: 0.9481,	val_loss: 0.8428
4:	[0s / 0s],		train_loss: 0.5401,	val_loss: 0.6394
5:	[0s / 0s],		train_loss: 0.4981,	val_loss: 0.4748
6:	[0s / 0s],		train_loss: 0.4678,	val_loss: 0.5178
7:	[0s / 0s],		train_loss: 0.4725,	val_loss: 0.4557
8:	[0s / 0s],		train_loss: 0.4438,	val_loss: 0.4759
9:	[0s / 0s],		train_loss: 0.4330,	val_loss: 0.6001
10:	[0s / 0s],		train_loss: 0.4476,	val_loss: 0.4976
0:	[0s / 0s],		train_loss: 26.8026,	val_loss: 2.5129
1:	[0s / 0s],		train_loss: 1.3113,	val_loss: 0.5970
2:	[0s / 0s],		train_loss: 0.5022,	val_loss: 0.4900
3:	[0s / 0s],		train_loss: 0.4526,	val_loss: 0.4619
4:	[0s / 0s],		train_loss: 0.4445,	val_loss: 0.4617
5:	[0s / 0s],		train_loss: 0.4644,	val_loss: 0.4831
6:	[0s / 0s],		train_loss: 0.6156,	val_loss: 0.5225
7:	[0s / 0s],		train_loss: 0.4561,	val_loss: 0.4348
8:	[0s / 

KeyboardInterrupt: 