In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import ShuffleSplit


In [6]:
# Load the Veterans' Administration Lung Cancer Trial data set, described in
# "The Statistical Analysis of Failure Time Data" by Kalbfleisch J. and
# Prentice R (1980), then report its (rows, columns) shape.

df = pd.read_csv(filepath_or_buffer='./data/veterans_lung_cancer.csv')
print(df.shape)


(137, 13)


Unnamed: 0,Survival_label_lower_bound,Survival_label_upper_bound,Age_in_years,Karnofsky_score,Months_from_Diagnosis,Celltype=adeno,Celltype=large,Celltype=smallcell,Celltype=squamous,Prior_therapy=no,Prior_therapy=yes,Treatment=standard,Treatment=test
0,72.0,72.0,69.0,60.0,7.0,0,0,0,1,1,0,1,0
1,411.0,411.0,64.0,70.0,5.0,0,0,0,1,0,1,1,0
2,228.0,228.0,38.0,60.0,3.0,0,0,0,1,1,0,1,0
3,126.0,126.0,63.0,60.0,9.0,0,0,0,1,0,1,1,0
4,118.0,118.0,65.0,70.0,11.0,0,0,0,1,0,1,1,0


In [7]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,Survival_label_lower_bound,Survival_label_upper_bound,Age_in_years,Karnofsky_score,Months_from_Diagnosis,Celltype=adeno,Celltype=large,Celltype=smallcell,Celltype=squamous,Prior_therapy=no,Prior_therapy=yes,Treatment=standard,Treatment=test
0,72.0,72.0,69.0,60.0,7.0,0,0,0,1,1,0,1,0
1,411.0,411.0,64.0,70.0,5.0,0,0,0,1,0,1,1,0
2,228.0,228.0,38.0,60.0,3.0,0,0,0,1,1,0,1,0
3,126.0,126.0,63.0,60.0,9.0,0,0,0,1,0,1,1,0
4,118.0,118.0,65.0,70.0,11.0,0,0,0,1,0,1,1,0


Prepare data: separate the survival-label bounds from the feature columns

In [8]:
# Pull out the two survival-label bound columns; every remaining column of
# df becomes a feature in X.
y_lower = df.loc[:, 'Survival_label_lower_bound']
y_upper = df.loc[:, 'Survival_label_upper_bound']
X = df.drop(columns=['Survival_label_lower_bound', 'Survival_label_upper_bound'])

In [9]:
# Sanity check: dimensions of the feature matrix after dropping the two
# label-bound columns.
X.shape

(137, 11)

In [12]:
# Split into training and validation sets.
# Only the first of the two shuffled splits is consumed; random_state=0 makes
# it reproducible.
# NOTE(review): test_size=.7 holds out 70% of the rows and trains on the
# remaining 30% -- confirm this ratio is intended.
splits = ShuffleSplit(n_splits=2, test_size=.7, random_state=0)

train_idx, test_idx = next(splits.split(X))

# ShuffleSplit yields *positional* row indices. X.values[...] is already
# positional, so the labels are selected with .iloc to match; plain
# y_lower[train_idx] would treat the integers as index labels, which only
# coincides with position while the frame keeps its default 0..n-1 index.
dtrain = xgb.DMatrix(X.values[train_idx, :])
dtrain.set_float_info('label_lower_bound', y_lower.iloc[train_idx])
dtrain.set_float_info('label_upper_bound', y_upper.iloc[train_idx])
dtest = xgb.DMatrix(X.values[test_idx, :])
dtest.set_float_info('label_lower_bound', y_lower.iloc[test_idx])
dtest.set_float_info('label_upper_bound', y_upper.iloc[test_idx])

In [14]:
# Fit gradient boosted trees with the AFT survival objective, reporting the
# AFT negative log-likelihood on both the training and the test DMatrix.
params = {
    'verbosity': 0,
    'objective': 'survival:aft',
    'eval_metric': 'aft-nloglik',
    'tree_method': 'hist',
    'learning_rate': 0.05,
    'aft_loss_distribution': 'normal',
    'aft_loss_distribution_scale': 1.20,
    'max_depth': 6,
    'lambda': 0.01,
    'alpha': 0.02,
}
# 10000 is only an upper limit on boosting rounds; early stopping (50 rounds)
# is enabled, so training may finish well before that cap.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=10000,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=50,
)

[0]	train-aft-nloglik:12.85854	test-aft-nloglik:12.70710
[1]	train-aft-nloglik:12.08484	test-aft-nloglik:12.01670
[2]	train-aft-nloglik:11.38597	test-aft-nloglik:11.39459
[3]	train-aft-nloglik:10.75452	test-aft-nloglik:10.83194
[4]	train-aft-nloglik:10.18402	test-aft-nloglik:10.32407
[5]	train-aft-nloglik:9.66887	test-aft-nloglik:9.87096
[6]	train-aft-nloglik:9.20339	test-aft-nloglik:9.46174
[7]	train-aft-nloglik:8.78319	test-aft-nloglik:9.08848
[8]	train-aft-nloglik:8.40146	test-aft-nloglik:8.74855
[9]	train-aft-nloglik:8.05753	test-aft-nloglik:8.44254
[10]	train-aft-nloglik:7.74512	test-aft-nloglik:8.16665
[11]	train-aft-nloglik:7.46393	test-aft-nloglik:7.91752
[12]	train-aft-nloglik:7.20956	test-aft-nloglik:7.69174
[13]	train-aft-nloglik:6.97823	test-aft-nloglik:7.48943
[14]	train-aft-nloglik:6.76904	test-aft-nloglik:7.30370
[15]	train-aft-nloglik:6.58040	test-aft-nloglik:7.13687
[16]	train-aft-nloglik:6.40851	test-aft-nloglik:6.98694
[17]	train-aft-nloglik:6.25385	test-aft-nloglik:

In [15]:
# Generate predictions on the test set and place them next to the true
# label bounds for comparison.
# test_idx holds positional row indices (it was used as X.values[test_idx, :]),
# so the labels are selected with .iloc rather than by index label; the
# resulting rows keep their original index values.
# NOTE(review): this rebinds `df`, so the raw data frame loaded at the top of
# the notebook is no longer reachable under that name; re-run the load cell
# before re-running the data-prep cell.
df = pd.DataFrame({'Label (lower bound)': y_lower.iloc[test_idx],
                   'Label (upper bound)': y_upper.iloc[test_idx],
                   'Predicted label': bst.predict(dtest)})

In [17]:
# Preview the first five rows of the label-vs-prediction table.
df.head(n=5)

Unnamed: 0,Label (lower bound),Label (upper bound),Predicted label
26,151.0,151.0,56.026299
8,314.0,314.0,14.947133
86,44.0,44.0,730.688293
78,389.0,389.0,339.255219
43,392.0,392.0,146.474976


In [18]:
# Right-censored observations are the rows whose upper label bound is
# infinite; print just those rows.
print(
    df.loc[np.isinf(df['Label (upper bound)'])]
)

     Label (lower bound)  Label (upper bound)  Predicted label
71                  87.0                  inf        17.963226
13                  25.0                  inf       657.172485
63                 182.0                  inf       116.085258
90                 103.0                  inf       110.423904
109                 83.0                  inf       136.804108
