In [1]:
# from __future__ import print_function
# import argparse
import torch
import torch.utils.data
import torch.nn as nn 
import torch.optim as optim
from torch.autograd import Variable  # change later
from torchvision import datasets, transforms
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
from torch.nn.parameter import Parameter # ?

from functools import reduce


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

##### Dataset

In [2]:
url = 'https://raw.githubusercontent.com/chl8856/DeepHit/master/sample%20data/SYNTHETIC/synthetic_comprisk.csv'
dataset = pd.read_csv(url)
dataset.drop(['true_time', 'true_label'], axis=1, inplace=True)
dataset.head()



Unnamed: 0,time,label,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12
0,0,0,-0.4405,-0.035066,-0.025341,-0.029775,-0.55787,-0.15355,0.56819,-0.15432,-0.25023,0.33915,0.70388,0.28174
1,1,0,0.015579,-0.84608,0.48753,0.65193,0.20099,-0.11238,-1.3963,-0.18874,-0.30001,-0.24032,-0.38533,-1.0245
2,34,2,0.44649,1.641,-1.745,0.31795,-1.1406,0.3656,0.2811,-0.58253,-1.6907,1.2022,-0.5192,1.784
3,9,0,0.62946,-0.61575,-0.32345,-0.9002,0.4536,-0.61992,2.1624,0.19875,-1.1196,-2.7321,-0.25673,-0.81836
4,2,0,1.2498,-0.18561,-0.18378,-0.98108,-0.01499,-0.14437,-1.2529,-0.58432,-0.090523,0.93692,1.0749,0.79117


In [3]:
# Shuffle data
# df = df.sample(frac=1).reset_index(drop=True)
# df

# df.futime.describe()

In [13]:
print(dataset.shape)
print(dataset.dtypes)
print(dataset.isnull().sum())

(30000, 14)
time           int64
label          int64
feature1     float64
feature2     float64
feature3     float64
feature4     float64
feature5     float64
feature6     float64
feature7     float64
feature8     float64
feature9     float64
feature10    float64
feature11    float64
feature12    float64
dtype: object
time         0
label        0
feature1     0
feature2     0
feature3     0
feature4     0
feature5     0
feature6     0
feature7     0
feature8     0
feature9     0
feature10    0
feature11    0
feature12    0
dtype: int64


In [9]:
# Missing values processing
# print(df['creatinine'].mean())
# df['creatinine'] = df['creatinine'].fillna(df['creatinine'].mean())
# df = df.drop(columns=['chapter'])
# print(df.isnull().sum())

# pd.get_dummies(df.sex)
# df.sex.replace(['F', 'M'], [0, 1], inplace=True)
# df.sex.value_counts()

In [14]:
# Change values of outcome
print(dataset['label'])
dataset['label'].replace({2:1}, inplace=True)
print(dataset['label'])

0        0
1        0
2        1
3        0
4        0
        ..
29995    1
29996    1
29997    0
29998    0
29999    0
Name: label, Length: 30000, dtype: int64
0        0
1        0
2        1
3        0
4        0
        ..
29995    1
29996    1
29997    0
29998    0
29999    0
Name: label, Length: 30000, dtype: int64


##### Train - test - validation split


In [11]:
get_x = lambda df: (df
                    .drop(columns=['time', 'label'])
                    .values.astype('float32'))

df_test = dataset.sample(frac=0.2)
df_train = dataset.drop(df_test.index)
# df_val = df_train.sample(frac=0.2)
# df_train = df_train.drop(df_val.index)

X_train = get_x(df_train)
X_test = get_x(df_test)

Y_train = df_train[['label', 'time']].to_numpy()
Y_test = df_test[['label', 'time']].to_numpy()


In [15]:
D_in, H, D_out = X_train.shape[1], 128, 32    # D_out 32 ?
batch_size = 32
num_time_units = 24 # 24 month?
time_bin = 30   # 30?
n_epochs = 1
learning_rate = 1e-3


In [16]:
D_in

12

In [17]:
class survdl(nn.Module):
    def __init__(self, D_in, H, D_out, num_time_units):
        super(survdl, self).__init__()
        self.sigmoid = nn.Sigmoid()
        self.fc_layer = nn.Sequential(nn.Linear(D_in, H), nn.ReLU(), nn.Dropout(0.5), nn.Linear(H, D_out))
        self.fc_layer2 = nn.Linear(1, num_time_units)
        self.beta = Parameter(torch.Tensor(D_out, 1))
        self.beta.data.uniform_(-0.001, 0.001)  # initialization?
        
    def score_1(self, x):
        return torch.exp(x.mm(self.beta))  # hazard function - s1
    
    def score_2(self, score1):
        return self.sigmoid(self.fc_layer2(score1))
    
    def forward(self, x):
        new_x = self.fc_layer(x)
        score1 = self.score_1(new_x)
        score2 = self.score_2(score1)
        return score1, score2
    

##### Function for C-index

In [24]:
def unique_set(lifetime):
    a = lifetime.data.cpu().numpy()   # lifetime.data.cpu().numpy()
    t, idx = np.unique(a, return_inverse=True)
    sort_idx = np.argsort(a)
    a_sorted = a[sort_idx]
    unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1]))
    unq_count = np.diff(np.nonzero(unq_first)[0])
    unq_idx = np.split(sort_idx, np.cumsum(unq_count))
    return t, unq_idx
    
def log_parlik(lifetime, censor, score1):  
    t, H = unique_set(lifetime)
    keep_index = np.nonzero(censor.data.cpu().numpy())[0]  #censor = 1  #.data.cpu()
    H = [list(set(h)&set(keep_index)) for h in H]
    n = [len(h) for h in H]
    
    score1 = score1.detach().data.cpu().numpy()   # .data.cpu()   #?
    total = 0
    for j in range(len(t)):
        total_1 = np.sum(np.log(score1)[H[j]])
        m = n[j]
        total_2 = 0
        for i in range(m):
            subtotal = np.sum(score1[sum(H[j:],[])]) - (i*1.0/m)*(np.sum(score1[H[j]]))
            subtotal = np.log(subtotal)
            total_2 = total_2 + subtotal
        total = total + total_1 - total_2
        total = np.array([total])
    return torch.Tensor(total).type(torch.FloatTensor).to(device).view(-1,1)
        

def acc_pairs(censor, lifetime):
    noncensor_index = np.nonzero(censor.data.cpu().numpy())[0]
    lifetime = lifetime.data.cpu().numpy()
    acc_pair = []
    for i in noncensor_index:
        all_j =  np.array(range(len(lifetime)))[lifetime > lifetime[i]]
        acc_pair.append([(i,j) for j in all_j])
    
    acc_pair = reduce(lambda x,y: x + y, acc_pair)
    return acc_pair


def rank_loss(lifetime, censor, score2, t, time_bin): 
    # score2 (n(samples)*24) at time unit t = 1,2,...,24
    acc_pair = acc_pairs(censor, lifetime)
    lifetime = lifetime.data.cpu().numpy()
    total = 0
    for i,j in acc_pair:
        yi = (lifetime[i] >= (t-1) * time_bin) * 1
        yj = (lifetime[j] >= (t-1) * time_bin) * 1
        a = torch.ones(1).type(torch.FloatTensor).to(device)
        L2dist = torch.dist(score2[j, t-1] - score2[i, t-1], a, 2)
        total = total + L2dist* yi * (1-yj)
    return total


def C_index(censor, lifetime, score1):
    score1 = score1.detach().data.cpu().numpy()  #.data.cpu()  #?
    acc_pair = acc_pairs(censor, lifetime)
    prob = sum([score1[i] >= score1[j] for (i, j) in acc_pair])[0]*1.0/len(acc_pair)
    return prob

In [25]:
model = survdl(D_in, H, D_out, num_time_units).to(device)

optimizer = optim.Adam(model.parameters(), lr = learning_rate)


##### Training and evaluating

In [26]:
print(X_test, X_train)
print(Y_test, Y_train)

[[-1.2869e+00 -1.2126e+00 -7.8916e-01 ...  7.5659e-01  6.2747e-01
   4.1510e-01]
 [ 2.6148e-01 -1.7578e+00  1.2720e-03 ... -1.1731e+00  6.3678e-01
  -1.1846e-01]
 [ 4.1238e-01 -7.1887e-01 -4.9787e-01 ...  1.5872e+00  2.3396e-01
  -1.9114e-01]
 ...
 [ 2.2060e+00 -3.7553e-02  7.7608e-01 ... -2.0386e+00  8.5972e-01
  -2.4876e-01]
 [-1.3101e+00  1.0438e+00 -5.0588e-01 ... -1.0752e+00 -5.9242e-01
  -9.6196e-01]
 [-1.3666e-01 -7.5170e-01 -1.5467e+00 ... -3.0804e-01 -1.8159e-01
  -1.1973e+00]] [[-0.4405   -0.035066 -0.025341 ...  0.33915   0.70388   0.28174 ]
 [ 0.015579 -0.84608   0.48753  ... -0.24032  -0.38533  -1.0245  ]
 [ 0.44649   1.641    -1.745    ...  1.2022   -0.5192    1.784   ]
 ...
 [-0.69875  -0.79495   0.47968  ... -0.88256  -0.13188   0.71109 ]
 [ 0.16694  -0.47959  -1.2024   ...  0.32197   0.35811  -0.22775 ]
 [ 0.23814   0.99571   0.61698  ...  1.3149    0.74628   0.071198]]
[[ 0 26]
 [ 0  9]
 [ 1 17]
 ...
 [ 1 98]
 [ 0 34]
 [ 0  1]] [[ 0  0]
 [ 0  1]
 [ 1 34]
 ...
 [ 0  2]

In [27]:
def train(epoch):
    model.train()
    train_loss = 0    
    # idx = np.random.permutation(X_train.shape[0])     
    j = 0
    while j < X_train.shape[0]:
        if j < X_train.shape[0] - batch_size:
            data = Variable(torch.from_numpy(X_train[j:(j + batch_size)])).type(torch.FloatTensor).to(device)
            lifetime = Variable(torch.from_numpy(Y_train[j:(j + batch_size),1])).type(torch.FloatTensor).to(device)
            censor = Variable(torch.from_numpy(Y_train[j:(j + batch_size),0])).type(torch.FloatTensor).to(device)
        else:
            data = Variable(torch.from_numpy(X_train[j:])).type(torch.FloatTensor).to(device)
            lifetime = Variable(torch.from_numpy(Y_train[j:,1])).type(torch.FloatTensor).to(device)
            censor = Variable(torch.from_numpy(Y_train[j:,0])).type(torch.FloatTensor).to(device)
            
        optimizer.zero_grad()
        score1, score2 = model(data)
        loss1 = log_parlik(lifetime, censor, score1)
        loss2 = []
        for t in range(num_time_units):
            loss2.append(rank_loss(lifetime, censor, score2, t+1, time_bin))
        loss2 = sum(loss2)
        loss = 1.0 * loss1 + 0.5 * loss2
        loss.backward()      
        train_loss = loss.data[0]
        optimizer.step()
        j += batch_size
    return train_loss*1.0 / X_train.shape[0]

def test(epoch):
    model.eval()
    test_loss = 0
    j = 0
    while j < X_test.shape[0]:
        if j < X_test.shape[0] - batch_size:
            data = Variable(torch.from_numpy(X_test[j:(j + batch_size)])).type(torch.FloatTensor).to(device)
            lifetime = Variable(torch.from_numpy(Y_test[j:(j + batch_size),1])).type(torch.FloatTensor).to(device)
            censor = Variable(torch.from_numpy(Y_test[j:(j + batch_size),0])).type(torch.FloatTensor).to(device)
        else:
            data = Variable(torch.from_numpy(X_test[j:])).type(torch.FloatTensor).to(device)
            lifetime = Variable(torch.from_numpy(Y_test[j:,1])).type(torch.FloatTensor).to(device)
            censor = Variable(torch.from_numpy(Y_test[j:,0])).type(torch.FloatTensor).to(device)
        y_pred = model(data)
        score1, score2 = model(data)
        loss1 = log_parlik(lifetime, censor, score1)
        loss2 = []
        for t in range(num_time_units):
            loss2.append(rank_loss(lifetime, censor, score2, t+1, time_bin))
        loss2 = sum(loss2)
        loss = 1.0 * loss1 + 0.5 * loss2
        test_loss += loss.data[0]
        j += batch_size
    return test_loss*1.0 / X_test.shape[0]
    
for epoch in range(1, n_epochs + 1):
    train_loss = train(epoch)
    test_loss = test(epoch)
    print('====> Epoch: %d training loss: %.4f'%(epoch, train_loss))
    print('====> Epoch: %d testing loss: %.4f'%(epoch, test_loss))
    

====> Epoch: 1 training loss: -0.0013
====> Epoch: 1 testing loss: -0.9856


In [None]:
# concordance - training
data_train = Variable(torch.from_numpy(X_train)).type(torch.FloatTensor).to(device)
lifetime_train = Variable(torch.from_numpy(Y_train[:,0])).type(torch.FloatTensor).to(device)
censor_train = Variable(torch.from_numpy(Y_train[:,1])).type(torch.FloatTensor).to(device)

score1_train, score2_train = model(data_train)
C_index_train = C_index(censor_train, lifetime_train, score1_train)
print('Concordance index for training data: {:.4f}'.format(C_index_train))


# concordance - test
data_test = Variable(torch.from_numpy(X_test)).type(torch.FloatTensor).to(device)
lifetime_test = Variable(torch.from_numpy(Y_test[:,0])).type(torch.FloatTensor).to(device)
censor_test = Variable(torch.from_numpy(Y_test[:,1])).type(torch.FloatTensor).to(device)

score1_test, score2_test = model(data_test)
C_index_test = C_index(censor_test, lifetime_test, score1_test)
print('Concordance index for test data: {:.4f}'.format(C_index_test))