In [22]:
# from __future__ import print_function
# import argparse
import torch
import torch.utils.data
import torch.nn as nn 
import torch.optim as optim
from torch.autograd import Variable  # change later
from torchvision import datasets, transforms
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
from torch.nn.parameter import Parameter # ?

from functools import reduce

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

##### Dataset

In [2]:
df = pd.read_csv("flchain.csv")
df

Unnamed: 0.1,Unnamed: 0,age,sex,sample.yr,kappa,lambda,flc.grp,creatinine,mgus,futime,death,chapter
0,1,97,F,1997,5.700,4.860,10,1.7,0,85,1,Circulatory
1,2,92,F,2000,0.870,0.683,1,0.9,0,1281,1,Neoplasms
2,3,94,F,1997,4.360,3.850,10,1.4,0,69,1,Circulatory
3,4,92,F,1996,2.420,2.220,9,1.0,0,115,1,Circulatory
4,5,93,F,1996,1.320,1.690,6,1.1,0,1039,1,Circulatory
...,...,...,...,...,...,...,...,...,...,...,...,...
7869,7870,52,F,1995,1.210,1.610,6,1.0,0,4997,0,
7870,7871,52,F,1999,0.858,0.581,1,0.8,0,3652,0,
7871,7872,54,F,2002,1.700,1.720,8,,0,2507,0,
7872,7873,53,F,1995,1.710,2.690,9,,0,4982,0,


In [3]:
# Shuffle data
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0.1,Unnamed: 0,age,sex,sample.yr,kappa,lambda,flc.grp,creatinine,mgus,futime,death,chapter
0,3462,69,F,1996,1.32,0.972,3,1.0,0,4639,0,
1,5986,56,M,1995,1.43,1.110,4,,0,4978,0,
2,621,84,F,1996,2.39,2.290,9,1.0,0,4090,1,Circulatory
3,4233,61,F,1996,1.94,2.920,10,1.1,0,4569,0,
4,2211,72,M,1995,1.41,1.640,6,1.2,0,3630,1,Respiratory
...,...,...,...,...,...,...,...,...,...,...,...,...
7869,5776,57,M,1998,1.62,1.750,7,1.2,0,4096,0,
7870,7545,50,F,1996,1.46,2.780,9,,0,4841,0,
7871,2717,68,M,1995,0.93,1.560,4,1.5,0,4952,0,
7872,4827,59,F,1997,1.08,1.370,4,0.9,0,4381,0,


In [7]:
df.futime.describe()

count    7874.000000
mean     3661.042291
std      1432.677330
min         0.000000
25%      2852.000000
50%      4302.000000
75%      4773.000000
max      5215.000000
Name: futime, dtype: float64

In [8]:
print(df.shape)
print(df.dtypes)
print(df.isnull().sum())

(7874, 12)
Unnamed: 0      int64
age             int64
sex             int64
sample.yr       int64
kappa         float64
lambda        float64
flc.grp         int64
creatinine    float64
mgus            int64
futime          int64
death           int64
chapter        object
dtype: object
Unnamed: 0       0
age              0
sex              0
sample.yr        0
kappa            0
lambda           0
flc.grp          0
creatinine    1350
mgus             0
futime           0
death            0
chapter       5705
dtype: int64


In [9]:
# Missing values processing
print(df['creatinine'].mean())
df['creatinine'] = df['creatinine'].fillna(df['creatinine'].mean())
df = df.drop(columns=['chapter'])
print(df.isnull().sum())

1.0935162477007978
Unnamed: 0    0
age           0
sex           0
sample.yr     0
kappa         0
lambda        0
flc.grp       0
creatinine    0
mgus          0
futime        0
death         0
dtype: int64


In [10]:
# pd.get_dummies(df.sex)
df.sex.replace(['F', 'M'], [0, 1], inplace=True)
df.sex.value_counts()

0    4350
1    3524
Name: sex, dtype: int64

##### Train - test - validation split


In [11]:
get_x = lambda df: (df
                    .drop(columns=['Unnamed: 0', 'death', 'futime'])
                    .values.astype('float32'))

df_test = df.sample(frac=0.2)
df_train = df.drop(df_test.index)

X_train = get_x(df_train)
X_test = get_x(df_test)

Y_train = df_train[['death', 'futime']].to_numpy()
Y_test = df_test[['death', 'futime']].to_numpy()


In [12]:
D_in, H, D_out = X_train.shape[1], 128, 32    # D_out 32 ?
batch_size = 32
num_time_units = 24 # 24 month?
time_bin = 30   # 30?
n_epochs = 1
learning_rate = 1e-3


In [13]:
class survdl(nn.Module):
    def __init__(self, D_in, H, D_out, num_time_units):
        super(survdl, self).__init__()
        self.sigmoid = nn.Sigmoid()
        self.fc_layer = nn.Sequential(nn.Linear(D_in, H), nn.ReLU(), nn.Dropout(0.5), nn.Linear(H, D_out))
        self.fc_layer2 = nn.Linear(1, num_time_units)
        self.beta = Parameter(torch.Tensor(D_out, 1))
        self.beta.data.uniform_(-0.001, 0.001)  # initialization?
        
    def score_1(self, x):
        return torch.exp(x.mm(self.beta))  # hazard function - s1
    
    def score_2(self, score1):
        return self.sigmoid(self.fc_layer2(score1))
    
    def forward(self, x):
        new_x = self.fc_layer(x)
        score1 = self.score_1(new_x)
        score2 = self.score_2(score1)
        return score1, score2
    

##### Function for C-index

In [23]:
def unique_set(lifetime):
    a = lifetime.data.cpu().numpy()   # lifetime.data.cpu().numpy()
    t, idx = np.unique(a, return_inverse=True)
    sort_idx = np.argsort(a)
    a_sorted = a[sort_idx]
    unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1]))
    unq_count = np.diff(np.nonzero(unq_first)[0])
    unq_idx = np.split(sort_idx, np.cumsum(unq_count))
    return t, unq_idx
    
def log_parlik(lifetime, censor, score1):  
    t, H = unique_set(lifetime)
    keep_index = np.nonzero(censor.data.cpu().numpy())[0]  #censor = 1  #.data.cpu()
    H = [list(set(h)&set(keep_index)) for h in H]
    n = [len(h) for h in H]
    
    score1 = score1.detach().data.cpu().numpy()   # .data.cpu()   #?
    total = 0
    for j in range(len(t)):
        total_1 = np.sum(np.log(score1)[H[j]])
        m = n[j]
        total_2 = 0
        for i in range(m):
            subtotal = np.sum(score1[sum(H[j:],[])]) - (i*1.0/m)*(np.sum(score1[H[j]]))
            subtotal = np.log(subtotal)
            total_2 = total_2 + subtotal
        total = total + total_1 - total_2
        total = np.array([total])
    return torch.Tensor(total).type(torch.FloatTensor).to(device).view(-1,1)
        

def acc_pairs(censor, lifetime):
    noncensor_index = np.nonzero(censor.data.cpu().numpy())[0]
    lifetime = lifetime.data.cpu().numpy()
    acc_pair = []
    for i in noncensor_index:
        all_j =  np.array(range(len(lifetime)))[lifetime > lifetime[i]]
        acc_pair.append([(i,j) for j in all_j])
    
    acc_pair = reduce(lambda x,y: x + y, acc_pair)
    return acc_pair



def rank_loss(lifetime, censor, score2, t, time_bin): 
    # score2 (n(samples)*24) at time unit t = 1,2,...,24
    acc_pair = acc_pairs(censor, lifetime)
    lifetime = lifetime.data.cpu().numpy()
    total = 0
    for i,j in acc_pair:
        yi = (lifetime[i] >= (t-1) * time_bin) * 1
        yj = (lifetime[j] >= (t-1) * time_bin) * 1
        a = torch.ones(1).type(torch.FloatTensor).to(device)
        L2dist = torch.dist(score2[j, t-1] - score2[i, t-1], a, 2)
        total = total + L2dist* yi * (1-yj)
    return total


def C_index(censor, lifetime, score1):
    score1 = score1.detach().data.cpu().numpy()  #.data.cpu()  #?
    acc_pair = acc_pairs(censor, lifetime)
    prob = sum([score1[i] >= score1[j] for (i, j) in acc_pair])[0]*1.0/len(acc_pair)
    return prob

In [25]:
model = survdl(D_in, H, D_out, num_time_units).to(device)

optimizer = optim.Adam(model.parameters(), lr = learning_rate)


##### Training and evaluating

In [20]:
def train(epoch):
    model.train()
    train_loss = 0    
    # idx = np.random.permutation(X_train.shape[0])     
    j = 0
    while j < X_train.shape[0]:
        if j < X_train.shape[0] - batch_size:
            data = Variable(torch.from_numpy(X_train[j:(j + batch_size)])).type(torch.FloatTensor).to(device)
            lifetime = Variable(torch.from_numpy(Y_train[j:(j + batch_size),1])).type(torch.FloatTensor).to(device)
            censor = Variable(torch.from_numpy(Y_train[j:(j + batch_size),0])).type(torch.FloatTensor).to(device)
        else:
            data = Variable(torch.from_numpy(X_train[j:])).type(torch.FloatTensor).to(device)
            lifetime = Variable(torch.from_numpy(Y_train[j:,1])).type(torch.FloatTensor).to(device)
            censor = Variable(torch.from_numpy(Y_train[j:,0])).type(torch.FloatTensor).to(device)
            
        optimizer.zero_grad()
        score1, score2 = model(data)
        loss1 = log_parlik(lifetime, censor, score1)
        loss2 = []
        for t in range(num_time_units):
            loss2.append(rank_loss(lifetime, censor, score2, t+1, time_bin))
        loss2 = sum(loss2)
        loss = 1.0 * loss1 + 0.5 * loss2
        loss.backward()      
        train_loss = loss.data[0]
        optimizer.step()
        j += batch_size
    return train_loss*1.0 / X_train.shape[0]

def test(epoch):
    model.eval()
    test_loss = 0
    j = 0
    while j < X_test.shape[0]:
        if j < X_test.shape[0] - batch_size:
            data = Variable(torch.from_numpy(X_test[j:(j + batch_size)])).type(torch.FloatTensor).to(device)
            lifetime = Variable(torch.from_numpy(Y_test[j:(j + batch_size),1])).type(torch.FloatTensor).to(device)
            censor = Variable(torch.from_numpy(Y_test[j:(j + batch_size),0])).type(torch.FloatTensor).to(device)
        else:
            data = Variable(torch.from_numpy(X_test[j:])).type(torch.FloatTensor).to(device)
            lifetime = Variable(torch.from_numpy(Y_test[j:,1])).type(torch.FloatTensor).to(device)
            censor = Variable(torch.from_numpy(Y_test[j:,0])).type(torch.FloatTensor).to(device)
        y_pred = model(data)
        score1, score2 = model(data)
        loss1 = log_parlik(lifetime, censor, score1)
        loss2 = []
        for t in range(num_time_units):
            loss2.append(rank_loss(lifetime, censor, score2, t+1, time_bin))
        loss2 = sum(loss2)
        loss = 1.0 * loss1 + 0.5 * loss2
        test_loss += loss.data[0]
        j += batch_size
    return test_loss*1.0 / X_test.shape[0]
    
for epoch in range(1, n_epochs + 1):
    train_loss = train(epoch)
    test_loss = test(epoch)
    print('====> Epoch: %d training loss: %.4f'%(epoch, train_loss))
    print('====> Epoch: %d testing loss: %.4f'%(epoch, test_loss))
    

====> Epoch: 1 training loss: -0.0012
====> Epoch: 1 testing loss: -0.3664


In [21]:
# concordance - training
data_train = Variable(torch.from_numpy(X_train)).type(torch.FloatTensor).to(device)
lifetime_train = Variable(torch.from_numpy(Y_train[:,0])).type(torch.FloatTensor).to(device)
censor_train = Variable(torch.from_numpy(Y_train[:,1])).type(torch.FloatTensor).to(device)

score1_train, score2_train = model(data_train)
C_index_train = C_index(censor_train, lifetime_train, score1_train)
print('Concordance index for training data: {:.4f}'.format(C_index_train))


# concordance - test
data_test = Variable(torch.from_numpy(X_test)).type(torch.FloatTensor).to(device)
lifetime_test = Variable(torch.from_numpy(Y_test[:,0])).type(torch.FloatTensor).to(device)
censor_test = Variable(torch.from_numpy(Y_test[:,1])).type(torch.FloatTensor).to(device)

score1_test, score2_test = model(data_test)
C_index_test = C_index(censor_test, lifetime_test, score1_test)
print('Concordance index for test data: {:.4f}'.format(C_index_test))

Concordance index for training data: 0.7898
Concordance index for test data: 0.7545
