In [1]:
from EHRDataloader import EHRdataFromPickles, EHRdataloader

## 1. Load Dataset

This part of the code loads the dataset, we use the EHRDataLoader.py
The initial code could be found: https://github.com/ZhiGroup/pytorch_ehr


In [2]:
print('1 file found. Data will be split into train, validation and test.')
data = EHRdataFromPickles(root_dir = '../data/', 
                      file = 'toy.train', 
                      sort= False,
                      test_ratio = 0.2, 
                      valid_ratio = 0.1,
                      model='RNN') #No sort before splitting

# Dataloader splits
train, test, valid = data.__splitdata__()

1 file found. Data will be split into train, validation and test.


In [3]:
## Get the patients labels, where 1: heart failure and 0: no heart failure
labels=[]
for ii in range(len(train)):
    label=train[ii][1]
    labels.append(label)

In [4]:
## Distribution of the labels
from collections import Counter
Counter(labels)

Counter({1: 3511, 0: 3489})

In [None]:
## print sizes of train test and valid datasets

In [5]:
len(train),len(test),len(valid)

(7000, 2000, 1000)

## 2. Sample of dataset

In [6]:
## example dataset for patient 0 and visit 0
patient=0
visit=0
print("Patient ID:", train[patient][0])
print("Heart Failure:", train[patient][1])
print("# of visits:", len(train[patient][2]))

print(f' list of visit_time (since last time): {train[patient][2][visit][0]}')
print(f' list of codes corresponding to visit: {train[patient][2][visit][1]}')



Patient ID: 2569
Heart Failure: 1
# of visits: 154
 list of visit_time (since last time): [12753]
 list of codes corresponding to visit: [2836]


## 3. Preprocess Data for Training

This part of the code transforms the data which has the format described above

In [7]:
from tqdm import tqdm

In [8]:
batch_size=32
pack_pad = True

In [17]:
## Understand EHRdataloader
train_mbs = list(tqdm(EHRdataloader(train, batch_size = batch_size, packPadMode = pack_pad)))
print (' creating the list of valid minibatches')
valid_mbs = list(tqdm(EHRdataloader(valid, batch_size = batch_size, packPadMode = pack_pad)))
print (' creating the list of test minibatches')
test_mbs = list(tqdm(EHRdataloader(test, batch_size = batch_size, packPadMode = pack_pad)))

100%|██████████| 219/219 [00:03<00:00, 66.35it/s] 


 creating the list of valid minibatches


100%|██████████| 32/32 [00:00<00:00, 66.10it/s]


 creating the list of test minibatches


100%|██████████| 63/63 [00:00<00:00, 66.13it/s]


## 4. Train Recurrent Neural Network

Training the model, note that we are not using any pre-trained model.

In [18]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [19]:
import numpy as np 
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F 
use_cuda = torch.cuda.is_available()

#construct a whole embedding class from pytorch nn.module
#then we call this class in models after we define it 
class EHREmbeddings(nn.Module):
    #initialization and then the forward and things
    #DRNN has no bi, QRNN no bi, TLSTM has no bi, but DRNN has other cell-types 
    #cel_type are different for each model variation 
    def __init__(self, input_size, embed_dim ,hidden_size, n_layers=1,dropout_r=0.1,cell_type='LSTM', bii=False, time=False , preTrainEmb='', packPadMode = True):
        super(EHREmbeddings, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout_r = dropout_r
        self.cell_type = cell_type
        self.time=time
        self.preTrainEmb=preTrainEmb
        self.packPadMode = packPadMode
        if bii: 
            self.bi=2 
        else: 
            self.bi=1
            
        if len(input_size)==1:
            self.multi_emb=False
            if len(self.preTrainEmb)>0:
                emb_t= torch.FloatTensor(np.asmatrix(self.preTrainEmb))
                self.embed= nn.Embedding.from_pretrained(emb_t)#,freeze=False) 
                self.in_size= embed_dim ### need to be updated to be automatically capyured from the input
            else:
                input_size=input_size[0]
                self.embed= nn.Embedding(input_size, self.embed_dim,padding_idx=0)#,scale_grad_by_freq=True)
                self.in_size= embed_dim
        else:
            if len(input_size)!=3: 
                raise ValueError('the input list is 1 length')
            else: 
                self.multi_emb=True
                self.diag=self.med=self.oth=1

        if self.time: self.in_size= self.in_size+1 
               
        
        self.cell = nn.LSTM
        self.rnn_c = self.cell(self.in_size, self.hidden_size, num_layers=self.n_layers, dropout= self.dropout_r, bidirectional=bii, batch_first=True)
        self.out = nn.Linear(self.hidden_size*self.bi,1)
        self.sigmoid = nn.Sigmoid()
      
                            
    #let's define this class method
    def EmbedPatients_MB(self,mb_t, mtd): #let's define this
        self.bsize=len(mb_t) ## no of pts in minibatch
        
        embedded = self.embed(mb_t)  ## Embedding for codes
        embedded = torch.sum(embedded, dim=2) 
        if self.time:
            mtd_t= Variable(torch.stack(mtd,0))
            if use_cuda: 
                mtd_t.cuda()
            out_emb= torch.cat((embedded,mtd_t),dim=2)
        else:
            out_emb= embedded
        if use_cuda:
            out_emb.cuda()        
        return out_emb
    


In [20]:
# Model 1:RNN & Variations: GRU, LSTM, Bi-RNN, Bi-GRU, Bi-LSTM
class EHR_RNN(EHREmbeddings):
    def __init__(self,input_size,embed_dim, hidden_size, n_layers=1,dropout_r=0.1,cell_type='GRU',bii=False ,time=False, preTrainEmb='',packPadMode = True):
       	EHREmbeddings.__init__(self,input_size, embed_dim ,hidden_size, n_layers=n_layers, dropout_r=dropout_r, cell_type=cell_type, bii=bii, time=time , preTrainEmb=preTrainEmb, packPadMode=packPadMode)
    #embedding function goes here 
    
    def EmbedPatient_MB(self, input, mtd):
        return EHREmbeddings.EmbedPatients_MB(self, input, mtd)
    
    def init_hidden(self):
        h_0 = Variable(torch.rand(self.n_layers*self.bi,self.bsize, self.hidden_size))
        if use_cuda: 
            h_0.cuda()
        if self.cell_type == "LSTM":
            result = (h_0,h_0)
        else: 
            result = h_0
        return result
    
    def forward(self, input, x_lens, mtd):
        x_in  = self.EmbedPatient_MB(input, mtd) 
        ### uncomment the below lines if you like to initiate hidden to random instead of Zero which is the default
        #h_0= self.init_hidden()
        #if use_cuda: h_0.cuda()
        if self.packPadMode: 
            x_inp = nn.utils.rnn.pack_padded_sequence(x_in,x_lens,batch_first=True)   
            output, hidden = self.rnn_c(x_inp)#,h_0) 
        else:
            output, hidden = self.rnn_c(x_in)#,h_0) 
        
        if self.cell_type == "LSTM":
            hidden=hidden[0]
        if self.bi==2:
            output = self.sigmoid(self.out(torch.cat((hidden[-2],hidden[-1]),1)))
        else:
            output = self.sigmoid(self.out(hidden[-1]))
        return output.squeeze()

In [21]:
input_size=[30000]
embed_dim=128
hidden_size=128
n_layers=4
dropout_r=0.2


In [22]:
model = EHR_RNN(input_size= input_size, 
                          embed_dim=embed_dim, 
                          hidden_size= hidden_size,
                          dropout_r=dropout_r,cell_type='LSTM') 

optimizer = optim.Adam(model.parameters(), 
                           lr=0.001)
criterion = nn.BCELoss()

In [23]:
import random
from sklearn.metrics import roc_auc_score  
from sklearn.metrics import roc_curve 

def calculate_auc(model, mbs_list,shuffle = True): 
    model.eval() 
    y_real =[]
    y_hat= []
    if shuffle: 
        random.shuffle(mbs_list)
    for i,batch in enumerate(mbs_list):
        sample, label_tensor, seq_l, mtd = batch
        output = model(sample, seq_l, mtd)
        y_hat.extend(output.cpu().data.view(-1).numpy())  
        y_real.extend(label_tensor.cpu().data.view(-1).numpy())
         
    auc = roc_auc_score(y_real, y_hat)
    return auc, y_real, y_hat 


## 5. Results Recurrent Neural Network

Here you will find the table of the results for the RNN GRU

In [24]:
epochs=10
for ep in range(epochs):
    current_loss = 0
    train_loss =[]
    plot_every = 5
    n_iter = 0 
    for i,batch in enumerate(train_mbs):
        sample, label_tensor, seq_l, mtd = batch
        
        model.train() ## LR added Jul 10, that is the right implementation
        model.zero_grad()
        output = model(sample,seq_l, mtd)   
        loss = criterion(output, label_tensor.ravel())    
        loss.backward()   
        optimizer.step()
        
        current_loss += loss.item()
        n_iter +=1
    
        if n_iter % plot_every == 0:
            train_loss.append(current_loss/plot_every)
            current_loss = 0  
            

    avg_loss = np.mean(train_loss)
    shuffle= False
    train_auc, _, _ = calculate_auc(model = model, mbs_list = train_mbs, shuffle = shuffle)
    valid_auc, _, _ = calculate_auc(model = model, mbs_list = valid_mbs, shuffle = shuffle)
    test_auc, _, _ = calculate_auc(model = model, mbs_list = test_mbs, shuffle = shuffle)
#     valid_time = timeSince(valid_start)
    print('\n Epoch %s: Train_auc %s, Valid_auc %s, Test_auc %s ,Training Average_loss %s'%(ep, train_auc.round(4), valid_auc.round(4),test_auc.round(4), avg_loss.round(4)))

    


 Epoch 0: Train_auc 0.9654, Valid_auc 0.4756, Test_auc 0.4952 ,Training Average_loss 0.6965

 Epoch 1: Train_auc 0.9989, Valid_auc 0.4749, Test_auc 0.4917 ,Training Average_loss 0.5323

 Epoch 2: Train_auc 1.0, Valid_auc 0.4876, Test_auc 0.4962 ,Training Average_loss 0.1989

 Epoch 3: Train_auc 1.0, Valid_auc 0.4854, Test_auc 0.4951 ,Training Average_loss 0.0379

 Epoch 4: Train_auc 1.0, Valid_auc 0.482, Test_auc 0.4958 ,Training Average_loss 0.0151

 Epoch 5: Train_auc 1.0, Valid_auc 0.4801, Test_auc 0.4968 ,Training Average_loss 0.0085

 Epoch 6: Train_auc 1.0, Valid_auc 0.4793, Test_auc 0.4983 ,Training Average_loss 0.0054

 Epoch 7: Train_auc 1.0, Valid_auc 0.4789, Test_auc 0.4991 ,Training Average_loss 0.0037

 Epoch 8: Train_auc 1.0, Valid_auc 0.4784, Test_auc 0.4995 ,Training Average_loss 0.0026

 Epoch 9: Train_auc 1.0, Valid_auc 0.4785, Test_auc 0.5 ,Training Average_loss 0.0019
