In [1]:
from EHRDataloader import EHRdataFromPickles, EHRdataloader

### Load Dataset

In [2]:
print('1 file found. Data will be split into train, validation and test.')
data = EHRdataFromPickles(root_dir = '../data/', 
                      file = 'toy.train', 
                      sort= False,
                      test_ratio = 0.2, 
                      valid_ratio = 0.1,
                      model='RNN') #No sort before splitting

# Dataloader splits
train, test, valid = data.__splitdata__() #this time, sort is true
# can comment out this part if you dont want to know what's going on here
# print(colored("\nSee an example data structure from training data:", 'green'))
# print(data.__getitem__(35, seeDescription = True))

1 file found. Data will be split into train, validation and test.


In [29]:
## Load the labels

In [3]:
labels=[]
for ii in range(len(train)):
    label=train[ii][1]
    labels.append(label)

In [4]:
from collections import Counter
Counter(labels)

Counter({1: 3511, 0: 3489})

In [30]:
## print sizes of train test and valid datasets

In [31]:
len(train),len(test),len(valid)

(7000, 2000, 1000)

## Sample of dataset

In [32]:
## example dataset for patient 0 and visit 0
patient=0
visit=0
print("Patient ID:", train[patient][0])
print("Heart Failure:", train[patient][1])
print("# of visits:", len(train[patient][2]))

print(f' list of visit_time (since last time): {train[patient][2][visit][0]}')
print(f' list of codes corresponding to visit: {train[patient][2][visit][1]}')



Patient ID: 2569
Heart Failure: 1
# of visits: 154
 list of visit_time (since last time): [12753]
 list of codes corresponding to visit: [2836]


## Prepare Data for Training

In [36]:
from tqdm import tqdm

In [39]:
batch_size=32
pack_pad = True

In [40]:
## Understand EHRdataloader
train_mbs = list(tqdm(EHRdataloader(train, batch_size = batch_size, packPadMode = pack_pad)))
print (' creating the list of valid minibatches')
valid_mbs = list(tqdm(EHRdataloader(valid, batch_size = batch_size, packPadMode = pack_pad)))
print (' creating the list of test minibatches')
test_mbs = list(tqdm(EHRdataloader(test, batch_size = batch_size, packPadMode = pack_pad)))

100%|██████████| 219/219 [00:02<00:00, 88.38it/s] 
  9%|▉         | 3/32 [00:00<00:01, 27.20it/s]

 creating the list of valid minibatches


100%|██████████| 32/32 [00:00<00:00, 84.88it/s]
  3%|▎         | 2/63 [00:00<00:03, 17.57it/s]

 creating the list of test minibatches


100%|██████████| 63/63 [00:00<00:00, 87.90it/s]


## Train Model

In [51]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [121]:
# import models as model 
#from embedding import EHRembeddings 
# from EHREmb import EHREmbeddings

In [None]:
import numpy as np 
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F 
use_cuda = torch.cuda.is_available()

#construct a whole embedding class from pytorch nn.module
#then we call this class in models after we define it 
class EHREmbeddings(nn.Module):
    #initialization and then the forward and things
    #DRNN has no bi, QRNN no bi, TLSTM has no bi, but DRNN has other cell-types 
    #cel_type are different for each model variation 
    def __init__(self, input_size, embed_dim ,hidden_size, n_layers=1,dropout_r=0.1,cell_type='LSTM', bii=False, time=False , preTrainEmb='', packPadMode = True):
        super(EHREmbeddings, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout_r = dropout_r
        self.cell_type = cell_type
        self.time=time
        self.preTrainEmb=preTrainEmb
        self.packPadMode = packPadMode
        if bii: 
            self.bi=2 
        else: 
            self.bi=1
            
        if len(input_size)==1:
            self.multi_emb=False
            if len(self.preTrainEmb)>0:
                emb_t= torch.FloatTensor(np.asmatrix(self.preTrainEmb))
                self.embed= nn.Embedding.from_pretrained(emb_t)#,freeze=False) 
                self.in_size= embed_dim ### need to be updated to be automatically capyured from the input
            else:
                input_size=input_size[0]
                self.embed= nn.Embedding(input_size, self.embed_dim,padding_idx=0)#,scale_grad_by_freq=True)
                self.in_size= embed_dim
        else:
            if len(input_size)!=3: 
                raise ValueError('the input list is 1 length')
            else: 
                self.multi_emb=True
                self.diag=self.med=self.oth=1

        #self.emb = self.embed.weight  LR commented Jul 10 19
        if self.time: self.in_size= self.in_size+1 
               
        if self.cell_type == "GRU":
            self.cell = nn.GRU
        elif self.cell_type == "RNN":
            self.cell = nn.RNN
        elif self.cell_type == "LSTM":
            self.cell = nn.LSTM
        elif self.cell_type == "QRNN":
            from torchqrnn import QRNN
            self.cell = QRNN
        elif self.cell_type == "TLSTM":
            from tplstm import TPLSTM
            self.cell = TPLSTM 
        else:
            raise NotImplementedError
       
        if self.cell_type == "QRNN": 
            self.bi=1 ### QRNN not support Bidirectional, DRNN should not be BiDirectional either.
            self.rnn_c = self.cell(self.in_size, self.hidden_size, num_layers= self.n_layers, dropout= self.dropout_r)
        elif self.cell_type == "TLSTM":
            self.bi=1 
            self.rnn_c = self.cell(self.in_size, hidden_size)
        else:
            self.rnn_c = self.cell(self.in_size, self.hidden_size, num_layers=self.n_layers, dropout= self.dropout_r, bidirectional=bii, batch_first=True)
         
        self.out = nn.Linear(self.hidden_size*self.bi,1)
        self.sigmoid = nn.Sigmoid()
      
                            
    #let's define this class method
    def EmbedPatients_MB(self,mb_t, mtd): #let's define this
        self.bsize=len(mb_t) ## no of pts in minibatch
        
        embedded = self.embed(mb_t)  ## Embedding for codes
        embedded = torch.sum(embedded, dim=2) 
        if self.time:
            mtd_t= Variable(torch.stack(mtd,0))
            if use_cuda: 
                mtd_t.cuda()
            out_emb= torch.cat((embedded,mtd_t),dim=2)
        else:
            out_emb= embedded
        if use_cuda:
            out_emb.cuda()        
        return out_emb
    


In [140]:
# Model 1:RNN & Variations: GRU, LSTM, Bi-RNN, Bi-GRU, Bi-LSTM
class EHR_RNN(EHREmbeddings):
    def __init__(self,input_size,embed_dim, hidden_size, n_layers=1,dropout_r=0.1,cell_type='GRU',bii=False ,time=False, preTrainEmb='',packPadMode = True):
       	EHREmbeddings.__init__(self,input_size, embed_dim ,hidden_size, n_layers=n_layers, dropout_r=dropout_r, cell_type=cell_type, bii=bii, time=time , preTrainEmb=preTrainEmb, packPadMode=packPadMode)
    #embedding function goes here 
    
    def EmbedPatient_MB(self, input, mtd):
        return EHREmbeddings.EmbedPatients_MB(self, input, mtd)
    
    def init_hidden(self):
        h_0 = Variable(torch.rand(self.n_layers*self.bi,self.bsize, self.hidden_size))
        if use_cuda: 
            h_0.cuda()
        if self.cell_type == "LSTM":
            result = (h_0,h_0)
        else: 
            result = h_0
        return result
    
    def forward(self, input, x_lens, mtd):
        x_in  = self.EmbedPatient_MB(input, mtd) 
        ### uncomment the below lines if you like to initiate hidden to random instead of Zero which is the default
        #h_0= self.init_hidden()
        #if use_cuda: h_0.cuda()
        if self.packPadMode: 
            x_inp = nn.utils.rnn.pack_padded_sequence(x_in,x_lens,batch_first=True)   
            output, hidden = self.rnn_c(x_inp)#,h_0) 
        else:
            output, hidden = self.rnn_c(x_in)#,h_0) 
        
        if self.cell_type == "LSTM":
            hidden=hidden[0]
        if self.bi==2:
            output = self.sigmoid(self.out(torch.cat((hidden[-2],hidden[-1]),1)))
        else:
            output = self.sigmoid(self.out(hidden[-1]))
        return output.squeeze()

In [141]:
input_size=[30000]
embed_dim=128
hidden_size=128
n_layers=2
dropout_r=0.2
patience=3




In [142]:
model = EHR_RNN(input_size= input_size, 
                          embed_dim=embed_dim, 
                          hidden_size= hidden_size,
                          dropout_r=dropout_r) 

optimizer = optim.Adam(model.parameters(), 
                           lr=0.01, 
                           weight_decay=1e-04)


In [143]:
criterion = nn.BCELoss()

In [None]:
epochs=10
for ep in range(epochs):
    for i,batch in enumerate(train_mbs):
        sample, label_tensor, seq_l, mtd = batch
        
        model.train() ## LR added Jul 10, that is the right implementation
        model.zero_grad()
        output = model(sample,seq_l, mtd)   
        loss = criterion(output, label_tensor.ravel())    
        loss.backward()   
        optimizer.step()

    print(f'Epoch {ep}, Loss {loss}')


Epoch 0, Loss0.5003598928451538
Epoch 1, Loss0.1714329868555069
Epoch 2, Loss0.41065284609794617
Epoch 3, Loss0.15212054550647736
Epoch 4, Loss0.3622772991657257
Epoch 5, Loss0.1880924552679062
