# DL Model
This notebook defines, trains, and validates the DL model.

It first loads following data files generated from pre-processing steps. Input and output data Tensors are created and pushed to GPU memory. Then, model is defined, trained, and validated.

In [30]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.parsing.preprocessing import preprocess_string
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
import math
import csv
import pickle
import time

In [2]:
# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

In [3]:
# Set the device type as cpu or cuda depending upon the execution environment.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

In [5]:
# Mount the google drive at 'drive' directory in the colab virtual machine.

from google.colab import drive
drive.mount('drive')

Mounted at drive


In [6]:
# Define variable to point to the project directory in google drive.

PROJECT_DIR = 'drive/My Drive/cs598-dl/'

In [7]:
'''
Import the "notes" DataFrame, with HADM_ID (Hospitalization ID) as index and TEXT (Discharge summary) as column, 
created in pre-processing step. Its dimensions are 52691 rows × 1 columns.
'''

notes_df = pd.read_pickle(PROJECT_DIR + 'data/notes.pkl.gz')

In [8]:
notes_df

Unnamed: 0_level_0,TEXT
HADM_ID,Unnamed: 1_level_1
167853,Admission Date: [**2151-7-16**] Dischar...
107527,Admission Date: [**2118-6-2**] Discharg...
167118,Admission Date: [**2119-5-4**] D...
196489,Admission Date: [**2124-7-21**] ...
135453,Admission Date: [**2162-3-3**] D...
...,...
147266,Admission Date: [**2147-2-25**] ...
129802,Admission Date: [**2190-5-13**] ...
182558,Admission Date: [**2121-6-13**] ...
184741,Admission Date: [**2182-4-19**] ...


In [9]:
'''
Import the "codes" DataFrame, with HADM_ID (Hospitalization ID) as index and multi-hot encoding of 
ICD9-codes (booleans) as columns. Its dimensions are a 52691 rows × 6984 columns. So, we have 6984 ICD9 codes.
'''

codes = pd.read_pickle(PROJECT_DIR + 'data/diagnoses.pkl.gz')

In [10]:
codes

Unnamed: 0_level_0,ICD9_CODE_0030,ICD9_CODE_0031,ICD9_CODE_0038,ICD9_CODE_0039,ICD9_CODE_0041,ICD9_CODE_0048,ICD9_CODE_0049,ICD9_CODE_0051,ICD9_CODE_00581,ICD9_CODE_0059,...,ICD9_CODE_V8801,ICD9_CODE_V8811,ICD9_CODE_V8812,ICD9_CODE_V8821,ICD9_CODE_V9010,ICD9_CODE_V902,ICD9_CODE_V9039,ICD9_CODE_V9081,ICD9_CODE_V9089,ICD9_CODE_V9103
HADM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
100003,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
100006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
100007,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
100009,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199993,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
199994,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
199995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
199998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
'''
Split the HADM_IDs (hospitalization IDs) into train-test in 90:10 ratio. Two lists are generated:
    - hadm_ids_train
    - hadm_ids_test
'''

hadm_ids_train, hadm_ids_test = train_test_split(notes_df.index.tolist(), test_size = 0.10, random_state=seed)
print(hadm_ids_train)

[156291, 145717, 153026, 115213, 174814, 157315, 129826, 138077, 171444, 148661, 177938, 106621, 138337, 166833, 136350, 124868, 103579, 106674, 138500, 150047, 183784, 153770, 199214, 189738, 124066, 149766, 154681, 141291, 187747, 193722, 143747, 141626, 130133, 116229, 124998, 180054, 122123, 138715, 179349, 144916, 132920, 199713, 199900, 179153, 128247, 130772, 136909, 121383, 126049, 192358, 146687, 181790, 124381, 197892, 105244, 121340, 136606, 122153, 101106, 102382, 164531, 163687, 106443, 186056, 114659, 197081, 100177, 178592, 155541, 120762, 161383, 104297, 146254, 144565, 151854, 137276, 161662, 104398, 179659, 165606, 184332, 133061, 168160, 148694, 172673, 124570, 116303, 189561, 158039, 128104, 160163, 140856, 155483, 101193, 188196, 121289, 193656, 103394, 165964, 165640, 165933, 116231, 166923, 182239, 184568, 173306, 130895, 180007, 181263, 113968, 139011, 112496, 169134, 183461, 133710, 166497, 135212, 104787, 116832, 146648, 107683, 178450, 150451, 187297, 144582,

In [12]:
'''
Create training tensor consisting of multi-hot ICD9-codes in each row, where each row corresponds to HADM_ID in 
hadm_ids_train list.
'''

codes_train = torch.zeros((len(hadm_ids_train), 6984), dtype=bool)
for index, hadm_id in enumerate(hadm_ids_train):
  vec = torch.tensor(codes.loc[hadm_id].to_numpy())
  codes_train[index] = vec

In [13]:
# Push the tensor to GPU memory.

codes_train = codes_train.to(device)

In [14]:
'''
Create test tensor consisting of multi-hot ICD9-codes in each row, where each row corresponds to HADM_ID in 
hadm_ids_test list.
'''
codes_test = torch.zeros((len(hadm_ids_test), 6984), dtype=bool)
for index, hadm_id in enumerate(hadm_ids_test):
  vec = torch.tensor(codes.loc[hadm_id].to_numpy())
  codes_test[index] = vec

In [16]:
'''
Load the Doc2Vec embeddings for discharge summary reports, generated during pre-processing step. The data is in 
Gensim KeyedVector format.
'''

dv = KeyedVectors.load(PROJECT_DIR + 'data/dv.kv')

In [17]:
'''
Create the training tensor consisting of a Doc2Vec embedding in each row, where each row corresponds to HADM_ID in 
hadm_ids_train list.
'''

dv_train = torch.zeros((len(hadm_ids_train), 128))
for index, hadm_id in enumerate(hadm_ids_train):
  vec = torch.Tensor(dv[str(hadm_id)].tolist())
  dv_train[index] = vec

In [18]:
# Push the tensor to GPU memory. Presence of pre-processed data in GPU memory helps improving the performance.

dv_train = dv_train.to(device)

In [19]:
dv_train.is_cuda

True

In [20]:
'''
Create the test tensor consisting of a Doc2Vec embedding in each row, where each row corresponds to HADM_ID in 
hadm_ids_test list.
'''

dv_test = torch.zeros((len(hadm_ids_test), 128))
for index, hadm_id in enumerate(hadm_ids_test):
  vec = torch.Tensor(dv[str(hadm_id)].tolist())
  dv_test[index] = vec

In [21]:
# Push the tensor to GPU memory.

dv_test = dv_test.to(device)

In [None]:
'''
Load the Word2Vec embeddings of all the words in vocabulary of the whole corpus, generated during the 
pre-processing step. This data is in Gensim KeyedVector format.
'''

wv = KeyedVectors.load(PROJECT_DIR + 'data/wv.kv')

In [None]:
'''
Load the dictionary mapping HADM_ID with a tokenized document. These tokens are basically the list of words 
belonging to the corresponding Discharge summary report.
'''

with open(PROJECT_DIR + 'data/tokens_map.pkl', 'rb') as handle:
  tokens_dict = pickle.load(handle)

In [None]:
'''
Create the training tensor consisting of a concatenated Word2Vec embeddings of all the words in the given document, 
in each row, where each row corresponds to HADM_ID in hadm_ids_train list. This is a very slow running step, 
but once done, helps train the model faster.
'''

tokens_train = torch.zeros((len(hadm_ids_train), 70000))
for index, hadm_id in enumerate(hadm_ids_train):
  tokens = tokens_dict[hadm_id]
  word_vecs = torch.Tensor(wv.__getitem__(tokens).flatten().tolist())
  tokens_train[index][0:len(word_vecs)] = word_vecs

In [None]:
# Push the tokens_train tensor to GPU memory.

tokens_train = tokens_train.to(device)

In [None]:
'''
Create the test tensor consisting of a concatenated Word2Vec embeddings of all the words in the given document, in 
each row, where each row corresponds to HADM_ID in hadm_ids_test list.
'''

tokens_test = torch.zeros((len(hadm_ids_test), 70000))
for index, hadm_id in enumerate(hadm_ids_test):
  tokens = tokens_dict[hadm_id]
  word_vecs = torch.Tensor(wv.__getitem__(tokens).flatten().tolist())
  tokens_test[index][0:len(word_vecs)] = word_vecs

In [None]:
# Push the tensor to GPU memory.

tokens_test = tokens_test.to(device)

In [22]:
# how many samples per batch to load
batch_size = 50

In [23]:
'''
Define the implementation of PyTorch Dataset. This is very light-weight, as all the data tensors (dv_train, 
dv_test, tokens_train, tokens_test, and codes_train) have already been pushed to GPU memory. So they can be 
referenced by HADM_ID index. This dataset simply returns the input index as the data in __getitem()__ method, which 
will be used during model training to access the input and output data from data tensors.
'''

class DocumentsDataset(Dataset):
    def __init__(self, count):
        super(DocumentsDataset).__init__()
        self.count = count
    def __len__(self):
        return self.count
    def __getitem__(self, idx):  
        return idx

In [24]:
# prepare dataloaders
train_loader = DataLoader(DocumentsDataset(len(hadm_ids_train)), batch_size = batch_size, shuffle = True)
test_loader = DataLoader(DocumentsDataset(len(hadm_ids_test)), batch_size = batch_size)

print("# of train batches:", len(train_loader))
print("# of val batches:", len(test_loader))

# of train batches: 949
# of val batches: 106


## Create DL Model

The model is a DL network with two "logical" components:
- **Encoder** to generate document embeddings: The function of this component is to generate effective fixed-length embedding for a given discharge summary document.This component consists of two "logical" sub-components:
        - D2V: This sub-component first trains (as pre-processing step) Doc2Vec model to learn input document vectors of length `128`, in an unsupervised way. It then fine tunes this vector,using a fully connected layer of `64` neurons, followed by a non-linear activation like sigmoid. This fine-tune layer is trained in supervised way.
        - CNN: This sub-component trains a Word2Vec model as pre-processing step to build word vectors for the whole vocabulary of the collective corpus of documents. For each document, all the vectors corresponding to the contained words, are concatenated, to represent the given document. These document vectors are used as input to the CNN sub-component. This sub-component actually comprises of 3 single-layer multi-channel CNN models. Three CNN models correspond to 3 kernel sizes (of 3, 4, and 5 words)) with 64 output channels each. For CNN layer in each model is followed by a MaxPool layer to perform temporal pooling. The outputs of each of these CNN models are concatenated to generate the output vector per document of size `192 (3 models * 64 channels each)`. 

The ouput vectors from the two sub-components (D2V and CNN) are concatenated to produce the final vector for each document in the batch. Ths final vector size is `256 (64 from DNN + 192 from CNN)`.

- **Classifier** to perform multi-label classification of ICD-9 codes. This component consists of:
    - Dropout layer: The document vector generated by encoder component is regularized by stochastically dropping different dimensions.
	- Fully connected layer with sigmoid activation: This layer generates the final output of size `6984` (total number of ICD-9 codes)}. Each dimension (representing an ICD-9 code) is assigned a probability by sigmoid activation.



<img src="./architecture.png">

In [1]:
'''
Define the model class which represents the D2V sub-component that uses single fully connected layer to fine-tune 
the Doc2Vec embeddings. The output is a vector representing the input document.
'''

class D2V(nn.Module):
    def __init__(self):
        super(D2V, self).__init__()
        self.fc = nn.Linear(128, 64)
    def forward(self, x):
        return torch.sigmoid(self.fc(x))

NameError: name 'nn' is not defined

In [None]:
'''
Define the model class which is the building block of the CNN sub-component containining a single layer of 
multi-channel CNN kernel, activation function, and max-pooling layer.
'''
# Define the model class which performs 1D-CNN and max-pooling. Details in starting cell.

class CNN(nn.Module):
    def __init__(self, kernel_size):
        super(CNN, self).__init__()
        self.conv = nn.Conv1d(1, 64, kernel_size)
        self.pool = nn.MaxPool1d(70000 - kernel_size + 1)
    def forward(self, x):
        return self.pool(torch.sigmoid(self.conv(x))).squeeze(2)

In [None]:
'''
Define the model class for CNN sub-component. This component concatenates the output of 3 CNN components (described 
previously), each corresponding to different kernel size (3, 4, and 5 words). Its input is "concatenated" Word2Vec 
embeddings of all words within a document, which it passes to the three CNN components parallely, and then combines 
their output to create a vector representing the input document.
'''

class CNN_COMBINED(nn.Module):
    def __init__(self):
        super(CNN_COMBINED, self).__init__()
        self.conv_3 = CNN(300)
        self.conv_4 = CNN(400)
        self.conv_5 = CNN(500)
    def forward(self, x):
        return torch.cat((self.conv_3(x), self.conv_4(x), self.conv_5(x)), dim = 1)

In [40]:
'''
Define the class for the main model, which concatenates the output from D2V and CNN_COMBINED sub-components, as the 
final vector representation of the input document. It is the input to the multi-label classification task. The 
classification layer consists of a dropout layer to achieve regularization by stochastically dropping different 
dimensions of input document vector. It is followed by a fully connected layer with sigmoid activation: This layer 
generates the final output vector of size of 6984 (total number of ICD-9 codes)}. Each dimension (representing an 
ICD-9 code) is assigned a probability by sigmoid activation.
'''

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.d2v = D2V()
        self.cnn = CNN_COMBINED()
        self.fc2 = nn.Linear(256, 6984)
        self.dropout = nn.Dropout(p = 0.20)

    def forward(self, x_indexes, train = True):
        x_d2v = dv_train[x_indexes] if train else dv_test[x_indexes]
        x_cnn = tokens_train[x_indexes] if train else tokens_test[x_indexes]
        x_cnn = x_cnn.unsqueeze(dim = 1)
        y_d2v = self.d2v(x_d2v)
        y_cnn = self.cnn(x_cnn)
        return torch.sigmoid(self.fc2(self.dropout(torch.cat((y_d2v, y_cnn), dim = 1))))

In [69]:
# Initialize the model and push it to GPU memory.

model = Net()
model.to(device)

Net(
  (d2v): D2V(
    (fc): Linear(in_features=128, out_features=64, bias=True)
  )
  (fc2): Linear(in_features=64, out_features=6984, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [70]:
# Define the loss function and optimizer for back-propagation.
 
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [71]:
# Model Evaluation

from sklearn.metrics import *


def classification_metrics(Y_pred, Y_true):
    """
    Calculate peformance metrics using scikit-learn.
    
    Arguments:
        Y_pred: Long dtype Tensor of output values for the test set batch, as predicted by model.
        Y_true: Long dtype Tensor of true values in the test-set batch.
        
    Outputs:
        precision: overall micro-averaged precision score
        recall: overall micro-averaged recall score
        f1: overall micro-averaged f1 score
        
    REFERENCE: checkout https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
"""
    
    precision, recall, f1score = precision_score(Y_true, Y_pred, average = 'micro'), \
                                           recall_score(Y_true, Y_pred, average = 'micro'), \
                                           f1_score(Y_true, Y_pred, average = 'micro')
    return precision, recall, f1score


def evaluate(model, loader, threshold):
    """
    Evaluate the model.
    
    Arguments:
        model: Trained model of type nn.Module
        loader: Test DataLoader
        
    Outputs:
        precision: overall micro-averaged precision score
        recall: overall micro-averaged recall score
        f1: overall micro-averaged f1 score
"""
    
    model.eval()
    all_y_true = torch.LongTensor()
    all_y_pred = torch.LongTensor()
    for x_indexes in loader:
        x_indexes = x_indexes.long()
        y = codes_test[x_indexes]
        x_indexes = x_indexes.to(device)
        y_hat = model(x_indexes, False)
        y_pred = y_hat.detach().to('cpu').apply_(lambda x: 1 if x > threshold else 0)
        all_y_true = torch.cat((all_y_true, y.long()), dim=0)
        all_y_pred = torch.cat((all_y_pred,  y_pred.to('cpu').long()), dim=0)
        
    precision, recall, f1 = classification_metrics(all_y_pred.detach().numpy(), all_y_true.detach().numpy())
    print(f"precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")
    return precision, recall, f1

In [72]:
# Define the epoch counter. This is defined separately to facilitate multi-step training.
epoch = 0
#train_loss = 0

In [75]:
# Model Training

# number of epochs to train the model
n_epochs = 80

# The classification probability thershold. 
threshold = 0.20

sta = time.time()

while epoch < n_epochs:
    # prep model for training
    model.train()

    train_loss = 0
    for x_indexes in train_loader:
        x_indexes = x_indexes.long()
        x_indexes = x_indexes.to(device)
        y = codes_train[x_indexes].float()
        optimizer.zero_grad()
        y_hat = model(x_indexes, True)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
    print('Epoch: {} done!'.format(epoch))
    if epoch % 2 == 0:
        evaluate(model, test_loader, threshold)
    epoch += 1

end = time.time()
print('Time spent:' + str(end - sta))

Epoch: 60 done!
precision: 0.294, recall: 0.320, f1: 0.306
Epoch: 61 done!
Epoch: 62 done!
precision: 0.291, recall: 0.322, f1: 0.306
Epoch: 63 done!
Epoch: 64 done!
precision: 0.294, recall: 0.320, f1: 0.307
Epoch: 65 done!
Epoch: 66 done!
precision: 0.291, recall: 0.324, f1: 0.307
Epoch: 67 done!
Epoch: 68 done!
precision: 0.290, recall: 0.326, f1: 0.307
Epoch: 69 done!
Epoch: 70 done!
precision: 0.291, recall: 0.324, f1: 0.307
Epoch: 71 done!
Epoch: 72 done!
precision: 0.292, recall: 0.321, f1: 0.306
Epoch: 73 done!
Epoch: 74 done!
precision: 0.291, recall: 0.321, f1: 0.305
Epoch: 75 done!
Epoch: 76 done!
precision: 0.292, recall: 0.323, f1: 0.307
Epoch: 77 done!
Epoch: 78 done!
precision: 0.293, recall: 0.321, f1: 0.306
Epoch: 79 done!
Time spent:357.83302330970764


In [None]:
# Save model internal state as intermediate checkpoint to facilitate multi-step training.

torch.save(model.state_dict(), PROJECT_DIR + 'data/checkpoint.pth')