In [None]:
#########################
# 1. PREPROCESS
# 1-1. Setup & Load Data
#########################
'''
# Module Installation
!{sys.executable} -m pip install pandas
'''
import sys
import os
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

SEED = 1234
# NOTE: the interpreter reads PYTHONHASHSEED at start-up, so this assignment does not
# re-seed string hashing in the already-running kernel.
os.environ["PYTHONHASHSEED"] = str(SEED)

# 1-1-1.Define Data Path
DATA_PATH = 'mimic-iii-1.4'
assert os.path.isdir(DATA_PATH)
DATA_FILE = 'notes_labeled.csv'


def _format_icd9(code, is_diagnosis):
    """Insert the decimal point into a raw (dot-less) ICD-9 code.

    Procedure codes get the point after the 2nd character, diagnosis codes after
    the 3rd, and diagnosis E-codes after the 4th (e.g. 'E8781' -> 'E878.1').
    Missing values (NaN/None) and codes too short to take a decimal point are
    returned unchanged instead of raising.
    """
    if not isinstance(code, str):
        return code
    if is_diagnosis:
        split_at = 4 if code.startswith('E') else 3
    else:
        split_at = 2
    if len(code) <= split_at:
        return code
    return code[:split_at] + '.' + code[split_at:]


notes = pd.read_csv('%s/%s' % (DATA_PATH, 'NOTEEVENTS.csv'), usecols=['HADM_ID', 'CATEGORY', 'TEXT'], dtype={'CATEGORY': str, 'TEXT': str})
procodes = pd.read_csv('%s/%s' % (DATA_PATH, 'PROCEDURES_ICD.csv'), usecols=['HADM_ID', 'ICD9_CODE'], dtype={'HADM_ID': np.uint64, 'ICD9_CODE': str})
digcodes = pd.read_csv('%s/%s' % (DATA_PATH, 'DIAGNOSES_ICD.csv'), usecols=['HADM_ID', 'ICD9_CODE'], dtype={'HADM_ID': np.uint64, 'ICD9_CODE': str})
# Work on copies so the raw frames stay available for inspection.
pcodes = procodes.copy()
dcodes = digcodes.copy()
pcodes['ICD9_CODE'] = pcodes['ICD9_CODE'].apply(lambda code: _format_icd9(code, is_diagnosis=False)) # decimal point between the 2nd and 3rd digit
dcodes['ICD9_CODE'] = dcodes['ICD9_CODE'].apply(lambda code: _format_icd9(code, is_diagnosis=True)) # decimal point between the 3rd and 4th digit (4th and 5th for E-codes)
# Procedure and diagnosis codes share one label space from here on.
codes = pd.concat([pcodes, dcodes])
codes.to_csv('%s/%s' % (DATA_PATH, 'codes.csv'), index=False, columns=['HADM_ID', 'ICD9_CODE'], header=['HADM_ID', 'ICD9_CODE'])

In [None]:
#########################
# 1-2. Preprocess Data
#########################
import string
import csv

# 1-2-1.Preprocess Clinical Notes
def preprocess_text(df):
    """Keep discharge summaries and normalise their free text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'CATEGORY' and 'TEXT'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with the rows whose CATEGORY is
        'Discharge summary', without the CATEGORY column, and with TEXT cleaned:
        missing text -> ' ', newlines/returns/punctuation/digit runs -> space,
        lower-cased, and whitespace runs collapsed to a single space.
    """
    # Boolean mask + drop(columns=...) yields a fresh frame, so the caller's data is not mutated.
    summaries = df.loc[df['CATEGORY'] == 'Discharge summary'].drop(columns=['CATEGORY'])

    # Translation table mapping every punctuation character to a space, e.g. {'=': ' '}.
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

    summaries['TEXT'] = (
        summaries['TEXT']
        .fillna(' ')                              # replace NA with space
        .str.replace('\n', ' ', regex=False)      # replace newline with space
        .str.replace('\r', ' ', regex=False)      # replace return with space
        .str.translate(punctuation_to_space)      # replace punctuation with space
        # regex=True is required: without it newer pandas treats the pattern as a literal string
        .str.replace(r'\d+', ' ', regex=True)     # replace digit runs with space
        .str.lower()                              # lowercase
        .str.replace(r'\s+', ' ', regex=True)     # collapse whitespace runs
    )
    return summaries

# 1-2-2.Concat Notes
def concat_notes(df):
    """Merge all notes of one admission into a single space-joined text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'HADM_ID' and 'TEXT'.

    Returns
    -------
    pandas.DataFrame
        Columns ['HADM_ID', 'TEXT'], one row per HADM_ID in ascending order;
        HADM_ID is an integer column. Rows without a HADM_ID are skipped and an
        empty input yields an empty frame.
    """
    if df.empty:
        return pd.DataFrame(columns=['HADM_ID', 'TEXT'])
    # A single groupby replaces the row-by-row DataFrame.append loop, which is
    # quadratic and unavailable in current pandas releases.
    merged = (
        df.dropna(subset=['HADM_ID'])
        .assign(TEXT=lambda frame: frame['TEXT'].astype(str))
        .groupby('HADM_ID', sort=True)['TEXT']
        .agg(' '.join)
        .reset_index()
    )
    merged['HADM_ID'] = merged['HADM_ID'].astype('int64')
    return merged[['HADM_ID', 'TEXT']]

# 1-2-3.Exclude Unnecessary Codes (Consolidate Labels)
def exclude_codes(df1, df2):
    """Return the rows of df2 whose HADM_ID also occurs in df1.

    Rows are selected by position, so the result keeps df2's original order
    and index labels.
    """
    known_ids = set(df1['HADM_ID'])
    keep_positions = [
        position
        for position, hadm_id in enumerate(df2['HADM_ID'])
        if hadm_id in known_ids
    ]
    return df2.iloc[keep_positions]

# 1-2-4.Concat Codes
def concat_codes(df):
    """Collapse the code rows of each admission into one ';'-joined string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'HADM_ID' and 'ICD9_CODE'.

    Returns
    -------
    pandas.DataFrame
        Columns ['HADM_ID', 'ICD9_CODE'], one row per HADM_ID in ascending
        order; HADM_ID is an integer column. An empty input yields an empty frame.

    Notes
    -----
    Codes are stringified before joining, so a missing code shows up as the
    literal 'nan' (same as the previous row-by-row implementation).
    """
    if df.empty:
        return pd.DataFrame(columns=['HADM_ID', 'ICD9_CODE'])
    # One groupby instead of repeated DataFrame.append calls (quadratic and
    # unavailable in current pandas releases).
    merged = (
        df.dropna(subset=['HADM_ID'])
        .assign(ICD9_CODE=lambda frame: frame['ICD9_CODE'].astype(str))
        .groupby('HADM_ID', sort=True)['ICD9_CODE']
        .agg(';'.join)
        .reset_index()
    )
    merged['HADM_ID'] = merged['HADM_ID'].astype('int64')
    return merged[['HADM_ID', 'ICD9_CODE']]

# 1-2-5.Merge & Save (notes_labeled.csv)
def merge_save(df1, df2, file_name):
    """Inner-join notes and codes on HADM_ID, add a token count, and save as CSV.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Notes with 'HADM_ID' and 'TEXT'.
    df2 : pandas.DataFrame
        Codes with 'HADM_ID' and 'ICD9_CODE'.
    file_name : str
        Name of the CSV written inside DATA_PATH.

    Returns
    -------
    pandas.DataFrame
        The merged frame with an added 'LENGTH' column (number of
        whitespace-separated tokens in TEXT), sorted by LENGTH descending.
    """
    df1 = df1.sort_values(['HADM_ID'])
    df2 = df2.sort_values(['HADM_ID'])
    df = pd.merge(df1, df2, how='inner', on='HADM_ID')
    # Vectorised token count instead of a per-row Python lambda.
    df['LENGTH'] = df['TEXT'].astype(str).str.split().str.len()
    df = df.sort_values(['LENGTH'], ascending=False) # descending order
    # No escapechar is passed: an empty string is rejected by the csv module on
    # recent Python versions. QUOTE_NONE is kept, so fields must not contain commas.
    df.to_csv('%s/%s' % (DATA_PATH, file_name), columns=['HADM_ID', 'LENGTH', 'TEXT', 'ICD9_CODE'],
              index=False, header=True, quoting=csv.QUOTE_NONE)
    return df

# Preprocess Data
# Order matters: every step rebinds `notes` / `codes`, so this cell is not idempotent —
# re-running it requires reloading the raw frames from cell 1-1 first.
notes = preprocess_text(notes)  # keep discharge summaries only and normalise their text
notes = concat_notes(notes)  # one row per HADM_ID
codes = exclude_codes(notes, codes)  # keep only codes whose HADM_ID has a note
codes = concat_codes(codes)  # one ';'-joined code string per HADM_ID
notes['HADM_ID'] = notes['HADM_ID'].astype('int')  # cast the join key to int before merging
notes_labeled = merge_save(notes, codes, DATA_FILE)  # inner-join, add LENGTH, write DATA_FILE

In [None]:
#########################
# 1-3. Top ICD9_CODE
#########################
from collections import Counter
TOP = 50

# 1-3-1.Top ICD9_CODE
def get_top_codes(df, top=TOP):
    """Return the `top` most frequent codes found in df['ICD9_CODE'].

    Each cell is a ';'-separated code string; every occurrence is counted.
    Only the code strings are returned (most frequent first), not the counts.
    """
    tally = Counter(
        code
        for joined in df['ICD9_CODE']
        for code in str(joined).split(';')
    )
    return [code for code, _count in tally.most_common(top)]

# 1-3-2.Save Top Codes
def save_top_codes(codes):
    """Write the given codes to DATA_PATH/codes_top.csv (one per line, no header).

    Parameters
    ----------
    codes : list of str
        Codes to persist, in the order they should appear in the file.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('ICD9_CODE') holding the saved codes.
    """
    df = pd.DataFrame(codes, columns=['ICD9_CODE'])
    # No escapechar is passed: an empty string is rejected by the csv module on
    # recent Python versions.
    df.to_csv('%s/codes_top.csv' % DATA_PATH, columns=['ICD9_CODE'], index=False, header=False, quoting=csv.QUOTE_NONE)
    return df

# Top ICD9_CODE
# NOTE: `codes_top` is first a list of code strings and is then rebound to the
# single-column DataFrame returned by save_top_codes; cell 1-6 indexes it by 'ICD9_CODE'.
codes_top = get_top_codes(notes_labeled)
codes_top = save_top_codes(codes_top)

In [None]:
#########################
# 1-4. Word2Vec Model
#########################
'''
# Module Installation
!{sys.executable} -m pip install gensim
'''
from gensim.models import Word2Vec

# 1-4-1.Word2Vec for Word Embedding based on the frequency of occurrence of a word
# similar words have vectors near each other (relationships = distance in vector space)
def create_w2v(text, vec_size=100, window=5, min_count=3, workers=4): # Default: vec_size=100, window=5
    """Train a Word2Vec model on tokenised text.

    Parameters
    ----------
    text : list of list of str
        One token list per document.
    vec_size : int
        Dimensionality of the word vectors.
    window : int
        Context window size; it is now forwarded to Word2Vec (it used to be
        accepted here but silently ignored).
    min_count : int
        Words occurring fewer times than this are dropped from the vocabulary.
    workers : int
        Number of training threads.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    # Word2Vec default: sg=0:CBOW(ContinuousBagOfWords)(sg=1:Skip-gram), vector_size=100, min_count=5, workers=3, epochs=5
    # NOTE(review): `size=` is the keyword this notebook already uses; confirm it
    # matches the installed gensim release before upgrading.
    w2v = Word2Vec(sentences=text, size=vec_size, window=window, min_count=min_count, workers=workers)
    return w2v

# 1-4-2.Tokenize Text into Words on notes_labeled
# Whitespace-tokenise every note: one list of tokens per row of notes_labeled.
text = [vals.split() for vals in notes_labeled.TEXT.values]

# W2V on text of notes_labeled
w2v = create_w2v(text)
# Persist the trained model; create_embeddings() and cell 3-2 load it back from this path.
w2v.save('%s/%s' % (DATA_PATH, 'w2v.model'))

In [None]:
#########################
# 1-5. Vocabulary
#########################
# 1-5-1.Get Vocab List
def get_vocab_list(vocab):
    """Materialise the vocabulary as a list and save it to DATA_PATH/vocab.csv.

    The file has one entry per line, no header and no quoting. The list is
    returned in the iteration order of `vocab`.
    """
    words = [token for token in vocab]
    target = '%s/vocab.csv' % DATA_PATH
    pd.DataFrame(words).to_csv(target, index=False, header=False, quoting=csv.QUOTE_NONE)
    return words

# Vocabulary
# NOTE(review): `wv.vocab` is the attribute used throughout this notebook; confirm it
# exists in the installed gensim release.
vocab = w2v.wv.vocab # datatype = dictionary
del w2v # free up memory
vocab_list = get_vocab_list(vocab)  # also writes DATA_PATH/vocab.csv

In [None]:
#########################
# 1-6. Top ICD9_CODE Text
#########################
MAX_LENGTH = 2500

# 1-6-1.Get Max Length Text
def get_max_length_text(text):
    """Truncate a token sequence to its first MAX_LENGTH tokens.

    The previous implementation looped over the global `vocab_list` without
    ever comparing tokens against it; for inputs longer than MAX_LENGTH (the
    only way it is called) the net effect was exactly this truncation, while
    shorter inputs got duplicated tokens. The global dependency and the
    duplication are removed here.

    NOTE(review): if the intent was to keep only in-vocabulary tokens, that
    filter was never applied — confirm the desired behaviour.

    Parameters
    ----------
    text : sequence of str
        Tokens of one note.

    Returns
    -------
    list of str
        At most MAX_LENGTH leading tokens, in their original order.
    """
    return list(text[:MAX_LENGTH])

# 1-6-2.Get Top notes_labeled
def get_notes_labeled_top(df, codes_top, file_name):
    """Keep notes carrying at least one top code, restricted to those codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Labeled notes with 'HADM_ID', 'LENGTH', 'TEXT' and 'ICD9_CODE'
        (';'-joined codes).
    codes_top : iterable of str
        The codes to keep.
    file_name : str
        Name of the CSV written inside DATA_PATH.

    Returns
    -------
    pandas.DataFrame
        One row per retained note, sorted by LENGTH descending. ICD9_CODE holds
        only the retained codes (sorted, ';'-joined); notes longer than
        MAX_LENGTH tokens are truncated via get_max_length_text.
    """
    codes_top = set(codes_top)
    cols = ['HADM_ID', 'LENGTH', 'TEXT', 'ICD9_CODE']
    # Collect plain records and build the frame once; appending to a DataFrame
    # per row is quadratic and unavailable in current pandas releases.
    records = []
    for row in df.itertuples():
        # sorted() makes the joined label string deterministic (set order is not).
        kept_codes = sorted(codes_top.intersection(str(row.ICD9_CODE).split(';')))
        if not kept_codes:
            continue
        length = int(row.LENGTH)
        tokens = str(row.TEXT).split()
        if length > MAX_LENGTH:
            tokens = get_max_length_text(tokens)
            length = len(tokens)
        records.append({'HADM_ID': int(row.HADM_ID), 'LENGTH': int(length),
                        'TEXT': ' '.join(tokens), 'ICD9_CODE': ';'.join(kept_codes)})

    df0 = pd.DataFrame(records, columns=cols)
    df0 = df0.sort_values(['LENGTH'], ascending=False) # descending order
    # No escapechar is passed: an empty string is rejected by the csv module on
    # recent Python versions.
    df0.to_csv('%s/%s' % (DATA_PATH, file_name), columns=cols,
               index=False, header=True, quoting=csv.QUOTE_NONE)
    return df0

# Top notes_labeled
# `codes_top` is the DataFrame returned by save_top_codes here, hence the column lookup.
notes_labeled_top = get_notes_labeled_top(notes_labeled, codes_top['ICD9_CODE'], 'notes_labeled_top.csv')

In [None]:
#########################
# 1-7. Split Data for TOP
#########################
import random

# 1-7-1.Split Data of top_notes_labeled
def split_data(df, train_name='train_top.csv', valid_name='valid_top.csv', test_name='test_top.csv', ratio1=0.9, ratio2=0.7, seed=None):
    """Shuffle and split df into train/valid/test parts and save each as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'HADM_ID', 'LENGTH', 'TEXT' and 'ICD9_CODE'.
    train_name, valid_name, test_name : str
        File names written inside DATA_PATH.
    ratio1 : float
        Fraction of all rows used for training.
    ratio2 : float
        Fraction of the remaining rows used for validation; the rest is test.
    seed : int or None
        Seed for the shuffle when no cached permutation exists. None keeps the
        previous unseeded behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, valid, test).

    Raises
    ------
    ValueError
        If DATA_PATH/random_indices.csv exists but is not a permutation of
        range(len(df)), i.e. it was produced for a different dataset.
    """
    cols = ['HADM_ID', 'LENGTH', 'TEXT', 'ICD9_CODE']
    max_length = len(df)
    train_idx = int(max_length * ratio1)
    valid_idx = train_idx + int((max_length - train_idx) * ratio2)

    # Reproduce the same train/valid/test datasets and the results
    random_indices_file = '%s/%s' % (DATA_PATH, 'random_indices.csv')
    if os.path.isfile(random_indices_file):
        idxs = (pd.read_csv(random_indices_file, header=None, dtype=np.uint64)[0]).tolist()
        # A stale cache would silently drop rows or index out of bounds.
        if sorted(idxs) != list(range(max_length)):
            raise ValueError('%s does not match a dataset of %d rows; delete it to create a new split'
                             % (random_indices_file, max_length))
    else:
        idxs = list(range(max_length))
        random.Random(seed).shuffle(idxs)
        # No escapechar is passed: an empty string is rejected by the csv module
        # on recent Python versions.
        pd.DataFrame(idxs).to_csv(random_indices_file, index=False, header=False, quoting=csv.QUOTE_NONE)

    trn, vld, tst = df.iloc[idxs[:train_idx]], df.iloc[idxs[train_idx:valid_idx]], df.iloc[idxs[valid_idx:max_length]]
    trn.to_csv('%s/%s' % (DATA_PATH, train_name), index=False, columns=cols)
    vld.to_csv('%s/%s' % (DATA_PATH, valid_name), index=False, columns=cols)
    tst.to_csv('%s/%s' % (DATA_PATH, test_name), index=False, columns=cols)
    return trn, vld, tst

# Split Data on top_notes_labeled
# Uses the default file names and ratios of split_data (90% train; the rest split 70/30 into valid/test).
train_top, valid_top, test_top = split_data(notes_labeled_top)

In [None]:
#########################
# 2. W2V EMBEDDING MATRIX
# 2-1. W2V Matrix
#########################
PAD_CHAR = '*'

def create_embeddings(wv_file, out_file):
    """Export the vectors of a saved Word2Vec model to a text embedding file.

    Vocabulary words are sorted and numbered from 1; index 0 is left for the
    padding row that build_matrix adds.
    """
    wv = Word2Vec.load(wv_file).wv  # only the keyed vectors are kept
    sorted_words = sorted(set(wv.vocab))
    ind2w = dict(enumerate(sorted_words, start=1)) # e.g. {4: 'aabdominal', 5: 'aabsent'}
    Ws, words = build_matrix(ind2w, wv)
    save_embeddings(Ws, words, out_file)

def build_matrix(ind2w, wv):
    """Build the embedding matrix and the matching word list.

    Parameters
    ----------
    ind2w : dict of int -> str
        Row index to word; indices are expected to run from 1 to len(ind2w).
    wv : word-vector lookup
        Must provide `word_vec(word)` and `index2word`.

    Returns
    -------
    (numpy.ndarray, list of str)
        `Ws` has one row per index (row 0 is the all-zero padding row) and
        `words[i]` is the word of row i (`words[0]` is PAD_CHAR). A word
        without a vector keeps an all-zero row but still occupies its slot, so
        rows and words never go out of step.
    """
    length = len(wv.word_vec(wv.index2word[0])) # 100
    Ws = np.zeros((len(ind2w)+1, length)) # shape=(52515, 100); row 0 stays zero for padding
    words = [PAD_CHAR] * (len(ind2w) + 1)
    for idx, word in ind2w.items(): # e.g. idx=4, word='aabdominal'
        words[idx] = word
        try:
            Ws[idx][:] = wv.word_vec(word) # e.g. Ws[1][:] = [ 2.09741426  0.7055248 ...]
        except KeyError:
            # Unknown word: keep the zero row. Skipping the word here (as the old
            # bare `except` did) would shift every later word onto the wrong row.
            continue
    return Ws, words

def save_embeddings(Ws, words, out_file):
    """Write one line per word: the word followed by its space-separated vector.

    Row i of `Ws` is written next to `words[i]`.
    """
    with open(out_file, 'w', newline='') as handle:
        for row_idx, word in enumerate(words):
            fields = [word] + [str(value) for value in Ws[row_idx]]
            handle.write(" ".join(fields) + "\n") # e.g. abdominal -4.255998134613037 0.5219717621803284 ...

# Generate Embeddings by looking up vocab (on notes_labeled) using w2v (on notes_labeled)
# Reads DATA_PATH/w2v.model and writes DATA_PATH/w2v.embed (consumed later by load_embeddings).
create_embeddings('%s/%s' % (DATA_PATH, 'w2v.model'), '%s/%s' % (DATA_PATH, 'w2v.embed'))

In [None]:
#########################
# 3. DATASET VERIFICATION
# 3-1. Codes & Notes
#########################
# Be able to run all the sections below if all preprocessed data already exists
import os
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from gensim.models import Word2Vec

SEED = 1234
# NOTE: the interpreter reads PYTHONHASHSEED at start-up, so this assignment does not
# re-seed string hashing in the already-running kernel.
os.environ["PYTHONHASHSEED"] = str(SEED)
DATA_PATH = 'mimic-iii-1.4'
assert os.path.isdir(DATA_PATH)
DATA_FILE = 'notes_labeled.csv'

# Reload the artefacts written by section 1 so this section can run on its own.
codes = pd.read_csv('%s/%s' % (DATA_PATH, 'codes.csv'), usecols=['HADM_ID', 'ICD9_CODE'], dtype={'HADM_ID': np.uint64, 'ICD9_CODE': str})
notes = pd.read_csv('%s/%s' % (DATA_PATH, 'NOTEEVENTS.csv'), usecols=['HADM_ID', 'CATEGORY', 'TEXT'], dtype={'CATEGORY': str, 'TEXT': str})
notes_labeled = pd.read_csv('%s/%s' % (DATA_PATH, DATA_FILE),
                            usecols=['HADM_ID', 'LENGTH', 'TEXT', 'ICD9_CODE'],
                            dtype={'HADM_ID': np.uint64, 'LENGTH': np.uint64, 'TEXT': str, 'ICD9_CODE': str})
# Procedure codes are recognised purely by format: the '.' sits at string index 2.
pcodes = codes[codes.ICD9_CODE.str.find('.')==2]['ICD9_CODE'].tolist() # '.' appears at index 2 (decimal point between the 2nd and 3rd digit)

# 3-1-1.Check: Number of unique ICD9_CODE on codes
print('Number of unique ICD9_CODE : ' + str(len(codes['ICD9_CODE'].unique())) +
     ' (PROCEDURES_ICD=' + str(len(codes[codes.ICD9_CODE.isin(pcodes)]['ICD9_CODE'].unique())) +
     ', DIAGNOSES_ICD=' + str(len(codes[~codes.ICD9_CODE.isin(pcodes)]['ICD9_CODE'].unique())) + ')')

# 3-1-2.Check: Number of CATEGORY='Discharge summary' on notes
# This counts rows (individual notes), not distinct values.
print('Number of unique CATEGORY=Discharge summary : ' + str(len(notes[notes['CATEGORY']=='Discharge summary'])))

# 3-1-3.Check: Number of HADM_ID
print('Number of unique HADM_IDs : ' + str(len(notes_labeled['HADM_ID'].unique())))

# 3-1-4.Check: Labeled Notes
print('Some Samples of the notes_labeled :')
print(notes_labeled.head())

In [None]:
#########################
# 3-2. Top Codes & Vocab
#########################
# 3-2-1.Similar Words
def get_similar_words(model, word):
    """Return whatever `model.wv.most_similar(word)` yields for `word`."""
    return model.wv.most_similar(word)

# Reload the artefacts written by cells 1-3 to 1-5; here `codes_top` is a headerless one-column DataFrame.
codes_top = pd.read_csv('%s/%s' % (DATA_PATH, 'codes_top.csv'), header=None, dtype=str)
vocab = pd.read_csv('%s/%s' % (DATA_PATH, 'vocab.csv'), header=None, dtype=str)[0]
vocab_list = vocab.tolist()
w2v = Word2Vec.load('%s/%s' % (DATA_PATH, 'w2v.model'))

# 3-2-1.Check: Top 50 ICD9_CODE
print('Top 50 ICD9_CODE :')
print(str(codes_top))

# 3-2-2.Check: Vocabulary (cosine similarity) on trn
print('Vocabulary Size : ' + str(len(vocab_list)))
# NOTE(review): the slices below follow the line order of vocab.csv; whether that order
# reflects word frequency depends on how the vocabulary was written — confirm.
print('Top 20 frequent words :')
print(vocab_list[:20])
print('\nBottom 20 frequent words :')
print(vocab_list[-20:])
print('\nSimilar words to diabetic :')
# Each entry is indexed as (word, similarity score).
for word in get_similar_words(w2v, 'diabetic'):
    print("{0} {1:.3f}".format(word[0], word[1]))

In [None]:
#########################
# 4. DATASET
# 4-1. Custom Dataset
#########################
# Be able to run all the sections below if all preprocessed data already exists
import numpy as np
import csv
from gensim.models import Word2Vec
import numpy as np
DATA_PATH = 'mimic-iii-1.4'
PAD_CHAR = '*'

import torch  # needed by CustomDataset.__getitem__; imported here so this cell is self-contained
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset over a labeled-notes CSV with 'HADM_ID', 'TEXT' and 'ICD9_CODE' columns.

    Each item is a pair (token index tensor, multi-hot label tensor). The word
    lookup comes from DATA_PATH/vocab.csv (index 0 is padding) and the label
    lookup from DATA_PATH/codes_top.csv.
    """

    def __init__(self, filename):
        """Load every CSV row into memory and build the word/code lookups."""
        data = []
        # newline='' is what the csv module expects for files it parses.
        with open(filename, "r", newline='') as file:
            csv_reader = csv.DictReader(file, delimiter=',') # the 1st as the fieldnames
            for row in csv_reader:
                data.append(row)
        self.data = data
        self.idx2word, self.word2idx = self.load_lookup(f'{DATA_PATH}/vocab.csv', padding=True) # padding=True
        self.idx2code, self.code2idx = self.load_lookup(f'{DATA_PATH}/codes_top.csv')

    def load_file(self, filename):
        """Return the set of non-empty, whitespace-stripped lines of `filename`."""
        tokens = set()
        with open(filename, 'r') as vocabfile:
            for line in vocabfile:
                line = line.strip()
                if line != '':
                    tokens.add(line)
        return tokens

    # lookup for word or code (e.g. idx2token={1:'date'}, token2idx={'date':1})
    def load_lookup(self, filename, padding=False):
        """Build (idx2token, token2idx) from the sorted tokens of `filename`.

        With padding=True index 0 is reserved for PAD_CHAR and tokens start at 1.
        """
        tokens = self.load_file(filename)
        idx2token = {}
        if padding:  # padding at index 0
            idx2token[0] = PAD_CHAR
        for w in sorted(tokens):
            idx2token[len(idx2token)] = w
        token2idx = {w:i for i, w in idx2token.items()}
        return idx2token, token2idx

    # sequence=['date', 'birth', 'name'] -> token2idx={'date':1, 'birth':2} -> [1, 2, 0]
    def to_index(self, sequence, token2idx, skip_unknown=False):
        """Map tokens to indices.

        Unknown tokens become 0 (the padding index of the word lookup) unless
        skip_unknown is True, in which case they are dropped. Dropping is used
        for labels, where index 0 is a real code rather than padding.
        """
        ints = []
        for token in sequence:
            idx = token2idx.get(token)
            if idx is None:
                if skip_unknown:
                    continue
                idx = 0 # only meaningful for 'TEXT', where 0 is padding
            ints.append(idx)
        return ints # e.g. [0, 1, 2]

    def to_multi_hot(self, labels):
        """Return a 0/1 list with one slot per known code, set at the given indices."""
        multi_hot_labels = [0] * len(self.idx2code)
        for idx in labels:
            multi_hot_labels[idx] = 1
        return multi_hot_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return (LongTensor of word indices, FloatTensor multi-hot labels)."""
        data = self.data[index]
        text = data['TEXT'].split()
        text = self.to_index(text, self.word2idx)

        labels = data['ICD9_CODE'].split(';')
        # Codes missing from the lookup are skipped; mapping them to 0 would
        # wrongly mark the first code as present.
        labels = self.to_index(labels, self.code2idx, skip_unknown=True)
        labels = self.to_multi_hot(labels)
        return torch.tensor(text, dtype=torch.long), torch.tensor(labels, dtype=torch.float)

    def get_hadm_id(self, index):
        """Return the raw HADM_ID string of row `index`."""
        return self.data[index]['HADM_ID']

    def get_text(self, index):
        """Return the raw TEXT string of row `index`."""
        return self.data[index]['TEXT']

    def get_code(self, index):
        """Return the raw ';'-joined ICD9_CODE string of row `index`."""
        return self.data[index]['ICD9_CODE']

In [None]:
#########################
# 4-2. Dataloader
#########################
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
BATCH_SIZE = 32

# 4-2-1.Collate the list of samples into batches
def collate_fn(data):
    """Turn a list of (text, labels) samples into one padded batch.

    Texts are zero-padded to the longest sequence (batch first); labels are
    stacked along a new leading dimension.
    """
    texts, targets = zip(*data)
    padded_text = pad_sequence(list(texts), batch_first=True)
    stacked_labels = torch.stack(list(targets), dim=0)
    return padded_text, stacked_labels

# 4-2-2.Load Embeddings
def load_embeddings(embed_file):
    """Load a text embedding file from DATA_PATH into a normalised matrix.

    Each line is '<word> <v1> <v2> ...'. Every vector is scaled to (almost)
    unit L2 norm, and one extra random, normalised row is appended at the end
    for unknown words.

    Parameters
    ----------
    embed_file : str
        File name inside DATA_PATH.

    Returns
    -------
    numpy.ndarray
        Array with (number of lines + 1) rows.

    Raises
    ------
    ValueError
        If the file contains no embedding lines.
    """
    embed_file = '%s/%s' % (DATA_PATH, embed_file)
    W = []
    with open(embed_file) as f:
        for line in f:
            fields = line.rstrip().split()
            if not fields:
                continue  # tolerate blank lines
            # np.float64 replaces the removed `np.float` alias (same dtype).
            vec = np.array(fields[1:]).astype(np.float64) # 0.06135951727628708 0.008149753324687481 ... (e.g. index=0:aabdominal)
            vec = vec / float(np.linalg.norm(vec) + 1e-6) # normalize the embeddings
            W.append(vec)
    if not W:
        raise ValueError('no embeddings found in %s' % embed_file)
    # Add a vector of unknown gaussian embedding
    vec = np.random.randn(len(W[-1])) # len(W[-1]) = 100, e.g. array([-1.49401501, 1.00950034, ...])
    vec = vec / float(np.linalg.norm(vec) + 1e-6) # normalize (gaussian) the unknown (random) embedding
    W.append(vec)
    W = np.array(W)
    return W

# Datasets for the three splits written by split_data
train_set = CustomDataset(f'{DATA_PATH}/train_top.csv')
valid_set = CustomDataset(f'{DATA_PATH}/valid_top.csv')
test_set = CustomDataset(f'{DATA_PATH}/test_top.csv')
# Only the training loader shuffles; no loader is built for valid_set in this cell.
train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, collate_fn=collate_fn)

# 4-2-3.Check: Length of Split TOP Data
print('Length of Split TOP Data :')
for name, dataset in (('train', train_set), ('valid', valid_set), ('test', test_set)):
    print('{:s} {:d}'.format(name, len(dataset)))

In [None]:
#########################
# 5. TRAIN & EVALUATION
# 5-1. Evaluation
#########################
'''
# Module Installation
!{sys.executable} -m pip install sklearn
'''
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

# Device to Train on
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 5-1-1. Evaluate
def eval(model, test_loader):
    """Compute micro-averaged precision, recall, F-score and ROC-AUC.

    (The name shadows the builtin `eval`; it is kept because other cells call it.)

    Parameters
    ----------
    model : torch.nn.Module
        Returns per-label probabilities for a batch of token indices.
    test_loader : iterable
        Yields (sequences, labels) batches.

    Returns
    -------
    tuple of float
        (precision, recall, f_score, roc_auc), all micro-averaged, using a 0.5
        decision threshold.

    Notes
    -----
    Batches are moved to the model's device, predictions are gathered on the
    CPU for scikit-learn, no gradients are tracked, and the model's previous
    train/eval mode is restored before returning.
    """
    was_training = model.training
    model.eval()
    model_device = next(model.parameters()).device
    true_batches, prob_batches = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            output = model(sequences.to(model_device))
            true_batches.append(labels.detach().cpu())
            prob_batches.append(output.detach().cpu())
    # Restore the caller's mode so a surrounding training loop keeps dropout active.
    model.train(was_training)

    y_true = torch.cat(true_batches, dim=0).numpy().astype(int)
    y_prob = torch.cat(prob_batches, dim=0).numpy()
    y_pred = (y_prob > 0.5).astype(int)

    # Precision, Recall, Fb (more general F1-Score), RocAuc
    p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_prob, average='micro')
    return p, r, f, roc_auc

In [None]:
#########################
# 5-2. Train
#########################
import torch.optim as optim
import torch.nn as nn

# 5-2-1. Loss Function
criterion = nn.BCELoss() # =F.binary_cross_entropy(y_hat, labels)

# 5-2-2. Train
def train(model, train_loader, test_loader, n_epochs, lr=0.001, model_name='CAML'):
    """Train `model` with Adam and binary cross-entropy, evaluating after each epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Model producing per-label probabilities.
    train_loader : iterable
        Yields (sequences, labels) training batches.
    test_loader : iterable
        Batches passed to `eval` after every epoch (printed as 'Validation').
    n_epochs : int
        Number of passes over train_loader.
    lr : float
        Adam learning rate.
    model_name : str
        Label printed before training starts.
    """
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model_device = next(model.parameters()).device

    print('Model Name: ' + model_name)
    for epoch in range(n_epochs):
        # Re-enter training mode every epoch: evaluation switches the model to eval mode.
        model.train()
        train_loss = 0
        for sequences, labels in train_loader:
            # Tensor.to() returns a new tensor, so the result must be rebound.
            sequences = sequences.to(model_device)
            labels = labels.to(model_device)
            optimizer.zero_grad()
            y_hat = model(sequences)
            loss = criterion(y_hat, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss = train_loss / max(len(train_loader), 1)  # avoid dividing by zero on an empty loader
        print('Epoch: {} \t Training Loss: {:.6f}'.format(epoch+1, train_loss))
        p, r, f, roc_auc = eval(model, test_loader)
        print('Epoch: {} \t Validation p: {:.2f}, r: {:.2f}, f: {:.2f}, roc_auc: {:.2f}'.format(epoch+1, p, r, f, roc_auc))

In [None]:
#########################
# 6. CNN MODEL
# 6-1. CAML
#########################
from math import floor
from torch.nn.init import xavier_uniform_
import torch.nn.functional as F

# 6-1-1. CAML
class CAML(nn.Module):
    """Convolutional model with one attention vector per label.

    Pipeline: embedding (+ dropout) -> 1-D convolution (+ tanh) -> per-label
    attention over positions -> one sigmoid output per label.
    """

    def __init__(self, Y=50, w2v_model_name='w2v.embed', embed_size=100, num_filter_maps=60, kernel_size=18, dropout=0.2):
        super().__init__()

        # Embed layer, initialised from the saved embedding file
        pretrained = torch.Tensor(load_embeddings(w2v_model_name))
        vocab_rows, vector_dim = pretrained.shape
        self.embed = nn.Embedding(vocab_rows, vector_dim, padding_idx=0) # padding_idx=0
        self.embed.weight.data = pretrained.clone()
        self.embed_drop = nn.Dropout(p=dropout)

        # Conv layer (bias=True (default))
        self.conv = nn.Conv1d(embed_size, num_filter_maps, kernel_size=kernel_size, padding=kernel_size // 2)
        xavier_uniform_(self.conv.weight)

        # Context Vectors for Attention
        self.U = nn.Linear(num_filter_maps, Y)
        xavier_uniform_(self.U.weight)

        # Final layer: Create a Matrix to use for the L Binary Classifiers
        self.final = nn.Linear(num_filter_maps, Y)
        xavier_uniform_(self.final.weight)

    def forward_embed(self, text):
        """Look up embeddings and apply dropout."""
        return self.embed_drop(self.embed(text))

    def forward_conv(self, text):
        """Apply the convolution followed by tanh."""
        return torch.tanh(self.conv(text))

    def forward_calc_atten(self, text):
        """Score every position with each label vector and softmax over dim 2."""
        scores = self.U.weight.matmul(text)
        return F.softmax(scores, dim=2) # = normalized exponential function

    def forward_aply_atten(self, alpha, text):
        """Weight the features by the attention distribution."""
        return alpha.matmul(text)

    def forward_linear(self, v):
        """Per-label dot product with the final weights, plus bias, then sigmoid."""
        logits = (self.final.weight * v).sum(dim=2) + self.final.bias
        return torch.sigmoid(logits)

    def forward(self, text):
        # Get Embeddings and Apply Dropout
        embedded = self.forward_embed(text)

        # Apply Convolution and Nonlinearity (tanh)
        # (BATCH_SIZE, embed_size, seq_len) <- (BATCH_SIZE, seq_len, embed_size)
        features = self.forward_conv(embedded.transpose(1, 2))

        # Calculate Attention
        alpha = self.forward_calc_atten(features)

        # Apply Attention
        # (BATCH_SIZE, seq_len, num_filter_maps) <- (BATCH_SIZE, num_filter_maps, seq_len)
        v = self.forward_aply_atten(alpha, features.transpose(1, 2))

        # Final layer Classification
        return self.forward_linear(v)

In [None]:
#########################
# 7. RNN MODELs
# 7-1. LSTM / GRU
#########################
# 7-1-1. RNN
class RNN(nn.Module):
    """Recurrent classifier: embedding -> LSTM/GRU -> linear -> sigmoid.

    The last layer's final hidden state is used as the document representation.
    """

    def __init__(self, Y=50, w2v_model_name='w2v.embed', cell_type='lstm', embed_size=100, hidden_size=60, num_layers=1, dropout=0.2):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Embed layer, initialised from the saved embedding file
        W = torch.Tensor(load_embeddings(w2v_model_name))
        self.embed = nn.Embedding(num_embeddings=W.size()[0], embedding_dim=W.size()[1], padding_idx=0) # padding_idx=0
        self.embed.weight.data = W.clone()
        self.embed_drop = nn.Dropout(p=dropout)

        # Recurrent layer ('lstm' or 'gru'); any value other than 'lstm' selects the GRU
        self.cell_type = cell_type
        if self.cell_type == 'lstm':
            self.rnn = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers)
        else:
            self.rnn = nn.GRU(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers)

        # Final layer: Create a Matrix to use for the L Binary Classifiers
        self.linear = nn.Linear(in_features=hidden_size, out_features=Y)
        self.activation = nn.Sigmoid()

    def forward_embed(self, text):
        """Look up embeddings and apply dropout."""
        text = self.embed(text)
        text = self.embed_drop(text)
        return text

    def forward_recur(self, text, hiddens):
        """Run the recurrent layer and return the final hidden state h_n."""
        output, hiddens = self.rnn(text, hiddens) # last hidden: output[-1], hiddens[0] or hiddens
        # The LSTM returns (h_n, c_n); the GRU returns h_n only.
        return hiddens[0] if self.cell_type == 'lstm' else hiddens

    def forward_linear(self, text):
        """Project to label space and squash to probabilities."""
        text = self.linear(text)
        y_hat = self.activation(text) #.view(BATCH_SIZE, -1)
        return y_hat

    def forward(self, text):
        # Initialize Hidden
        hiddens = self.init_hidden(text.size(0))

        # Get Embeddings and Apply Dropout
        text = self.forward_embed(text)

        # Apply Recurrent NN
        text = text.transpose(0, 1) # (seq_len, BATCH_SIZE, embed_size) <- (BATCH_SIZE, seq_len, embed_size)
        hidden = self.forward_recur(text, hiddens) # (num_layers, BATCH_SIZE, hidden_size)

        # Final layer Classification
        y_hat = self.forward_linear(hidden[-1]) # (BATCH_SIZE, Y) <- (BATCH_SIZE, hidden_size)
        return y_hat

    def init_hidden(self, batch_size):
        """Return the all-zero initial state: (h0, c0) for LSTM, h0 for GRU.

        The zeros are allocated with the device and dtype of the model's own
        parameters, so the state matches a model that was moved with .to(),
        .cuda() or .double().
        """
        hidden = self.linear.weight.new_zeros(self.num_layers, batch_size, self.hidden_size)
        return (hidden, hidden) if self.cell_type == 'lstm' else hidden

In [None]:
#########################
# 8. Model Implementation
# 8-1. CAML Model
#########################
# Model Paths
CAML_PATH = '%s/%s' % (DATA_PATH, 'caml.model')
LSTM_PATH = '%s/%s' % (DATA_PATH, 'lstm.model')
GRU_PATH = '%s/%s' % (DATA_PATH, 'gru.model')

# Number of Epochs
n_epochs = 10

# NOTE(review): every train() call below receives `test_loader` as its evaluation data,
# so the per-epoch metrics that train() prints as "Validation" are computed on the test
# split; `valid_set` is never used for evaluation. Confirm whether that is intended.
# NOTE: torch.save(model, path) stores the whole model object, not just its state_dict.

# 8-1-1. Train CNN CAML Model
caml = CAML()
caml.to(device)
train(caml, train_loader, test_loader, n_epochs)
torch.save(caml, CAML_PATH)

# 8-1-2. Train RNN LSTM Model
lstm = RNN()  # cell_type defaults to 'lstm'
lstm.to(device)
train(lstm, train_loader, test_loader, n_epochs, lr=0.01, model_name='LSTM')
torch.save(lstm, LSTM_PATH)

# 8-1-3. Train RNN GRU Model
gru = RNN(cell_type='gru')
gru.to(device)
train(gru, train_loader, test_loader, n_epochs, lr=0.01, model_name='GRU')
torch.save(gru, GRU_PATH)

In [None]:
#########################
# 9. EXAMINATION
# 9-1. Model Examination
#########################
# Be able to run all the sections below if all preprocessed data and learned models already exist
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# None means "no truncation"; the former sentinel -1 is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# 9-1-1. Examine
def examine(model, test_loader, test_set, codes_top, filename):
    """Record per-sample predictions and per-code TP/FP/FN counts.

    Parameters
    ----------
    model : torch.nn.Module
        Trained model returning per-label probabilities.
    test_loader : iterable
        Yields (sequences, labels) batches in the same order as `test_set`
        (i.e. not shuffled).
    test_set : CustomDataset
        Provides get_hadm_id / get_text / get_code for each sample index.
    codes_top : list of str
        The label codes; their sorted order defines the label indices.
    filename : str
        Base name: '<filename>.csv' receives the per-sample rows and
        '<filename>_count.csv' the per-code counts (both inside DATA_PATH).

    Returns
    -------
    pandas.DataFrame
        Columns ['ICD9_CODE', 'TP', 'FP', 'FN'], one row per code in the order
        of `codes_top`.

    Notes
    -----
    The 'PRECISION' column holds the fraction of labels predicted correctly
    for the sample (the column name is kept for compatibility with later cells).
    """
    model.eval()
    model_device = next(model.parameters()).device
    codes_top_sorted = sorted(codes_top)
    columns = ['HADM_ID', 'ICD9_CODE', 'PRED_CODE', 'PRECISION', 'LENGTH', 'TEXT']
    code_order = list(dict.fromkeys(codes_top))  # unique codes, original order
    counts_tp = Counter(dict.fromkeys(code_order, 0))
    counts_fp = Counter(dict.fromkeys(code_order, 0))
    counts_fn = Counter(dict.fromkeys(code_order, 0))

    # Rows are collected in a list and turned into a frame once; appending to a
    # DataFrame per row is quadratic and unavailable in current pandas releases.
    sample_rows = []
    offset = 0  # dataset index of the first sample of the current batch
    with torch.no_grad():
        for sequences, labels in test_loader:
            output = model(sequences.to(model_device)).detach().cpu()
            y_true = labels.detach().cpu()
            y_pred = (output > 0.5).int()

            for j, pred in enumerate(y_pred):
                # Fraction of labels that match; computed in float to avoid integer division.
                precision = torch.eq(y_true[j].int(), pred).float().mean().item()
                idx = offset + j
                pred_positions = torch.where(pred > 0)[0].tolist()
                length = int((sequences[j] > 0).sum())  # number of non-zero token indices

                true_codes = sorted(test_set.get_code(idx).split(';'))
                pred_codes = sorted(codes_top_sorted[pos] for pos in pred_positions)
                sample_rows.append({'HADM_ID': int(test_set.get_hadm_id(idx)),
                                    'ICD9_CODE': ';'.join(true_codes),
                                    'PRED_CODE': ';'.join(pred_codes),
                                    'PRECISION': float(precision),
                                    'LENGTH': int(length),
                                    'TEXT': str(test_set.get_text(idx))})

                for pred_code in pred_codes:
                    if pred_code in true_codes:
                        counts_tp[pred_code] += 1 # TP
                    else:
                        counts_fp[pred_code] += 1 # FP
                for true_code in true_codes:
                    if true_code not in pred_codes:
                        counts_fn[true_code] += 1 # FN
            offset += len(y_pred)

    df0 = pd.DataFrame(sample_rows, columns=columns)

    cols = ['ICD9_CODE', 'TP', 'FP', 'FN']
    # Look counts up by code instead of zipping three dicts by position.
    df = pd.DataFrame(
        [{'ICD9_CODE': str(code), 'TP': int(counts_tp[code]), 'FP': int(counts_fp[code]), 'FN': int(counts_fn[code])}
         for code in code_order],
        columns=cols)

    # No escapechar is passed: an empty string is rejected by the csv module on
    # recent Python versions.
    df0.to_csv('%s/%s' % (DATA_PATH, filename + '.csv'), columns=columns, index=False, header=True, quoting=csv.QUOTE_NONE)
    df.to_csv('%s/%s' % (DATA_PATH, filename + '_count.csv'), columns=cols, index=False, header=True, quoting=csv.QUOTE_NONE)
    return df

# Test Dataset
codes_top = (pd.read_csv('%s/%s' % (DATA_PATH, 'codes_top.csv'), header=None, dtype=str)[0]).tolist()
xticks = np.arange(len(codes_top))  # one tick per top code (50 with the default TOP)

# Model Paths
CAML_PATH = '%s/%s' % (DATA_PATH, 'caml.model')
LSTM_PATH = '%s/%s' % (DATA_PATH, 'lstm.model')
GRU_PATH = '%s/%s' % (DATA_PATH, 'gru.model')


def plot_code_counts(df):
    """Plot the TP / FP / FN counts per ICD9_CODE of one examine() result."""
    figure(figsize=(15, 3), dpi=80)
    plt.plot('ICD9_CODE', 'TP', data=df, linewidth=2, color='green', markersize=4, marker='o', markerfacecolor='lime')
    plt.plot('ICD9_CODE', 'FP', data=df, linewidth=2, color='blue', markersize=4, marker='o', markerfacecolor='skyblue')
    plt.plot('ICD9_CODE', 'FN', data=df, linewidth=2, color='purple', markersize=4, marker='o', markerfacecolor='plum')
    plt.xlabel('ICD9_CODE')
    plt.ylabel('COUNT')
    plt.xticks(ticks=np.arange(len(df)), rotation=90)  # one tick per plotted code
    plt.legend()
    plt.show()


# Load Models
# NOTE: torch.load unpickles whole model objects here; only load files you created yourself.
# 9-1-1. CAML Examination
caml = torch.load(CAML_PATH)
caml.eval()
df_caml = examine(caml, test_loader, test_set, codes_top, 'test_stat_caml')
plot_code_counts(df_caml)

# 9-1-2. LSTM Examination
lstm = torch.load(LSTM_PATH)
lstm.eval()
df_lstm = examine(lstm, test_loader, test_set, codes_top, 'test_stat_lstm')
plot_code_counts(df_lstm)

# 9-1-3. GRU Examination
gru = torch.load(GRU_PATH)
gru.eval()
df_gru = examine(gru, test_loader, test_set, codes_top, 'test_stat_gru')
plot_code_counts(df_gru)

In [None]:
#########################
# 10. INVESTIGATION
# 10-1. Word Investigation
#########################
# 10-1-1. Word Investigation
def investigate(stat_caml, icd9_code, words, similar_fn=None):
    """Relate words (and their similar words) found in notes to one ICD-9 code.

    For every entry of ``words`` the word itself (similarity 1) followed by
    the candidates from ``similar_fn`` is searched as a literal substring of
    ``stat_caml['TEXT']``. A candidate that was already reported for an
    earlier word is skipped, so each SIMILAR value appears once.

    Parameters
    ----------
    stat_caml : pandas.DataFrame
        Must provide the columns 'TEXT', 'PRED_CODE' and 'ICD9_CODE'; the two
        code columns are read as ';'-separated code lists.
    icd9_code : str
        The code to look for in the predicted / original code lists.
    words : iterable of str
        Seed words to investigate.
    similar_fn : callable, optional
        ``similar_fn(word)`` returns an iterable of ``(token, similarity)``
        pairs. Defaults to ``get_similar_words(w2v, word)``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token with the columns WORD (seed word), SIMILAR
        (token searched), SIMILARITY (string, 3 decimals), BOTH (notes where
        the code is both predicted and original), PRED (predicted only) and
        ORIG (original only).
    """
    columns = ['WORD', 'SIMILAR', 'SIMILARITY', 'BOTH', 'PRED', 'ORIG']
    if similar_fn is None:
        lookup = lambda word: get_similar_words(w2v, word)
    else:
        lookup = similar_fn

    def _has_code(column):
        # Exact membership in the ';'-separated list; missing values become
        # the string 'nan' and therefore never match a real code.
        return stat_caml[column].astype(str).map(
            lambda codes: icd9_code in codes.split(';')).astype(bool)

    # Code membership does not depend on the word, so compute it only once.
    in_pred = _has_code('PRED_CODE')
    in_orig = _has_code('ICD9_CODE')

    records = []
    seen = set()
    for word in words:
        candidates = [(word, 1)] + list(lookup(word))
        for candidate in candidates:
            token = str(candidate[0])
            if token in seen:
                continue
            seen.add(token)
            # Literal match: tokens are plain words, not regular expressions;
            # notes without text simply do not match.
            hit = stat_caml['TEXT'].str.contains(token, regex=False, na=False)
            records.append({
                'WORD': str(word),
                'SIMILAR': token,
                'SIMILARITY': '{0:.3f}'.format(candidate[1]),
                'BOTH': int((hit & in_pred & in_orig).sum()),
                'PRED': int((hit & in_pred & ~in_orig).sum()),
                'ORIG': int((hit & ~in_pred & in_orig).sum()),
            })
    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=columns)

# 10-1-2. Load Stat
# The dtype map names exactly the columns selected in usecols.
stat_caml = pd.read_csv('%s/%s' % (DATA_PATH, 'test_stat_caml.csv'),
                        usecols=['HADM_ID', 'ICD9_CODE', 'PRED_CODE', 'PRECISION', 'LENGTH', 'TEXT'],
                        dtype={'HADM_ID': np.uint64, 'ICD9_CODE': str, 'PRED_CODE': str, 'PRECISION': np.float64, 'LENGTH': np.uint64, 'TEXT': str})

def _plot_word_counts(counts_df, ticks):
    """Draw the BOTH / PRED / ORIG counts per similar word.

    Parameters
    ----------
    counts_df : pandas.DataFrame
        Rows of an ``investigate`` result; must provide the columns
        'SIMILAR', 'BOTH', 'PRED' and 'ORIG'.
    ticks : array-like
        Tick positions for the x axis (forwarded to ``plt.xticks``).
    """
    figure(figsize=(10, 2), dpi=80)
    # (column, line colour, marker face colour) -- one line per count type.
    series_styles = (
        ('BOTH', 'olivedrab', 'greenyellow'),
        ('PRED', 'teal', 'aquamarine'),
        ('ORIG', 'rebeccapurple', 'mediumpurple'),
    )
    for column, line_color, face_color in series_styles:
        plt.plot('SIMILAR', column, data=counts_df, linewidth=2, color=line_color,
                 markersize=4, marker='o', markerfacecolor=face_color)
    plt.xlabel('SIMILAR')
    plt.ylabel('COUNT')
    plt.xticks(ticks=ticks, rotation=90)
    plt.legend()  # labels come from the column names passed as y
    plt.show()

# ICD9_CODE = 427.31
# Codes are written as string literals so they never depend on float repr.
icd9_code1 = '427.31'
words1 = ['atrial', 'fibrillation', 'heart']
print('ICD-9 CODE = ' + icd9_code1 + ' (Atrial Fibrillation related to heart disease)\n')
df1_ = investigate(stat_caml, icd9_code1, words1)
print(df1_)

# Keep only the words that occur with the code at least once.
df1 = df1_[(df1_['BOTH'] > 0) | (df1_['ORIG'] > 0) | (df1_['PRED'] > 0)]
xticks = np.arange(df1.shape[0])
_plot_word_counts(df1, xticks)

# ICD9_CODE = 99.04
icd9_code2 = '99.04'
words2 = ['transfusion', 'packed', 'red', 'blood', 'cell']
print('\nICD-9 CODE = ' + icd9_code2 + ' (Transfusion of Packed Cell, red blood cells that have been separated for blood transfusion)\n')
df2_ = investigate(stat_caml, icd9_code2, words2)
print(df2_)

# Keep only the words that occur with the code at least once.
df2 = df2_[(df2_['BOTH'] > 0) | (df2_['ORIG'] > 0) | (df2_['PRED'] > 0)]
xticks = np.arange(df2.shape[0])
_plot_word_counts(df2, xticks)

In [None]:
#########################
# 11. DETAIL
# 11-1. ICD9_CODE = 427.31
#########################
# 11-1-1. ICD9_CODE = 427.31
# Select test notes where 427.31 is both an original and a predicted code and
# whose text mentions every required word. Matching is literal
# (regex=False), so the '.' in the code is not a wildcard, and rows with a
# missing value (na=False) are treated as non-matching.
example_mask = (stat_caml['ICD9_CODE'].str.contains('427.31', regex=False, na=False) &
                stat_caml['PRED_CODE'].str.contains('427.31', regex=False, na=False))
for required_word in ('atrial', 'fibrillation', 'heart', 'respiratory'):
    example_mask &= stat_caml['TEXT'].str.contains(required_word, regex=False, na=False)
text_examples = stat_caml[example_mask]
#print(text_examples.sort_values(['LENGTH'], ascending=True))
# Show the single admission picked as the worked example.
print(text_examples[text_examples['HADM_ID'] == 155451])