## Model - RNN Based

In [205]:
import re
import os
import time
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
# from gensim.models import KeyedVectors
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset

import pickle as pkl
import copy

[nltk_data] Downloading package punkt to /Users/nhungle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Get Data

In [2]:
train_tokens = pkl.load(open(os.path.join("../data",
                                          "train_data_tokens.p"), "rb"))

val_tokens = pkl.load(open(os.path.join("../data",
                                          "val_data_tokens.p"), "rb"))

test_tokens = pkl.load(open(os.path.join("../data",
                                          "test_data_tokens.p"), "rb"))

all_tokens = pkl.load(open(os.path.join("../data",
                                          "all_data_tokens.p"), "rb"))

train_target = pkl.load(open(os.path.join("../data", "train_target.p"), "rb"))
val_target = pkl.load(open(os.path.join("../data", "val_target.p"), "rb"))
test_target = pkl.load(open(os.path.join("../data", "test_target.p"), "rb"))
label_mapping = pkl.load(open(os.path.join("../data", "target_mapping.p"), "rb"))

## Dictionary

In [3]:
class Dictionary(object):
    def __init__(self, all_tokens, max_vocab_size, PAD_IDX, UNK_IDX):
        res = self.buildVocab(all_tokens, max_vocab_size, PAD_IDX, UNK_IDX)
        # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
        self.id2token = res[1]
        # token2id: dictionary where keys represent tokens and corresponding values represent indices
        self.token2id = res[0]
    
    def buildVocab(self, all_tokens, max_vocab_size, PAD_IDX, UNK_IDX):
        token_counter = Counter(all_tokens)
        vocab, count = zip(*token_counter.most_common(max_vocab_size))
        id2token = list(vocab)
        token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
        id2token = ['<pad>', '<unk>'] + id2token
        token2id['<pad>'] = PAD_IDX 
        token2id['<unk>'] = UNK_IDX
        return token2id, id2token
    
    def __len__(self):
        return len(self.id2token)
 

In [4]:
from collections import Counter

max_vocab_size = 30000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1
corpus = Dictionary(all_tokens, max_vocab_size, PAD_IDX, UNK_IDX)
id2token = corpus.id2token
token2id = corpus.token2id

In [5]:
corpus.__len__()

30002

### Convert token (word) to ids

For each dataset, each sample of tokens (i.e., words) will be represented as index of that word in the dictionary

In [6]:
class Token2IndexDataset(object):
    def __init__(self, tokens_data):
        self.indices_data = self.token2index_dataset(tokens_data)
        
    def token2index_dataset(self, tokens_data):
        indices_data = []
        for tokens in tokens_data:
            index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
            indices_data.append(index_list)
        return indices_data

In [7]:
train_data_indices = Token2IndexDataset(train_tokens).indices_data
val_data_indices= Token2IndexDataset(val_tokens).indices_data
test_data_indices= Token2IndexDataset(test_tokens).indices_data

In [8]:
# Lets check the dictionary by loading random token from it
import random
random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 13024 ; token fev
Token fev; token id 13024


## Embedding

When using deep learning methods on NLP tasks, we usually utilize [word embedding](https://en.wikipedia.org/wiki/Word_embedding). To put it briefly, word embedding represent words, or tokens, in a vocabulary as a distributed numerical vector. There are a lot of methods to obtain a word embedding, with some of the most famous shallow models being Word2Vec, GloVe, and FastText while the deeper models are BERT, RoBERTa, T5. It is not difficult to find a general purpose word embedding trained by one of the aforementioned methods on the Internet that's been trained with a massive amount of data. It is usually a good idea to use these pre-trained embedding to save yourself some time and computing resource.

In [9]:
#! pip install gensim

In [10]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

In [11]:
_ = glove2word2vec(os.path.join("../data",'glove.6B.50d.txt'), 'tmp_file')
glove_embedding = KeyedVectors.load_word2vec_format('tmp_file')

In [12]:
# Double check the dimension is 50
len(glove_embedding['brain'])

50

#### Find similar words

The word embedding vectors can help us find words with similar meanings. Word similarities can be measured by [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). The function below looks up the most similar words to a given word:

In [13]:
glove_embedding.similar_by_word('surgery', topn=5)

[('underwent', 0.8644347190856934),
 ('arthroscopic', 0.8504809141159058),
 ('undergoing', 0.8430145382881165),
 ('reconstructive', 0.8339141607284546),
 ('surgeries', 0.8272889852523804)]

In [14]:
class Token2VectorDataset(object):
    def __init__(self, tokens_data, embedding,):
        self.tokens_data = tokens_data
        self.embedding = embedding
        self.UNK_IDX = UNK_IDX
        self.indices_data = self.token2vector_dataset()
        
    def token2vector_dataset(self):
        indices_data = []
        for tokens in self.tokens_data:
            index_list = [self.embedding[token] if token in self.embedding.vocab else UNK_IDX
                          for token in tokens]
            indices_data.append(index_list)
        return indices_data

In [15]:
train_data_vectors = Token2VectorDataset(train_tokens, glove_embedding).indices_data
val_data_vectors = Token2VectorDataset(val_tokens, glove_embedding).indices_data
test_data_vectors = Token2VectorDataset(test_tokens, glove_embedding).indices_data

### Difference between train_data_indices and train_data_vectors

Both train_data_indices and train_data_vectors have 1571 data points, each represent a sentence (or a text data)

However, for train_data_indices, each sample is a list of token, each token represents a word in the corpus.

For train_data_vectors, each sample is a list of arrays, each array is 50-dimension vector that represents the distance of that word in the embedding space 

## Data Loader

In [16]:

import numpy as np
import torch
from torch.utils.data import Dataset

class MedTranscriptDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]


### Decide MAX_SENTENCE_LENGTh

In [17]:
lengths = [len(i) for i in train_data_indices]
np.mean(lengths)

473.9898154042011

In [18]:
MAX_SENTENCE_LENGTH = 600

In [19]:
def medtranscript_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
#     batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    #print(data_list)
    #print(length_list)
    #print(label_list)
    return [torch.from_numpy(np.array(data_list)),
            torch.LongTensor(length_list),
            torch.LongTensor(label_list)]

In [20]:
BATCH_SIZE = 32

train_dataset = MedTranscriptDataset(train_data_indices, train_target)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=medtranscript_collate_func,
                                           shuffle=True)

val_dataset = MedTranscriptDataset(val_data_indices, val_target)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=medtranscript_collate_func,
                                           shuffle=True)

test_dataset = MedTranscriptDataset(test_data_indices, test_target)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=medtranscript_collate_func,
                                           shuffle=False)

In [21]:
data, lengths, labels = next(iter(test_loader))
print(data)
print(labels)

tensor([[   94,    70,  1720,  ...,    12,   795,   784],
        [   39, 26780,   725,  ...,     0,     0,     0],
        [   94,    70, 11367,  ...,     0,     0,     0],
        ...,
        [   94,    70,    21,  ...,   212,   870,    12],
        [ 2001,     5,    39,  ...,     0,     0,     0],
        [   39,   651,    18,  ...,    52,     3,    28]])
tensor([4, 4, 4, 4, 4, 2, 0, 0, 4, 4, 1, 0, 4, 0, 1, 1, 4, 4, 3, 2, 0, 3, 3, 0,
        1, 3, 4, 1, 1, 0, 0, 0])


## Model

For this challenge, we will be exploring two variants of RNN: vanilla (or Elman) RNN and LSTM (Long-short term memory).

- Each input word is represented by a vector of dimension ```embedding_dim```. Check out ```nn.Embedding``` to see how to initialize embeddings randomly.
- Your model should take the following input parameters
    - ```hidden_dim```: The number of features in the hidden state h of your RNN layer
    - ```output_dim```: Number of output classes
    - ```vocab_size``` Size of your vocabulary. 
    - ```embedding_dim```: Dimension of word embeddings
- Your model should consist of an RNN layer (you can use either ```nn.RNN``` or ```nn.LSTM```) followed by a linear layer.
- $h_{0}$ (and $c$ if you use LSTM) should be initialized as a zero vector of dimension ```hidden_dim```. You might want to check out ```nn.Parameter```

#### Investigating three layers: 
- Embedding
- RNN
- Fully Connected

In [22]:
x, x_len, y = next(iter(train_loader))

In [23]:
x.shape

torch.Size([32, 600])

In [24]:
hidden_dim=40
output_dim=5
vocab_size=len(corpus)
embedding_dim=50
rnn='RNN'

In [25]:
emb = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_IDX)
rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
fc = nn.Linear(hidden_dim, output_dim)

In [26]:
x_embed = emb(x)

In [27]:
x.shape

torch.Size([32, 600])

In [28]:
# emb_x: [batch_size, len_x, embedding_dim]
x_embed.shape

torch.Size([32, 600, 50])

In [29]:
x_packed = pack_padded_sequence(x_embed, x_len, batch_first=True, enforce_sorted=False)

In [30]:
#x_packed

In [31]:
output_packed, hidden = rnn(x_packed)

In [35]:
from torch.nn.utils.rnn import pad_packed_sequence
output_padded, output_lengths = pad_packed_sequence(output_packed, batch_first=True)

In [42]:
output_padded.shape

torch.Size([32, 600, 40])

In [43]:
# need to reshape before passing through fully connected layer
hidden = hidden.view(-1, hidden_dim)
output_padded = output_padded.view(-1, hidden_dim)

In [44]:
output_padded.shape

torch.Size([19200, 40])

In [46]:
hidden.shape

torch.Size([32, 40])

In [47]:
logits_output_padded = fc(output_padded)
logits_output_padded.shape

torch.Size([19200, 5])

In [49]:
logits_hidden = fc(hidden)
logits_hidden.shape

torch.Size([32, 5])

### RNN Model

In [50]:
embedding_dim=50
hidden_dim=40
output_dim=5
num_layers=1
rnn_dropout= 0.1
options = {
            'num_embeddings': len(corpus),
            'embedding_dim': embedding_dim,
            'padding_idx': PAD_IDX,
            'input_size': embedding_dim,
            'hidden_size': hidden_dim,
            'num_layers': num_layers,
            'rnn_dropout': rnn_dropout,
            'output_size': output_dim
        }


In [51]:
class RNNLanguageModel(nn.Module):
    """
    This model combines embedding, rnn and projection layer into a single model
    """
    def __init__(self, options):
        #optioins: a dictionary with key = argument, value = value of that argument
        super().__init__()
        
        # create each LM part here 
        self.lookup = nn.Embedding(num_embeddings=options['num_embeddings'], 
                                   embedding_dim=options['embedding_dim'], 
                                   padding_idx=options['padding_idx'])
        self.rnn = nn.RNN(options['input_size'],
                          options['hidden_size'],
                          options['num_layers'],
                          dropout=options['rnn_dropout'],
                          batch_first=True)
#           If we want to predict the next word in the context, then
#           we want output to have options['num_embeddings']
#         self.projection = nn.Linear(options['hidden_size'],
#                                     options['num_embeddings'])
        self.projection = nn.Linear(options['hidden_size'],
                                    options['output_size'])

        
    def forward(self, x, x_len):
        """
        Forward method process the input from token ids to logits
        """
        x_embed = self.lookup(x) 
        #rnn gives u all outputs / hidden as it has so far.
        x_packed = pack_padded_sequence(x_embed,
                                        x_len,
                                        batch_first=True,
                                        enforce_sorted=False)
        output, hidden = self.rnn(x_packed) #
#         output = output.reshape(-1, options['hidden_size'])
        hidden = hidden.view(-1, options['hidden_size'])
        logits = self.projection(hidden) 
        #rnn_outpus[0] = size of vocab, as we want to predict this specific word given the chain of words
        
        return logits

In [52]:
current_device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu'

In [53]:
rnn_model = RNNLanguageModel(options).to(current_device)

#criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# When we may want to think about the sum loss
#criterion = nn.CrossEntropyLoss(ignore_index=wiki_dict.get_id('<pad>'), reduction='sum')

# model_parameters = [p for p in rnn_model.parameters() if p.requires_grad]
# optimizer = torch.optim.Adam(model_parameters, lr=lr)

  "num_layers={}".format(dropout, num_layers))


### Test steps

In [54]:
x, x_len, y = next(iter(train_loader))
x, y = x.to(current_device), y.to(current_device)

In [55]:
logits = rnn_model(x, x_len)

In [56]:
logits.shape

torch.Size([32, 5])

In [57]:
loss_fn = nn.CrossEntropyLoss()
outputs = rnn_model(x, x_len)
loss = loss_fn(outputs.squeeze(), labels)

In [58]:
loss

tensor(1.6525, grad_fn=<NllLossBackward>)

In [59]:
outputs.squeeze().shape

torch.Size([32, 5])

In [60]:
outputs.shape

torch.Size([32, 5])

In [61]:
labels

tensor([4, 4, 4, 4, 4, 2, 0, 0, 4, 4, 1, 0, 4, 0, 1, 1, 4, 4, 3, 2, 0, 3, 3, 0,
        1, 3, 4, 1, 1, 0, 0, 0])

In [62]:
pred = outputs.data.max(-1)[1]
pred

tensor([3, 1, 4, 2, 2, 1, 4, 0, 1, 0, 2, 3, 3, 1, 4, 4, 3, 1, 1, 4, 3, 2, 2, 1,
        4, 3, 3, 0, 4, 1, 4, 0])

In [64]:
predictions = list(pred.cpu().numpy())
truths = list(labels.cpu().numpy())

# Train

In [65]:
def train(model,
          train_loader=train_loader,
          test_loader=val_loader, 
          learning_rate=0.001,
          num_epoch=1,
          print_every=100,
          device=current_device):
    
    # Define the best weights
    best_val_loss = np.inf
    best_model_state_dict = model.state_dict()
    val_losses = []
    train_losses = []
    
    # Training steps
    start_time = time.time()
    loss_fn = nn.CrossEntropyLoss()
    model_parameters = [p for p in rnn_model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(model_parameters,
                                 lr=learning_rate,
                                 weight_decay=10**(-5))
    #optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=10**(-5))
    for epoch in range(num_epoch):
        model.train()
        train_loss = []
        
        for i, (data, data_len, labels) in enumerate(train_loader):
            data, data_len, labels = data.to(device), data_len.to(device), labels.to(device)
            outputs = model(data, data_len)
            model.zero_grad()
            loss = loss_fn(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())

             # report performance
            if (i + 1) % print_every == 0:
                print('Train set | epoch: {:3d} | {:6d}/{:6d} batches | Loss: {:6.4f}'.format(
                    epoch, i + 1, len(train_loader), loss.item()))
                
        train_losses.append(sum(train_loss) / len(train_loss)) 
    
        # Evaluate after every epoch
        correct = 0
        total = 0
        model.eval()

        predictions = []
        truths = []
        valid_losses = []

        with torch.no_grad():
            for i, (data, data_len, labels) in enumerate(test_loader):
                data, data_len, labels = data.to(device), data_len.to(device), labels.to(device)
                outputs = model(data, data_len)
                pred = outputs.data.max(-1)[1]
                predictions += list(pred.cpu().numpy())
                truths += list(labels.cpu().numpy())
                total += labels.size(0)
                correct += (pred == labels).sum()
                loss = loss_fn(outputs.squeeze(), labels)
                valid_losses.append(loss.item())
               
            correct = correct.item()
            acc = (100 * correct / total)
#             auc = roc_auc_score(truths, predictions)
            avg_val_loss = sum(valid_losses) / len(valid_losses)
            elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
            print('Val set | Val Loss: {:6.4f} | Accuracy: {:6.4f} | time elapse: {:>9}'.format(
                avg_val_loss, acc, elapse))

            
            if avg_val_loss <= best_val_loss:
                best_val_loss = avg_val_loss
                best_model_state_dict = copy.deepcopy(model.state_dict())
                # model.state_dict()
                #copy.deepcopy(self.model.state_dict())

        val_losses.append(avg_val_loss)
        
    return train_losses, val_losses, best_val_loss, best_model_state_dict

In [67]:
train_losses, val_losses, best_val_loss, best_model_state_dict = train(model=rnn_model,
      train_loader=train_loader,
      test_loader=val_loader, 
      learning_rate=0.005,
      num_epoch=10,
      print_every=100)

Val set | Val Loss: 1.2406 | Accuracy: 63.3308 | time elapse:  00:00:19
Val set | Val Loss: 1.2469 | Accuracy: 63.4072 | time elapse:  00:00:38
Val set | Val Loss: 1.2596 | Accuracy: 63.1780 | time elapse:  00:00:58
Val set | Val Loss: 1.2785 | Accuracy: 62.7960 | time elapse:  00:01:17
Val set | Val Loss: 1.2889 | Accuracy: 63.4072 | time elapse:  00:01:37
Val set | Val Loss: 1.3024 | Accuracy: 63.2544 | time elapse:  00:01:55
Val set | Val Loss: 1.3358 | Accuracy: 62.4141 | time elapse:  00:02:14
Val set | Val Loss: 1.3418 | Accuracy: 63.0252 | time elapse:  00:02:34
Val set | Val Loss: 1.3642 | Accuracy: 63.2544 | time elapse:  00:02:53
Val set | Val Loss: 1.3804 | Accuracy: 63.1780 | time elapse:  00:03:14


## RNN model with pretrained-embedded layer

### Get pretrained_embedded data_vectors

In [117]:
class Token2VectorDataset(object):
    def __init__(self, tokens_data, embedding,):
        self.tokens_data = tokens_data
        self.embedding = embedding
        self.UNK_IDX = UNK_IDX
        self.indices_data = self.token2vector_dataset()
        
    def token2vector_dataset(self):
        indices_data = []
        for tokens in self.tokens_data:
            index_list = [self.embedding[token] if token in self.embedding.vocab else UNK_IDX
                          for token in tokens]
            indices_data.append(index_list)
        return indices_data
train_data_vectors = Token2VectorDataset(train_tokens, glove_embedding).indices_data
val_data_vectors = Token2VectorDataset(val_tokens, glove_embedding).indices_data
test_data_vectors = Token2VectorDataset(test_tokens, glove_embedding).indices_data

In [240]:
class MedTranscriptDataset_Glove(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, tokens_data, target_list, embedding=glove_embedding):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.tokens_data = tokens_data
        self.target_list = target_list
        assert (len(self.tokens_data) == len(self.target_list))
        self.embedding = embedding
        self.UNK_IDX = UNK_IDX

    def __len__(self):
        return len(self.target_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        sentence = self.tokens_data[key]
        embedded_token_idx = [self.embedding[token] for token in sentence
                              if token in self.embedding.vocab]
        label = self.target_list[key]
        embedded_token_idx_tensor = torch.from_numpy(np.array(embedded_token_idx))
        return [embedded_token_idx_tensor, label]
    
def pad_collate_glove(batch):
    batch = filter(lambda x:x is not None, batch)
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)

    return xx_pad, torch.as_tensor(x_lens), torch.LongTensor(yy)

In [241]:
def generate_dataset(tokens, target, embedding=glove_embedding, BATCH_SIZE=32):
    dataset = MedTranscriptDataset_Glove(tokens,
                                     target)
    dataloader_embedded = torch.utils.data.DataLoader(
                                               dataset=dataset, 
                                               batch_size=BATCH_SIZE,
                                               collate_fn=pad_collate_glove,
                                               shuffle=True)
    
    return dataset, dataloader_embedded

In [246]:
train_dataset, train_dataloader_embedded = generate_dataset(train_tokens, train_target)

val_dataset, val_dataloader_embedded = generate_dataset(val_tokens, val_target)

test_dataset, test_dataloader_embedded = generate_dataset(test_tokens, test_target)