# ENSEMBLE RNN for BIO Classification

## Data Prep and Torchtext.legacy

We begin by importing the needed python packages. Since the data is in CoNLL format, we can conveniently load the data with torchtext.

In [51]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import copy 
from tqdm import tqdm
from torchtext.legacy import data, datasets

import spacy
import numpy as np

import time
import random 

#importing custom helper functions
from helpers import *

#Seed setting over all possible random components.
seed_everything()

In [52]:
TEXT = data.Field(lower = False) #depending on case sensative or not 
LABELS = data.Field(unk_token = None)

train_data = datasets.SequenceTaggingDataset(
                path='./data/NERdata/train.tsv',
                fields=[('text', TEXT),
                        ('labels', LABELS)])   

valid_data = datasets.SequenceTaggingDataset(
                path='./data/NERdata/test.tsv',
                fields=[('text',  TEXT),
                        ('labels', LABELS)])

fields = (("text", TEXT), ("labels", LABELS))

## Exploratory Data Analysis


The first example is displayed below. 

In [53]:
print(vars(train_data.examples[0]))

{'text': ['Identification', 'of', 'APC2', ',', 'a', 'homologue', 'of', 'the', 'adenomatous', 'polyposis', 'coli', 'tumour', 'suppressor', '.'], 'labels': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'O', 'O']}


What are the respective sizes of the train and test data? 

In [54]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")

Number of training examples: 5424
Number of validation examples: 940


## Building the dictionary and assignment embeddings (Downloading of the embedding will take a bit of time.) 

For this particular part, we have built a library that associates to each word a numerical value. Then once that is complete, we then assign a word embedding to it with the Glove embedding library. Embedding words gives us a head start when it comes to training our models. By assigning word embeddings we are able to learn at much quicker rate.

In [55]:
#builds a vocab library for the training data. 
TEXT.build_vocab(train_data, 
                 vectors = "glove.6B.300d",
                 unk_init = torch.Tensor.normal_)

#builds a vocab library for the labels 
LABELS.build_vocab(train_data)

These are the sizes for each respective vocab. The additional term in the LABEL dictionary is the padding term. The padding term is important because it will help us later on when we work with sentences with non-uniform length. 

In [56]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABELS vocabulary: {len(LABELS.vocab)}")
print(f"LABELS vocabulary: {LABELS.vocab.itos}")

Unique tokens in TEXT vocabulary: 9286
Unique tokens in LABELS vocabulary: 4
LABELS vocabulary: ['<pad>', 'O', 'I', 'B']


# Baseline Checking
We need to establish a baseline that we wish to overcome. So we look at the percentages of each respective class and calculate the accuracy based on if our model purely guessed. Since the classes are obviously unbalanced we have got to do weighted guessing.

In [57]:
print("Tag\t\tCount\t\tPercentage\n")
percentage = 0 
for tag, count, percent in tag_percentage(LABELS.vocab.freqs.most_common()):
    print(f"{tag}\t\t{count}\t\t{percent*100:4.1f}%") 
    percentage+=(percent)**2
print(f"The baseline to beat is therefore : {round(percentage,4)*100}%")

Tag		Count		Percentage

O		124452		91.7%
I		6115		 4.5%
B		5134		 3.8%
The baseline to beat is therefore : 84.45%


## Construct an iterator with BucketIterator

Since the sentences vary in all sorts of lengths, padding can get out of control. Hence, we elect to use BucketIterator to load sentences of similar length in order make for a more efficient training process. 

In [58]:
BATCH_SIZE = 128

#assigns the devices for which there will be training on. 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#the iterator to batch cycle through the dataset 
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    device = device)

## Defining the GRU module

We first start off with a modified GRU module. In particular, we opted for the inclusion of a linear layer post embedding. This is often a reccommended trick to ensure that the model copes with the word embedding well. 

The reason that we are starting out with GRU is because it much lighter to train than LSTM and Transformer models. By doing this, it allows us to better save on time and cost. This allows us to focus on other things like experimenting with different architectures, training routines and so forth.



In [59]:
class GRUIOB(nn.Module):
    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers,
                 bidirectional, 
                 dropout, 
                 pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        
        self.gru = nn.GRU(embedding_dim, 
                        hidden_dim, 
                        num_layers = n_layers, 
                        bidirectional = bidirectional, 
                        bias = True,
                        dropout = dropout if n_layers > 1 else 0)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        #the times 2 addresses the doubling in 
        
        #post embedding fully connected layer
        self.emfc = nn.Linear(embedding_dim, embedding_dim)
        
        #layernorm out of interest 
        self.layer_norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)

        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        #embedded = [sent len, batch size, emb dim]
        
        # coded included for trials with layer norming
        # embedded = self.layer_norm(embedded.permute(1, 0, 2))
        # embedded = embedded.permute(1, 0, 2) 
        # # #layer_normie takes in batch first so we gotta remember to swap it
        
        post_emb = self.emfc(embedded) 
        #feed forward connection 
            
        outputs, hidden= self.gru(post_emb)
        #output = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hidden dim]
    
        predictions = self.fc(self.dropout(outputs))
        #predictions = [sent len, batch size, output dim]
        #outputs the final layer output for each word of the sentence 
        #does NOT include softmax. That is handled by the criterion algorithm
        return predictions

# Training
In terms of the training, in order to counter the problem of exploding gradients, we also included gradient clipping with in the RNN training algorithms. The cut off value is 0.3. 


## Initialisation - GRU

Here we intialise the Bidirectional GRU with 32 embedding dimensions with 3 stacked layers. This design decision was based off several trials that were conducted with varying degrees of hidden dimensions and stacked layers. The bidirectional was chosen because we would like to incorporate information from all points of the sentence. Intuitively, the entity of a particular word is likely influenced by words at all points of the sentence.  


In [60]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 32
OUTPUT_DIM = len(LABELS.vocab)
N_LAYERS = 2 #number of stacked layers 2 to 3 is pretty good 
BIDIRECTIONAL = True #we elect to use a bidirectional version 
DROPOUT = 0.25
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 40

model = GRUIOB(INPUT_DIM, 
                EMBEDDING_DIM, 
                HIDDEN_DIM, 
                OUTPUT_DIM, 
                N_LAYERS, 
                BIDIRECTIONAL, 
                DROPOUT, 
                PAD_IDX)

#define model intializer 
def initialiser(model): 
    #normal initialisation 
    def init_weights(m):
        for name, param in m.named_parameters():
            nn.init.normal_(param.data, mean = 0, std = 0.1)

    #applying the normal weights        
    model.apply(init_weights)

    #embeddding initialisation
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings) 

    #pad token embedding to zero vector
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

initialiser(model)

#optimiser 
optimizer = optim.Adam(model.parameters()) 

#ensuring that the pad tokens are ignored during the cross entropy calculation 
TAG_PAD_IDX = LABELS.vocab.stoi[LABELS.pad_token]

#criterion 
criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

#sends the models to the GPU 
model = model.to(device)
criterion = criterion.to(device)

## Main Training Phase

In [61]:
#import necessary training functions
from helpers import train, evaluate, epoch_time, count_parameters

best_valid_loss = float('inf')
start_time = time.time() 
loop =  tqdm(range(N_EPOCHS),leave=False)
for epoch in loop:
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1.1-model.pt')

    loop.set_description(f"Epoch [{epoch+1}/{N_EPOCHS}]")
    loop.set_postfix(train_loss=train_loss, valid_loss=valid_loss)

end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print("GRU Model Performance:")
print(f'Total Epochs: {epoch+1:02}  | Totals Time: {epoch_mins}m {epoch_secs}s')
print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
print(f'Val. Loss: {valid_loss:.3f}  |  Val. Acc: {valid_acc*100:.2f}%')
print(f'The model has {count_parameters(model):,} trainable parameters')

#Model assignment for ensemble  
model_GRU = copy.deepcopy(model)

                                                                                                    

GRU Model Performance:
Total Epochs: 40  | Totals Time: 0m 33s
Train Loss: 0.006 | Train Acc: 99.80%
Val. Loss: 0.172  |  Val. Acc: 97.42%
The model has 2,959,304 trainable parameters




We see here is that the performance is quite good with ~97 percent accuracy in the test set. Comparing this with our initial baseline of 84.45%, this indicates that our model is actually meaningful. 

In [62]:
class RNNIOB(nn.Module):
    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers,
                 bidirectional, 
                 dropout, 
                 pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.RNN(embedding_dim, 
                        hidden_dim, 
                        num_layers = n_layers, 
                        bidirectional = bidirectional, 
                        bias = True,
                        dropout = dropout if n_layers > 1 else 0, 
                        nonlinearity = 'relu')
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        #the times 2 addresses the doubling in 
        
        #post embedding fully connected layer
        self.emfc = nn.Linear(embedding_dim, embedding_dim)

        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        #embedded = [sent len, batch size, emb dim]

        post_emb = self.emfc(embedded) 
        #feed forward connection post embedding
            
        outputs, hidden= self.rnn(post_emb)
        #output = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hidden dim]
    
        predictions = self.fc(self.dropout(outputs))
        #predictions = [sent len, batch size, output dim]
        #outputs the final layer output for each word of the sentence 
        #does NOT include softmax. That is handled by the criterion algorithm
        return predictions

#KEEP THE INITIALISATION FROM BEFORE. 

model = RNNIOB(INPUT_DIM, 
                EMBEDDING_DIM, 
                HIDDEN_DIM, 
                OUTPUT_DIM, 
                N_LAYERS, 
                BIDIRECTIONAL, 
                DROPOUT, 
                PAD_IDX)

#intialiser
initialiser(model)

#optimiser 
optimizer = optim.Adam(model.parameters()) 

#ensuring that the pad tokens are ignored during the cross entropy calculation 
TAG_PAD_IDX = LABELS.vocab.stoi[LABELS.pad_token]

#criterion 
criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

#sends the models to the GPU 
model = model.to(device)
criterion = criterion.to(device)

#main training lopp 
best_valid_loss = float('inf')
start_time = time.time() 
loop =  tqdm(range(N_EPOCHS),leave=False)
for epoch in loop:
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2.1-model.pt')

    loop.set_description(f"Epoch [{epoch+1}/{N_EPOCHS}]")
    loop.set_postfix(train_loss=train_loss, valid_loss=valid_loss)
    
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print("RNN Model Performance")
print(f'Total Epochs: {epoch+1:02}  | Totals Time: {epoch_mins}m {epoch_secs}s')
print(f'Train Loss: {train_loss:.3f}| Train Acc: {train_acc*100:.2f}%')
print(f'Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
print(f'The model has {count_parameters(model):,} trainable parameters') 

#Model assignment for ensemble  
model_RNN = copy.deepcopy(model)

                                                                                                    

RNN Model Performance
Total Epochs: 40  | Totals Time: 0m 33s
Train Loss: 0.009| Train Acc: 99.70%
Val. Loss: 0.199 |  Val. Acc: 97.24%
The model has 2,904,008 trainable parameters




Simply with the inclusion of relu activations in the RNN, we are getting comparable results to GRUs. ~97 accuracy. 

# LSTM 
Now we try a bidirectional LSTM with 3 stacked layers with hidden state embeddings that have dimension 32. 

In [63]:
class LSTMIOB(nn.Module):
    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers, 
                 bidirectional, 
                 dropout, 
                 pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            num_layers = n_layers, 
                            bidirectional = bidirectional,
                            dropout = dropout if n_layers > 1 else 0)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        #the times 2 addresses the doubling in 

        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        #embedded = [sent len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.lstm(embedded)
        #output = [sent len, batch size, hid dim * n directions]
        #hidden/cell = [n layers * n directions, batch size, hidden dim]
        
        predictions = self.fc(self.dropout(outputs))
        #predictions = [sent len, batch size, output dim]
        
        return predictions

model = LSTMIOB(INPUT_DIM, 
                EMBEDDING_DIM, 
                HIDDEN_DIM, 
                OUTPUT_DIM, 
                N_LAYERS, 
                BIDIRECTIONAL, 
                DROPOUT, 
                PAD_IDX)

##intialiser
initialiser(model)

#optimiser 
optimizer = optim.Adam(model.parameters()) 

#ensuring that the pad tokens are ignored during the cross entropy calculation 
TAG_PAD_IDX = LABELS.vocab.stoi[LABELS.pad_token]

#criterion 
criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

#sends the models to the GPU 
model = model.to(device)
criterion = criterion.to(device)

#main training loop
best_valid_loss = float('inf')
start_time = time.time() 
loop =  tqdm(range(N_EPOCHS),leave=False)
for epoch in loop:
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3.1-model.pt')

    loop.set_description(f"Epoch [{epoch+1}/{N_EPOCHS}]")
    loop.set_postfix(train_loss=train_loss, valid_loss=valid_loss)

end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print("LSTM Model Performance")
print(f'Total Epochs: {epoch+1:02}  | Totals Time: {epoch_mins}m {epoch_secs}s')
print(f'Train Loss: {train_loss:.3f}| Train Acc: {train_acc*100:.2f}%')
print(f'Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
print(f'The model has {count_parameters(model):,} trainable parameters')

#Model assignment for ensemble 
model_LSTM = copy.deepcopy(model)

                                                                                                    

LSTM Model Performance
Total Epochs: 40  | Totals Time: 0m 32s
Train Loss: 0.004| Train Acc: 99.88%
Val. Loss: 0.205 |  Val. Acc: 97.14%
The model has 2,896,652 trainable parameters




The same story is told with the lstm model compared to the GRU. They are modelled very similarly. 

# Ensemble Methods

Since these models are so great, we can look to combine them together to form an even more powerful model.

In [64]:
class MyEnsemble(nn.Module):
    def __init__(self, modelA, modelB, modelC, output_dim):
        super(MyEnsemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.modelC = modelC
        self.output = nn.Linear(output_dim*3, output_dim)
    def forward(self, text):
        x1 = self.modelA(text)
        x2 = self.modelB(text)
        x3 = self.modelC(text)
        x = torch.cat((x1, x2, x3), dim=2)  
        final = self.output(x)
        return final

#freeze models 
for param in model_GRU.parameters():
    param.requires_grad = False

for param in model_LSTM.parameters():
    param.requires_grad = False

for param in model_RNN.parameters():
    param.requires_grad = False

#intialise model 
model = MyEnsemble(model_GRU,model_RNN, model_LSTM ,OUTPUT_DIM)

#optimiser 
optimizer = optim.Adam(model.parameters()) 

#ensuring that the pad tokens are ignored during the cross entropy calculation 
TAG_PAD_IDX = LABELS.vocab.stoi[LABELS.pad_token]

#criterion 
criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

#sends the models to the GPU 
model = model.to(device)
criterion = criterion.to(device)

N_EPOCHS = 40

best_valid_loss = float('inf')
start_time = time.time() 
loop =  tqdm(range(N_EPOCHS),leave=False)
for epoch in loop:
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4.1-model.pt')

    loop.set_description(f"Epoch [{epoch+1}/{N_EPOCHS}]")
    loop.set_postfix(train_loss=train_loss, valid_loss=valid_loss)

end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)

print(f'Total Epochs: {epoch+1:02}  | Totals Time: {epoch_mins}m {epoch_secs}s')
print(f'Train Loss: {train_loss:.3f}| Train Acc: {train_acc*100:.2f}%')
print(f'Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
print(f'The model has {count_parameters(model):,} trainable parameters (linear layer trained only)')

                                                                                                    

Total Epochs: 40  | Totals Time: 0m 40s
Train Loss: 0.005| Train Acc: 99.89%
Val. Loss: 0.175 |  Val. Acc: 97.31%
The model has 52 trainable parameters (linear layer trained only)




We see that the ensemble comes out as one of the better models. However, it could be debated that for simplicity sake, it would have been better to train either the LSTM or GRUs strictly instead. 

# Sample checking for the training data

In [65]:
example_index = 1
sentence = vars(train_data.examples[example_index])['text']
actual_tags = vars(train_data.examples[example_index])['labels'] 
tokens, pred_tags, unks = tag_sentence(model, 
                                       device, 
                                       sentence, 
                                       TEXT, 
                                       LABELS)

print("Pred. Tag\tActual Tag\tCorrect?\tToken\n")

for token, pred_tag, actual_tag in zip(tokens, pred_tags, actual_tags):
    correct = '✔' if pred_tag == actual_tag else '✘'
    print(f"{pred_tag}\t\t{actual_tag}\t\t{correct}\t\t{token}")

Pred. Tag	Actual Tag	Correct?	Token

O		O		✔		The
B		B		✔		adenomatous
I		I		✔		polyposis
I		I		✔		coli
I		I		✔		(
I		I		✔		APC
I		I		✔		)
I		I		✔		tumour
O		O		✔		-
O		O		✔		suppressor
O		O		✔		protein
O		O		✔		controls
O		O		✔		the
O		O		✔		Wnt
O		O		✔		signalling
O		O		✔		pathway
O		O		✔		by
O		O		✔		forming
O		O		✔		a
O		O		✔		complex
O		O		✔		with
O		O		✔		glycogen
O		O		✔		synthase
O		O		✔		kinase
O		O		✔		3beta
O		O		✔		(
O		O		✔		GSK
O		O		✔		-
O		O		✔		3beta
O		O		✔		)
O		O		✔		,
O		O		✔		axin
O		O		✔		/
O		O		✔		conductin
O		O		✔		and
O		O		✔		betacatenin
O		O		✔		.


The results are great considering the perfect matches across the table.

# Sample checking for the test data

In [66]:
example_index = 1
sentence = vars(valid_data.examples[example_index])['text']
actual_tags = vars(valid_data.examples[example_index])['labels'] 
tokens, pred_tags, unks = tag_sentence(model, 
                                       device, 
                                       sentence, 
                                       TEXT, 
                                       LABELS)

print("Pred. Tag\tActual Tag\tCorrect?\tToken\n")

for token, pred_tag, actual_tag in zip(tokens, pred_tags, actual_tags):
    correct = '✔' if pred_tag == actual_tag else '✘'
    print(f"{pred_tag}\t\t{actual_tag}\t\t{correct}\t\t{token}")

Pred. Tag	Actual Tag	Correct?	Token

B		B		✔		Ataxia
I		I		✔		-
I		I		✔		telangiectasia
O		O		✔		(
B		B		✔		A
I		I		✔		-
I		I		✔		T
O		O		✔		)
O		O		✔		is
O		O		✔		a
B		B		✔		recessive
I		I		✔		multi
I		I		✔		-
I		I		✔		system
I		I		✔		disorder
O		O		✔		caused
O		O		✔		by
O		O		✔		mutations
O		O		✔		in
O		O		✔		the
O		O		✔		ATM
O		O		✔		gene
O		O		✔		at
O		O		✔		11q22
O		O		✔		-
O		O		✔		q23
O		O		✔		(
O		O		✔		ref
O		O		✔		.
O		O		✔		3
O		O		✔		)
O		O		✔		.
