# RNN model comparison between LSTM and GRU

### Leqi Zhao @ Cornell University

In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

### 1. Get the data

We get the dataset from IMDB and separate the data into training, testing and validation parts.

In [2]:
SEED = 1234

# Seed PyTorch (CPU and CUDA) so weight initialisation and dropout are repeatable.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

TEXT = data.Field(tokenize='spacy') # input: review
LABEL = data.LabelField(tensor_type=torch.FloatTensor) # output: sentiment

train, test = datasets.IMDB.splits(TEXT, LABEL) # split the training and testing data

# further split the training data to training and validation parts.
# `random.seed()` returns None, so passing its result as `random_state` handed
# None to `split` and only worked through the side effect of seeding the global
# generator. Seed explicitly, then pass the generator state that `random_state`
# is documented to expect.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

We map the vocabulary to pre-trained word vectors with:

algo: glove

token number: 6 billion

output dimension: 100

The resulting vectors are about 862MB.

In [3]:
# Build the text vocabulary from the training split only, capped at 25000
# entries, and attach the pre-trained "glove.6B.100d" vectors to it.
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
# Build the label vocabulary from the same training split.
LABEL.build_vocab(train)

Create train, valid, test, iterators

In [4]:
# Number of reviews per mini-batch.
BATCH_SIZE = 64

# One iterator per split. `sort_key` gives BucketIterator the review length so
# it can group reviews of similar length; `repeat=False` makes each iterator
# stop after a single pass over its dataset.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

### 2. Build the model

In [5]:
import torch.nn as nn

Here we use an RNN architecture called Long Short-Term Memory (LSTM). The hidden state of an RNN can be thought of as a "memory" of the words seen by the model. However, as information is passed along the sequence, much earlier information is easily lost because the gradient decays exponentially. LSTMs have an extra recurrent state called a cell, which can be thought of as the "memory" of the LSTM and can retain information for many time steps.

#### 2.1 RNN-LSTM

In [6]:
class LSTM(nn.Module):
    """
    Sentiment classifier: embedding -> (bi)directional LSTM -> dropout -> linear.

    vocab_size: number of entries in the embedding table (vocabulary size)
    embedding_dim: post embedding dimension
    hidden_dim: hidden layer dimension (per direction)
    output_dim: output dimension, for the sentiment prediction problem, the output should be a one dimensional number
    n_layers: number of layers
    bidirectional: if to have a second RNN processing from last to first
    dropout: set up dropping out probability for each neuron, method to reduce overfitting
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # One hidden vector per direction is fed to the classifier, so the
        # linear layer must be twice as wide only when the RNN is bidirectional.
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x): #x = [sent len, batch size]
        """Return raw logits of shape [batch size, output dim] for a batch of token ids."""

        embedded = self.dropout(self.embedding(x)) #embedded = [sent len, batch size, emb dim]

        # LSTM has a cell state to better store previous layers' information
        output, (hidden, cell) = self.rnn(embedded) #output = [sent len, batch size, hid dim * num directions]
                                                    #hidden = [num layers * num directions, batch size, hid. dim]
                                                    #cell = [num layers * num directions, batch size, hid. dim]

        if self.rnn.bidirectional:
            # last layer: forward direction is hidden[-2], backward direction is hidden[-1]
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            # last layer only; concatenating hidden[-2] here would mix in a lower layer
            final_hidden = hidden[-1,:,:]

        final_hidden = self.dropout(final_hidden) #final_hidden = [batch size, hid. dim * num directions]

        # No squeeze(0): it would drop the batch dimension when batch size is 1
        # and break callers that do `.squeeze(1)` on the result.
        return self.fc(final_hidden)

#### 2.2 RNN - GRU

For comparison, we use an RNN with Gated Recurrent Units (GRU), which does not have the "cell" state.

In [7]:
class GRU(nn.Module):
    """
    Sentiment classifier: embedding -> (bi)directional GRU -> dropout -> linear.

    vocab_size: number of entries in the embedding table (vocabulary size)
    embedding_dim: post embedding dimension
    hidden_dim: hidden layer dimension (per direction)
    output_dim: output dimension, for the sentiment prediction problem, the output should be a one dimensional number
    n_layers: number of layers
    bidirectional: if to have a second RNN processing from last to first
    dropout: set up dropping out probability for each neuron, method to reduce overfitting
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # One hidden vector per direction is fed to the classifier, so the
        # linear layer must be twice as wide only when the RNN is bidirectional.
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x): #x = [sent len, batch size]
        """Return raw logits of shape [batch size, output dim] for a batch of token ids."""

        embedded = self.dropout(self.embedding(x)) #embedded = [sent len, batch size, emb dim]

        output, hidden = self.rnn(embedded) #output = [sent len, batch size, hid dim * num directions]
                                            #hidden = [num layers * num directions, batch size, hid. dim]

        if self.rnn.bidirectional:
            # last layer: forward direction is hidden[-2], backward direction is hidden[-1]
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            # last layer only; concatenating hidden[-2] here would mix in a lower layer
            final_hidden = hidden[-1,:,:]

        final_hidden = self.dropout(final_hidden) #final_hidden = [batch size, hid. dim * num directions]

        # No squeeze(0): it would drop the batch dimension when batch size is 1
        # and break callers that do `.squeeze(1)` on the result.
        return self.fc(final_hidden)

#### 2.3 Define parameters

Define the input parameters.

In [8]:
# Hyper-parameters shared by the LSTM and GRU models.
INPUT_DIM = len(TEXT.vocab)  # vocabulary size, passed as `vocab_size`
EMBEDDING_DIM = 100          # must match the width of the "glove.6B.100d" vectors
HIDDEN_DIM = 256             # hidden state size of the RNN
OUTPUT_DIM = 1               # a single sentiment logit per review
N_LAYERS = 2                 # number of stacked RNN layers
BIDIRECTIONAL = True         # also read each review from last token to first
DROPOUT = 0.5                # dropout probability

Define the models.

In [9]:
# Both models are built from the same hyper-parameters so that only the
# recurrent unit (LSTM vs. GRU) differs between them.
model_lstm = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model_gru = GRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

The embedding dimension should be equal to the dimension of the pre-trained GloVe vectors.

In [10]:
# Vectors attached to the vocabulary by `TEXT.build_vocab(..., vectors=...)`.
pretrained_embeddings = TEXT.vocab.vectors
# Print the shape to check it against the embedding layer's expected size.
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


We replace the initial weights of the embedding layer with the pre-trained embeddings.

In [11]:
# Overwrite the LSTM model's initial embedding weights in place with the pre-trained vectors.
model_lstm.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0306, -0.0086,  0.1552,  ..., -0.9847,  0.4392,  0.3018],
        [ 0.3614,  0.1344,  0.0411,  ..., -0.1543, -1.0218, -0.5138]])

In [12]:
# Overwrite the GRU model's initial embedding weights in place with the pre-trained vectors.
model_gru.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0306, -0.0086,  0.1552,  ..., -0.9847,  0.4392,  0.3018],
        [ 0.3614,  0.1344,  0.0411,  ..., -0.1543, -1.0218, -0.5138]])

### 3. Train the model

#### 3.1 Define parameters and functions

We choose the Adam optimizer since it adapts the learning rate for each parameter, giving parameters that are updated more frequently lower learning rates and parameters that are updated infrequently higher learning rates.

In [13]:
import torch.optim as optim

# One Adam optimizer per model; only the parameters are passed, so Adam's
# default settings are used for both.
optimizer_lstm = optim.Adam(model_lstm.parameters())
optimizer_gru = optim.Adam(model_gru.parameters())

We define the criterion and place the model and criterion on the GPU

In [14]:
# Binary cross-entropy computed on raw logits; the models therefore return
# un-squashed scores and sigmoid is applied separately where probabilities are needed.
criterion = nn.BCEWithLogitsLoss()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move both models and the loss to the selected device.
model_lstm = model_lstm.to(device)
model_gru = model_gru.to(device)

criterion = criterion.to(device)

Define a function to evaluate prediction accuracy

In [15]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    preds: raw model outputs (logits), same shape as y
    y: ground-truth labels (0. or 1.)
    returns: zero-dimensional tensor holding the fraction of correct predictions
    """

    # squash logits to (0, 1) and round to the closest integer;
    # torch.sigmoid replaces the deprecated F.sigmoid
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division
    # divide by the total element count: len() only counts the first dimension
    # and would over-report accuracy for inputs that are not 1-D
    acc = correct.sum() / correct.numel()
    return acc

In [16]:
def train(model, iterator, optimizer, criterion):
    """
    Run one training epoch; return (mean per-batch loss, mean per-batch accuracy).

    model: either LSTM or GRU; called on `batch.text`
    iterator: iterable of batches exposing `.text` and `.label`; must support len()
    optimizer: optimizer that updates the model parameters
    criterion: method to evaluate loss, called as criterion(predictions, labels)
    """

    # training mode turns on the dropout option
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:

        # clear gradients left over from the previous batch
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        # back-propagate, then apply the parameter update
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [17]:
def evaluate(model, iterator, criterion):
    """
    Score the model on a dataset without updating it; return
    (mean per-batch loss, mean per-batch accuracy).

    model: either LSTM or GRU; called on `batch.text`
    iterator: iterable of batches exposing `.text` and `.label`; must support len()
    criterion: method to evaluate loss, called as criterion(predictions, labels)
    """

    # evaluation mode
    model.eval()

    running_loss, running_acc = 0, 0

    # no gradients are needed when only scoring
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)

            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)

            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

#### 3.2 Train for LSTM

In [18]:
# Number of full passes over the training split.
N_EPOCHS = 5

# Each epoch: train the LSTM on the training split, score it on the validation
# split, then report both sets of metrics.
for epoch in range(N_EPOCHS):

    train_loss_lstm, train_acc_lstm = train(model_lstm, train_iterator, optimizer_lstm, criterion)
    valid_loss_lstm, valid_acc_lstm = evaluate(model_lstm, valid_iterator, criterion)
    # release cached GPU memory between epochs
    torch.cuda.empty_cache()
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss_lstm:.3f}, Train Acc: {train_acc_lstm*100:.2f}%, Val. Loss: {valid_loss_lstm:.3f}, Val. Acc: {valid_acc_lstm*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.668, Train Acc: 59.04%, Val. Loss: 0.595, Val. Acc: 69.58%
Epoch: 02, Train Loss: 0.691, Train Acc: 52.55%, Val. Loss: 0.693, Val. Acc: 50.92%
Epoch: 03, Train Loss: 0.663, Train Acc: 59.79%, Val. Loss: 0.538, Val. Acc: 74.32%
Epoch: 04, Train Loss: 0.579, Train Acc: 69.31%, Val. Loss: 0.576, Val. Acc: 80.51%
Epoch: 05, Train Loss: 0.414, Train Acc: 82.99%, Val. Loss: 0.328, Val. Acc: 86.76%


Testing dataset prediction result for LSTM

In [19]:
# Score the trained LSTM on the held-out test split.
test_loss_lstm, test_acc_lstm = evaluate(model_lstm, test_iterator, criterion)
# release cached GPU memory
torch.cuda.empty_cache()
print(f'Test Loss: {test_loss_lstm:.3f}, Test Acc: {test_acc_lstm*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.358, Test Acc: 84.95%


#### 3.3 Train for GRU

In [20]:
# Same number of epochs as for the LSTM, so the two runs are comparable.
N_EPOCHS = 5

# Each epoch: train the GRU on the training split, score it on the validation
# split, then report both sets of metrics.
for epoch in range(N_EPOCHS):
    train_loss_gru, train_acc_gru = train(model_gru, train_iterator, optimizer_gru, criterion)
    valid_loss_gru, valid_acc_gru = evaluate(model_gru, valid_iterator, criterion)
    # release cached GPU memory between epochs
    torch.cuda.empty_cache()
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss_gru:.3f}, Train Acc: {train_acc_gru*100:.2f}%, Val. Loss: {valid_loss_gru:.3f}, Val. Acc: {valid_acc_gru*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.633, Train Acc: 62.25%, Val. Loss: 0.391, Val. Acc: 83.81%
Epoch: 02, Train Loss: 0.342, Train Acc: 85.43%, Val. Loss: 0.310, Val. Acc: 88.10%
Epoch: 03, Train Loss: 0.219, Train Acc: 91.63%, Val. Loss: 0.244, Val. Acc: 90.40%
Epoch: 04, Train Loss: 0.162, Train Acc: 93.95%, Val. Loss: 0.238, Val. Acc: 90.50%
Epoch: 05, Train Loss: 0.113, Train Acc: 96.07%, Val. Loss: 0.270, Val. Acc: 90.19%


Testing dataset prediction result for GRU

In [21]:
# Score the trained GRU on the held-out test split.
test_loss_gru, test_acc_gru = evaluate(model_gru, test_iterator, criterion)
# release cached GPU memory
torch.cuda.empty_cache()
print(f'Test Loss: {test_loss_gru:.3f}, Test Acc: {test_acc_gru*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.326, Test Acc: 87.77%


### 4. Model comparison

On the test dataset, GRU achieves better accuracy than LSTM (87.77% vs. 84.95%).

Define functions to predict the sentiment of a sentence with the previously trained models. The return value is a number between 0 and 1: close to 1 => positive sentiment; close to 0 => negative sentiment.

In [22]:
import spacy
# Load spaCy's English model; only its tokenizer (`nlp.tokenizer`) is used below.
nlp = spacy.load('en')

def predict_sentiment_lstm(sentence):
    """
    Score one sentence with the trained LSTM model.

    sentence: raw review text
    returns: float between 0 and 1; close to 1 => positive, close to 0 => negative
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # [sent len] -> [sent len, batch size = 1]

    # Force evaluation mode so dropout cannot randomise the prediction, and
    # skip gradient tracking since nothing is being trained here.
    model_lstm.eval()
    with torch.no_grad():
        # torch.sigmoid replaces the deprecated F.sigmoid
        prediction = torch.sigmoid(model_lstm(tensor))
    return prediction.item()

def predict_sentiment_gru(sentence):
    """
    Score one sentence with the trained GRU model.

    sentence: raw review text
    returns: float between 0 and 1; close to 1 => positive, close to 0 => negative
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # [sent len] -> [sent len, batch size = 1]

    # Force evaluation mode so dropout cannot randomise the prediction, and
    # skip gradient tracking since nothing is being trained here.
    model_gru.eval()
    with torch.no_grad():
        # torch.sigmoid replaces the deprecated F.sigmoid
        prediction = torch.sigmoid(model_gru(tensor))
    return prediction.item()

#### 4.1 Positive review example:

In [25]:
predict_sentiment_lstm("The movie scene is beautiful, and I really enjoy the movie soundtrack!")



0.9468914866447449

In [26]:
predict_sentiment_gru("The movie scene is beautiful, and I really enjoy the movie soundtrack!")



0.9870263934135437

#### 4.2 Negative review example:

In [27]:
predict_sentiment_lstm("The content is boring...I cannot help myself to sleep.")



0.06164019927382469

In [28]:
predict_sentiment_gru("The content is boring...I cannot help myself to sleep.")



0.09542155265808105