## Pytorch Implementation of RNN for Text Classification

In [4]:
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchtext import data

In [2]:
# load data
text = []
label = []
for line in open("../datasets/sentiment.txt"):
    line = line.rstrip('\n').split('\t')
    text.append(line[0])
    label.append(int(line[1]))
text = np.array(text)
label = np.array(label)

In [26]:
# text pre-processing and vectorization in pytorch
# build vocabulary with the spacy tokenizer and lowercasing
TEXT = data.Field(sequential=True, tokenize='spacy', tokenizer_language='en_core_web_sm', lower=True, batch_first=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)
fields = [('text', TEXT), ('label', LABEL)]
examples = []
for i in range(len(text)):
    examples.append(data.Example.fromlist([text[i], label[i]], fields))
sentimentDataset = data.Dataset(examples, fields)
train_data, test_data = sentimentDataset.split(split_ratio=0.8)
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [42]:
# examine vocabulary
print(len(TEXT.vocab))
print(TEXT.vocab.freqs.most_common(10))
# by default, the first token is unknown, the second is padding
print(TEXT.vocab.itos[:10])
print(len(LABEL.vocab))
print(LABEL.vocab.itos[:2])

4613
[('.', 2117), ('the', 1554), (',', 1025), ('and', 863), ('i', 816), (' ', 798), ('a', 720), ('it', 619), ('is', 617), ('to', 529)]
['<unk>', '<pad>', '.', 'the', ',', 'and', 'i', ' ', 'a', 'it']
2
[0, 1]


In [46]:
# construct datasets and batches
train_iterator, test_iterator = data.BucketIterator.splits((train_data, test_data), batch_size=32, 
                                                           sort_key=lambda x: len(x.text), 
                                                           sort_within_batch=False)

In [60]:
# build an RNN with embedding layer
class RNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=1):
        # vocab_size: size of the vocabulary
        # embedding_dim: size of the word embeddings
        # hidden_dim: size of the hidden states
        # output_dim: size of the outputs
        # n_layers: number of layers in the RNN (i.e. num of stacked RNNs, set to 1 by default)
        # bidirectional: whether the RNN is bidirectional (default to False)
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # can use RNN, LSTM, or GRU here
        #self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, batch_first=True)
        #self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)
        # Basic RNN or GRU produce two outputs: output and hidden. The first has hidden states at every time step and the second has the final hidden states
        output, hidden = self.rnn(embedded)
        # However, if you are using LSTM , there will be three outputs and the third one is the final cell state
        #output, (hidden, cell) = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))

In [61]:
# specify model parameters and training parameters
vocab_size = len(TEXT.vocab)
embedding_dim = 64
hidden_dim = 128
# note that output_dim is 1 because we are doing binary classification
output_dim = 1
model = RNN(vocab_size, embedding_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()


In [62]:
# train the model and print out validation loss after each epoch
epochs = 10
for epoch in range(epochs):
    # training mode
    model.train()
    for batch in train_iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        loss.backward()
        optimizer.step()
    print("Epoch: {}, Training Loss: {}".format(epoch, loss.item()))
    
    # evaluation mode
    model.eval()
    with torch.no_grad():
        for batch in test_iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
        print("Epoch: {}, Validation Loss: {}".format(epoch, loss.item()))

Epoch: 0, Training Loss: 0.695453941822052
Epoch: 0, Validation Loss: 0.6955385208129883
Epoch: 1, Training Loss: 0.6828354001045227
Epoch: 1, Validation Loss: 0.7000489234924316
Epoch: 2, Training Loss: 0.6877161264419556
Epoch: 2, Validation Loss: 0.6898696422576904
Epoch: 3, Training Loss: 0.6021620035171509
Epoch: 3, Validation Loss: 0.7490665316581726
Epoch: 4, Training Loss: 0.44912979006767273
Epoch: 4, Validation Loss: 0.8792015910148621
Epoch: 5, Training Loss: 0.36820706725120544
Epoch: 5, Validation Loss: 0.7942952513694763
Epoch: 6, Training Loss: 0.636217474937439
Epoch: 6, Validation Loss: 0.9420352578163147
Epoch: 7, Training Loss: 0.16370218992233276
Epoch: 7, Validation Loss: 0.9668347835540771
Epoch: 8, Training Loss: 0.22463662922382355
Epoch: 8, Validation Loss: 1.1872559785842896
Epoch: 9, Training Loss: 0.10992171615362167
Epoch: 9, Validation Loss: 1.3811120986938477


In [75]:
# Of course, you can also do bidirectional RNNs, which is just a matter of setting bidirectional=True in the RNN constructor.
# here is an example of bidirectional LSTM
class BiLSTM(nn.Module):
        
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=True, batch_first=True)
        # note that the input dimension of the linear layer is doubled because we are concatenating the final hidden states from both directions
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        # the following line concatenates the final hidden states from both directions
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(hidden.squeeze(0))

In [76]:
# specify model parameters and training parameters
vocab_size = len(TEXT.vocab)
embedding_dim = 64
hidden_dim = 128
# note that output_dim is 1 because we are doing binary classification
output_dim = 1
model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [77]:
# train the model and print out validation loss after each epoch
epochs = 10
for epoch in range(epochs):
    # training mode
    model.train()
    for batch in train_iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        loss.backward()
        optimizer.step()
    print("Epoch: {}, Training Loss: {}".format(epoch, loss.item()))
    
    # evaluation mode
    model.eval()
    with torch.no_grad():
        for batch in test_iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
        print("Epoch: {}, Validation Loss: {}".format(epoch, loss.item()))

Epoch: 0, Training Loss: 0.7456362247467041
Epoch: 0, Validation Loss: 0.6658830046653748
Epoch: 1, Training Loss: 0.5884069800376892
Epoch: 1, Validation Loss: 0.5843148231506348
Epoch: 2, Training Loss: 0.6707966327667236
Epoch: 2, Validation Loss: 0.6619852185249329
Epoch: 3, Training Loss: 0.26380395889282227
Epoch: 3, Validation Loss: 0.5596750378608704
Epoch: 4, Training Loss: 0.1876998245716095
Epoch: 4, Validation Loss: 0.66705721616745
Epoch: 5, Training Loss: 0.42112839221954346
Epoch: 5, Validation Loss: 0.6943915486335754
Epoch: 6, Training Loss: 0.22854258120059967
Epoch: 6, Validation Loss: 0.9938416481018066
Epoch: 7, Training Loss: 0.27132993936538696
Epoch: 7, Validation Loss: 0.9798004627227783
Epoch: 8, Training Loss: 0.039236899465322495
Epoch: 8, Validation Loss: 0.6700077056884766
Epoch: 9, Training Loss: 0.1196562647819519
Epoch: 9, Validation Loss: 0.8626105189323425
