# RNN for sentiment classification

In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
from torchtext import vocab as V
from sklearn.model_selection import train_test_split
from collections import Counter
import time 
import random

torch.backends.cudnn.deterministic = True

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



## General Settings

In [2]:
random_seed = 123
torch.manual_seed(random_seed)

vocabulary_size = 2000
learning_rate = 1e-4
batch_size = 128
num_epochs = 15
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

emnbedding_dim = 128
hidden_dim = 256
output_dim = 1

## Dataset

In [3]:
## load IMDB dataset

# text = data.Field(tokenize = 'spacy')
# label = data.LabelField(dtype = torch.float)
# train_data, test_data = datasets.IMDB.splits(text, label)
# train_data, valid_data = train_data.split(random_state=random_seed.seed(random_seed),
#                                           split_ratio=0.8)

train_iter, test_iter = datasets.IMDB(split=('train', 'test'))
# train_iter, valid_iter = train_iter.random_split(total_length=25000, weights={"train": 0.8, "valid": 0.2}, seed=0)

train_data = list(train_iter)
# valid_data = list(valid_iter)
test_data = list(test_iter)

print(f'Num Train: {len(train_data)}')
# print(f'Num Valid: {len(valid_data)}')
print(f'Num Test: {len(test_data)}')

Num Train: 12500
Num Test: 25000


In [4]:
tokenizer = data.utils.get_tokenizer('spacy')

# counter = Counter()
# for label, line in train_iter:
#     counter.update(tokenizer(line))
# vocab = V.vocab(counter, min_freq=10, specials=('<unk>', '<pad>'))

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


def get_vocab(train_datapipe):
    v = V.build_vocab_from_iterator(yield_tokens(train_datapipe),
                                            specials=['<UNK>', '<PAD>'],
                                            max_tokens=vocabulary_size+2)
    v.set_default_index(v['<UNK>'])
    return v

# train_iter = IMDB(split='train')
vocab = get_vocab(train_iter)




In [5]:
len(vocab)

2002

In [6]:
from torch.utils.data import DataLoader 
from torch.nn.utils.rnn import pad_sequence 

text_transform = lambda x: [vocab['<BOS>']] + [vocab[token] for token in tokenizer(x)] + [vocab['<EOS>']]
# label_transform = lambda x: 1 if x == 'pos' else 0

def collate_batch(batch): 
    label_list, text_list = [], [] 
  
    for (_label, _text) in batch: 
        # label_list.append(label_transform(_label)) 
        label_list.append(_label) 
        processed_text = torch.tensor(text_transform(_text)) 
        text_list.append(processed_text) 
    return torch.tensor(label_list), pad_sequence(text_list, padding_value=3.0) 


In [18]:
from torch.utils.data import Sampler

class BatchSamplerSimilarLength(Sampler):
  def __init__(self, dataset, batch_size, indices=None, shuffle=True):
    self.batch_size = batch_size
    self.shuffle = shuffle
    # get the indices and length
    self.indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(dataset)]
    # if indices are passed, then use only the ones passed (for ddp)
    if indices is not None:
       self.indices = torch.tensor(self.indices)[indices].tolist()

  def __iter__(self):
    if self.shuffle:
       random.shuffle(self.indices)

    pooled_indices = []
    # create pool of indices with similar lengths
    for i in range(0, len(self.indices), self.batch_size * 100):
      pooled_indices.extend(sorted(self.indices[i:i + self.batch_size * 100], key=lambda x: x[1]))
    self.pooled_indices = [x[0] for x in pooled_indices]

    # yield indices for current batch
    batches = [self.pooled_indices[i:i + self.batch_size] for i in
               range(0, len(self.pooled_indices), self.batch_size)]

    if self.shuffle:
        random.shuffle(batches)
    for batch in batches:
        yield batch

  def __len__(self):
    return len(self.pooled_indices) // self.batch_size

# def batch_sampler():
#     indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(train_list)]
#     random.shuffle(indices)
#     pooled_indices  = []
#     # create pool of indices with similar lengths 
#     for i in range(0, len(indices), batch_size * 100):
#         pooled_indices.extend(sorted(indices[i:i + batch_size * 100], key=lambda x: x[1]))

#     pooled_indices = [x[0] for x in pooled_indices]

#     # yield indices for current batch
#     for i in range(0, len(pooled_indices), batch_size):
#         yield pooled_indices[i:i + batch_size]

# sample_dataloader = DataLoader(list(train_iter), 
#                                batch_sampler=BatchSamplerSimilarLength(dataset=list(train_iter), batch_size=batch_size),
#                                collate_fn=collate_batch)
sample_dataloader = DataLoader(list(train_iter), 
                               batch_size=batch_size,
                               collate_fn=collate_batch, 
                                shuffle=True)


print(next(iter(sample_dataloader)))

(tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1]), tensor([[   0,    0,    0,  ...,    0,    0,    0],
        [1672,   15,   10,  ...,   56,    0,  159],
        [   4,    0,  210,  ...,  215,   80,    0],
        ...,
        [   3,    3,    3,  ...,    3,    3,    3],
        [   3,    3,    3,  ...,    3,    3,    3],
        [   3,    3,    3,  ...,    3,    3,    3]]))


In [8]:
train_loader = DataLoader(list(train_iter),
                          batch_sampler=BatchSamplerSimilarLength(dataset=list(train_iter), batch_size=batch_size),
                          collate_fn=collate_batch, shuffle=True)
# valid_loader = DataLoader(list(valid_iter),
#                           batch_sampler=BatchSamplerSimilarLength(dataset=list(valid_iter), batch_size=batch_size),
#                           collate_fn=collate_batch)
test_loader = DataLoader(list(test_iter),
                          batch_sampler=BatchSamplerSimilarLength(dataset=list(test_iter), batch_size=batch_size),
                          collate_fn=collate_batch, shuffle=True)

## Model

In [9]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):

        #[sentence len, batch size] => [sentence len, batch size, embedding size]
        embedded = self.embedding(text)

        #[sentence len, batch size, embedding size] => 
        #  output: [sentence len, batch size, hidden size]
        #  hidden: [1, batch size, hidden size]
        output, hidden = self.rnn(embedded)
        
        return self.fc(hidden.squeeze(0)).view(-1)

In [10]:
input_dim = len(vocab)
embedding_dim = 64
hidden_dim = 128
output_dim = 1

torch.manual_seed(random_seed)
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Training

In [11]:
def compute_binary_accuracy(model, data_loader, device):
    model.eval()
    correct_pred, num_examples = 0,0
    with torch.no_grad():
        for batch_idx, (label, text) in enumerate(data_loader):
            logits = model(text)
            predicted_labels = (torch.sigmoid(logits) > 0.5).long()
            num_examples += label.size(0)
            correct_pred += (predicted_labels == label.long()).sum()

    return correct_pred.float()/num_examples * 100

In [12]:
start_time = time.time()

for epoch in range(num_epochs):
    model.train()
    for batch_idx, (label, text) in enumerate(train_loader):
        
        ### FORWARD AND BACK PROP
        logits = model(text)
        cost = F.binary_cross_entropy_with_logits(logits, label.float())
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{num_epochs:03d} | '
                   f'Batch {batch_idx:03d}/{len(list(train_loader)):03d} | '
                   f'Cost: {cost:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_binary_accuracy(model, train_loader, device):.2f}%')
              # f'\nvalid accuracy: '
              # f'{compute_binary_accuracy(model, valid_loader, device):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_binary_accuracy(model, test_loader, device):.2f}%')

Epoch: 001/015 | Batch 000/098 | Cost: 0.8433
Epoch: 001/015 | Batch 050/098 | Cost: 0.2869
training accuracy: 100.00%
Time elapsed: 1.06 min
Epoch: 002/015 | Batch 000/098 | Cost: 0.0662
Epoch: 002/015 | Batch 050/098 | Cost: 0.0349
training accuracy: 100.00%
Time elapsed: 2.20 min
Epoch: 003/015 | Batch 000/098 | Cost: 0.0155
Epoch: 003/015 | Batch 050/098 | Cost: 0.0116
training accuracy: 100.00%
Time elapsed: 3.24 min
Epoch: 004/015 | Batch 000/098 | Cost: 0.0088
Epoch: 004/015 | Batch 050/098 | Cost: 0.0080
training accuracy: 100.00%
Time elapsed: 4.34 min
Epoch: 005/015 | Batch 000/098 | Cost: 0.0054
Epoch: 005/015 | Batch 050/098 | Cost: 0.0049
training accuracy: 100.00%
Time elapsed: 5.56 min
Epoch: 006/015 | Batch 000/098 | Cost: 0.0054
Epoch: 006/015 | Batch 050/098 | Cost: 0.0038
training accuracy: 100.00%
Time elapsed: 6.76 min
Epoch: 007/015 | Batch 000/098 | Cost: 0.0037
Epoch: 007/015 | Batch 050/098 | Cost: 0.0030
training accuracy: 100.00%
Time elapsed: 7.78 min
Epoch:

In [13]:
## Predict sentiment from sentence

import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    # based on:
    # https://github.com/bentrevett/pytorch-sentiment-analysis/blob/
    # master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [vocab.get_stoi()[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()


In [14]:
print('Probability positive:')
predict_sentiment(model, "I really love this movie. This movie is so great!")

Probability positive:


0.9982149600982666

In [15]:
## Something wrong with the dataset
## let's try using csv