# IMDB reviews sentiment analysis with RNN

## Data processing for RNN input

### 1. Load data and create datasets for processing

In [5]:
from torch import nn
from torchtext.datasets import IMDB

train_ds = IMDB(split='train')
test_ds = IMDB(split='test')

train_ds = list(train_ds)
test_ds = list(test_ds)

In [6]:
from torch.utils.data.dataset import random_split
import torch

torch.manual_seed(13)
train_ds, valid_ds = random_split(train_ds, [0.8, 0.2])
print(f'Train dataset size: {len(train_ds)}, Valid dataset size: {len(valid_ds)}')

Train dataset size: 20000, Valid dataset size: 5000


In [49]:
# Let's see how it looks
train_ds[2002]

(2,
 'Dick Clement and Ian La Frenais have a solid hit rate as far as their TV work is concerned. However, their film work has been much more chequered (2008\'s The Bank Job was fine, the previous year\'s Across The Universe decidedly weak, for instance).<br /><br />Still Crazy, fortunately, is a solid success. It has a great story, excellent performances, a lot of humour, fabulous music and, above everything else, real heart.<br /><br />I savour "moments", and this film has one of them - just when everything is going pear-shaped at the festival reunion performance...<br /><br />Hugely enjoyable.')

### 2. Now we find unique words using text preprocessor from previous project

In [23]:
import re
from collections import Counter, OrderedDict

# Get text words, ignore html tags, add emojis at the end
def tokenizer(text):
    text = re.sub(r'<[^>]*>', '', text)
    emots = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emots).replace('-', ''))
    tokenized_text = text.split()
    return tokenized_text

# Hashmap of <word, counts>
token_counts = Counter()

for label, review in train_ds:
    token_counts.update(tokenizer(review))
    
print(f'Vocab size: {len(token_counts)}')

Vocab size: 69161


### 3. We will encode unique tokens into numbers  

For that, we use the `Vocab` class form torchtext that allow us to create such a mapping and encode the entire dataset.

The point of the having the ordered words by occurrences is to ensure that the tokens are indexed based on their frequency of occurrence, with more common tokens getting lower integer values, which is a typical practice in NLP to optimize learning.

In [35]:
sorted_token_counts = sorted(token_counts.items(), key = lambda item: item[1], reverse=True)
ordered_token_map = OrderedDict(sorted_token_counts)

from torchtext import vocab

vocab = vocab.vocab(ordered_token_map)
vocab.insert_token('<pad>', 0)
vocab.insert_token('<unk>', 1) # index 1 is the placeholder for unknown tokens (not the case of all the words from IMDB reviews)
vocab.set_default_index(1)

In [43]:
print([vocab[token] for token in tokenizer('The film was not SO good :( but...')])

[2, 20, 14, 24, 37, 50, 19, 10088]


Define functions for transformations

In [51]:
def text_pipeline(text):
    return [vocab[token] for token in tokenizer(text)]

def label_pipeline(label):
    # label 2 is positive review, 1 is negative
    return 1. if label == 2 else 0.

Put together the processed labels and texts

In [52]:
def collate_batch_fn(batch):
    labels, texts, lengths = [], [], []
    for label, text in batch:
        labels.append(label_pipeline(label))
        tokens_vocab = torch.tensor(text_pipeline(text), dtype=torch.int64)
        texts.append(tokens_vocab)
        lengths.append(tokens_vocab.size(0))
    labels = torch.tensor(labels, dtype=torch.int64)
    lengths = torch.tensor(lengths, dtype=torch.int64)
    # pad consecutive elements that are to be combined into a batch with placeholder values (0s)
    # so that all sequences within a batch have the same shape
    # this method pads every tensor as much as necessary to match the max size
    padded_texts = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True)
    
    return padded_texts, labels, lengths

Let's see how this works

In [53]:
from torch.utils.data import DataLoader

example_dl = DataLoader(train_ds, batch_size=4, shuffle=True, collate_fn=collate_batch_fn)

padded_texts, labels, lengths = next(iter(example_dl))
print('Labels: ', labels)
print('Lengths (not padded): ', lengths)
print('Shapes (padded): ', padded_texts.shape)
print('Textx (padded): ', padded_texts)

Labels:  tensor([0, 0, 0, 0])
Lengths (not padded):  tensor([156, 210, 187, 136])
Shapes (padded):  torch.Size([4, 210])
Textx (padded):  tensor([[ 1313,  4623,    10,   115,  1659,  4413,    10,    65,    81,     3,
            25,   125,    27,   114,    17,    35,  1284,   212,    10,    51,
            66,    89,    11,     7,     4,    18,    12,   235,   166,    50,
            15,     4,   226,    19,     8,    13,  3904,  2354,     3,  1145,
           704,    10,   215,     2,  1172,   341,   288,   571,     4,  3918,
           243, 12870,  4241,   502,   253,    17,    35, 14472,   171,     3,
            37,   125,  5372,  6442,     2,  1229,   618,   399,    10,    51,
            21,   373,     2,    30,    12,    13,    24,  1389,  5266,     7,
             4,   192, 12146,   120,   275,    66,     8,    47,    22,   149,
            24,  1432,   197,     6, 18618,   202,     4,    18,   110,    37,
          1064,    22,    51,    21,   795,     8,     3,    37,  1481, 

### 4. Create DataLoaders for mini-batches

In [None]:
batch_size = 16
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_fn)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_batch_fn)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_batch_fn)

## Building RNN

In [None]:
import torch.nn as nn

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2, output_size=1):
        super().__init__()
        
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        