# IMDB sentiment analysis using RNN

In [17]:
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split
import torch
import torch.nn as nn

In [6]:
train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')

In [9]:
## Step 1: creating the datasets
torch.manual_seed(1)
train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])

In [10]:
test_dataset = list(test_dataset)

In [12]:
## Step 2: finding unique tokens (words)
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  text = re.sub('[\W]+', ' ', text.lower()) +\


In [13]:
token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print('Vocab-size:', len(token_counts))

Vocab-size: 69023


In [14]:
## Step 3: encoding each unique token into integers
from torchtext.vocab import vocab
sorted_by_freq_tuples = sorted(
    token_counts.items(), key=lambda x: x[1], reverse=True
)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

In [15]:
print([vocab[token] for token in ['this','is','an','example']])

[11, 7, 35, 457]


 If we have q tokens (that is, the size of token_counts
passed to Vocab, which in this case is 69,023), then all tokens that haven’t been seen before, and are
thus not included in token_counts, will be assigned the integer 1 (a placeholder for the unknown to-
ken). In other words, the index 1 is reserved for unknown words. Another reserved value is the integer
0, which serves as a placeholder, a so-called padding token, for adjusting the sequence length. 

In [21]:
## Step 3-A: defining the functions for transformation
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 2 else 0.

In [22]:
## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True
    )
    return padded_text_list, label_list, lengths

In [23]:
## Taking a small batch
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [24]:
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([[   35,  1739,     7,   449,   721,     6,   301,     4,   787,     9,
             4,    18,    44,     2,  1705,  2460,   186,    25,     7,    24,
           100,  1874,  1739,    25,     7, 34415,  3568,  1103,  7517,   787,
             5,     2,  4991, 12401,    36,     7,   148,   111,   939,     6,
         11598,     2,   172,   135,    62,    25,  3199,  1602,     3,   928,
          1500,     9,     6,  4601,     2,   155,    36,    14,   274,     4,
         42945,     9,  4991,     3,    14, 10296,    34,  3568,     8,    51,
           148,    30,     2,    58,    16,    11,  1893,   125,     6,   420,
          1214,    27, 14542,   940,    11,     7,    29,   951,    18,    17,
         15994,   459,    34,  2480, 15211,  3713,     2,   840,  3200,     9,
          3568,    13,   107,     9,   175,    94,    25,    51, 10297,  1796,
            27,   712,    16,     2,   220,    17,     4,    54,   722,   238,
           395,     2,   787,    32,    27,  5236,  

In [25]:
# dividing all three datasets into data loaders with a batch size of 32
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,collate_fn=collate_batch)

Given a set of tokens of size n+2 (n is the size of the token set, plus index 0 is reserved for the padding placeholder, and 1 is for the words not present in the token set), an embedding matrix of size (n+2) × embedding_dim will be created where each row of this matrix represents numeric features associated with a token. Therefore, when an integer index, i, is given as input to the embedding, it will look up the corresponding row of the matrix at index i and return the numeric features. 

In [26]:
embedding = nn.Embedding(
    num_embeddings=10,
    embedding_dim=3, padding_idx=0
)

In [28]:
# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[-0.4651, -0.3203,  2.2408],
         [ 0.3824, -0.3446, -0.3531],
         [-0.0251, -0.5973, -0.2959],
         [ 0.8356,  0.4025, -0.6924]],

        [[-0.0251, -0.5973, -0.2959],
         [ 0.9124, -0.4643,  0.3046],
         [ 0.3824, -0.3446, -0.3531],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)
