# RNN sentiment classification on IMDB csv

In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
from torchtext import vocab as V
from sklearn.model_selection import train_test_split
from collections import Counter
import time 
import random

torch.backends.cudnn.deterministic = True

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



## General settings

In [2]:
random_seed = 123
torch.manual_seed(random_seed)

vocabulary_size = 20000
learning_rate = 1e-4
batch_size = 128
num_epochs = 15
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embedding_dim = 128
hidden_dim = 256
output_dim = 1

## Data

In [3]:
# !wget https://github.com/rasbt/python-machine-learning-book-2nd-edition/raw/master/code/ch08/movie_data.csv.gz

In [4]:
# !gunzip -f movie_data.csv.gz

In [5]:
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [6]:
df['sentiment'].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

## Vocab

In [7]:
import spacy
tokenizer = data.utils.get_tokenizer('spacy')

# counter = Counter()
# for label, line in train_iter:
#     counter.update(tokenizer(line))
# vocab = V.vocab(counter, min_freq=10, specials=('<unk>', '<pad>'))

def yield_tokens(data_iter):
    for row in data_iter:
        # yield tokenizer(row['review'])
        yield tokenizer(row[1])


def get_vocab(train_datapipe):
    v = V.build_vocab_from_iterator(yield_tokens(train_datapipe),
                                            specials=['<UNK>', '<PAD>'],
                                            max_tokens=vocabulary_size+2)
    v.set_default_index(v['<UNK>'])
    return v

# train_iter = IMDB(split='train')
# vocab = get_vocab(df.iterrows())   # very slow
vocab = get_vocab(df.itertuples())




In [8]:
len(vocab)

20002

In [9]:

class IMDBDataset(torch.utils.data.Dataset):
    
    def __init__(self, df, tokenizer):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        review = self.df.iloc[idx]["review"]
        label = self.df.iloc[idx]["sentiment"]

        tokens = self.tokenizer(review)
        return label, tokens, len(tokens)

In [10]:
from torch.utils.data import DataLoader 
from torch.nn.utils.rnn import pad_sequence 

# text_transform = lambda x: [vocab['<BOS>']] + [vocab[token] for token in tokenize(x)] + [vocab['<EOS>']]  
# tokenizer already used above in the Dataset class
text_transform = lambda x: [vocab['<BOS>']] + [vocab[token] for token in x] + [vocab['<EOS>']]
# label_transform = lambda x: 1 if x == 'pos' else 0

def collate_batch(batch): 
    # Sort the batch in the descending order req for packed padded seq
    # sorted_batch = sorted(batch, key=lambda x: x[2], reverse=True)
    
    label_list, text_list, length_list = [], [] , []
  
    for (_label, _text, _length) in batch: 
        
        # label_list.append(label_transform(_label)) 
        label_list.append(_label)
        processed_text = torch.tensor(text_transform(_text)) 
        text_list.append(processed_text) 
        length_list.append(_length)
    return torch.tensor(label_list), pad_sequence(text_list, padding_value=3.0), torch.LongTensor(length_list)


In [11]:
## create train val test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=0)

train_dataset = IMDBDataset(train_df, tokenizer)
val_dataset = IMDBDataset(val_df, tokenizer)
test_dataset = IMDBDataset(test_df, tokenizer)

# create the train and test dataloaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

In [12]:
x,y,z = next(iter(train_loader))
print(x.shape, x)
print(y.shape, y)
print(z.shape, z)

torch.Size([128]) tensor([0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
        1, 1, 1, 0, 1, 0, 1, 0])
torch.Size([1188, 128]) tensor([[   0,    0,    0,  ...,    0,    0,    0],
        [6563, 1063,  320,  ...,  348,  925,   25],
        [6001,   15,   11,  ...,   91,  469,   75],
        ...,
        [   3,    3,    3,  ...,    3,    3,    3],
        [   3,    3,    3,  ...,    3,    3,    3],
        [   3,    3,    3,  ...,    3,    3,    3]])
torch.Size([128]) tensor([  61,  375,  272,  243,  584,  207,  164,   75,  376,  116,  311,  179,
         531,  341,  277,   75,  150,  395,   46,   68,  674,  234,  267,  159,
         371,  168,  

## Model

In [17]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_length):

        #[sentence len, batch size] => [sentence len, batch size, embedding size]
        embedded = self.embedding(text)

        # packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, text_length)
        
        #[sentence len, batch size, embedding size] => 
        #  output: [sentence len, batch size, hidden size]
        #  hidden: [1, batch size, hidden size]
        # output, hidden = self.rnn(embedded)
        output, (hidden, cell) = self.rnn(embedded)
        # packed_output, (hidden, cell) = self.rnn(packed)
        
        return self.fc(hidden.squeeze(0)).view(-1)

In [18]:
input_dim = len(vocab)

torch.manual_seed(random_seed)
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Training

In [19]:
def compute_binary_accuracy(model, data_loader, device):
    model.eval()
    correct_pred, num_examples = 0,0
    with torch.no_grad():
        for batch_idx, (label, text, text_length) in enumerate(data_loader):
            logits = model(text, text_length)
            predicted_labels = (torch.sigmoid(logits) > 0.5).long()
            num_examples += label.size(0)
            correct_pred += (predicted_labels == label.long()).sum()

    return correct_pred.float()/num_examples * 100

In [20]:
start_time = time.time()

for epoch in range(num_epochs):
    model.train()
    for batch_idx, (label, text, text_length) in enumerate(train_loader):
        
        ### FORWARD AND BACK PROP
        logits = model(text, text_length)
        cost = F.binary_cross_entropy_with_logits(logits, label.float())
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{num_epochs:03d} | '
                   f'Batch {batch_idx:03d}/{len(list(train_loader)):03d} | '
                   f'Cost: {cost:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_binary_accuracy(model, train_loader, device):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_binary_accuracy(model, val_loader, device):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_binary_accuracy(model, test_loader, device):.2f}%')

Epoch: 001/015 | Batch 000/282 | Cost: 0.6926
Epoch: 001/015 | Batch 050/282 | Cost: 0.6932
Epoch: 001/015 | Batch 100/282 | Cost: 0.6946
Epoch: 001/015 | Batch 150/282 | Cost: 0.6970
Epoch: 001/015 | Batch 200/282 | Cost: 0.6933
Epoch: 001/015 | Batch 250/282 | Cost: 0.6937
training accuracy: 50.18%
valid accuracy: 51.08%
Time elapsed: 35.11 min
Epoch: 002/015 | Batch 000/282 | Cost: 0.6923


KeyboardInterrupt: 

In [None]:
## Predict sentiment from sentence

import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    # based on:
    # https://github.com/bentrevett/pytorch-sentiment-analysis/blob/
    # master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(DEVICE)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [None]:
print('Probability positive:')
predict_sentiment(model, "I really love this movie. This movie is so great!")

In [None]:
print('Probability negative:')
predict_sentiment(model, "I really hate this movie. It is really bad and sucks!")