# IMDB sentiment analysis with RNNs

Kaggle: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [1]:
import pandas as pd
import os
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
#from spellchecker import SpellChecker
from tqdm import tqdm
# allows to have a progress bar in pandas, useful for long processing operations
tqdm.pandas()
from collections import Counter
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("a")

import wandb
wandb.login(key=secret_value_0)
wandb.init(project='sentiment_analysis', save_code=True)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlorenzozanolin-52[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.2
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240129_184501-lfgnfl5a[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdeep-snowflake-13[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/lorenzozanolin-52/sentiment_analysis[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/lorenzozanolin-52/sentiment_analysis/runs/lfgnfl5a[0m


Read the dataset and observe the first 5 rows.

In [2]:
data = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Lucky us, the dataset is well-balanced.

In [3]:
data.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

Transform the labels to 0 and 1.

In [4]:
def transform_label(label):
    return 1 if label == 'positive' else 0


data['label'] = data['sentiment'].progress_apply(transform_label)
data.head

100%|██████████| 50000/50000 [00:00<00:00, 423971.99it/s]


<bound method NDFrame.head of                                                   review sentiment  label
0      One of the other reviewers has mentioned that ...  positive      1
1      A wonderful little production. <br /><br />The...  positive      1
2      I thought this was a wonderful way to spend ti...  positive      1
3      Basically there's a family where a little boy ...  negative      0
4      Petter Mattei's "Love in the Time of Money" is...  positive      1
...                                                  ...       ...    ...
49995  I thought this movie did a down right good job...  positive      1
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative      0
49997  I am a Catholic taught in parochial elementary...  negative      0
49998  I'm going to have to disagree with the previou...  negative      0
49999  No one expects the Star Trek movies to be high...  negative      0

[50000 rows x 3 columns]>

## Preprocessing

- In classic NLP, the text is often preprocessed to remove tokens that might confuse the classifier
- Below you can find some examples of possible preprocessing techniques
- Feel free to modify them to improve the results of your classifier

In [5]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
stopwords = set(stopwords.words('english'))

def rm_link(text):
    return re.sub(r'http\S+', '', text)


# handle case like "shut up okay?Im only 10 years old"
# become "shut up okay Im only 10 years old"
def rm_punct2(text):
    # return re.sub(r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)
    return re.sub(r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\<\=\>\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)


def rm_html(text):
    # remove html tags
    text = re.sub(r'<.*?>', '', text)
    # remove <br /> tags
    return re.sub(r'<br />', '', text)


def space_bt_punct(text):
    pattern = r'([.,!?-])'
    s = re.sub(pattern, r' \1 ', text)  # add whitespaces between punctuation
    s = re.sub(r'\s{2,}', ' ', s)  # remove double whitespaces
    return s


def rm_number(text):
    return re.sub(r'\d+', '', text)


def rm_whitespaces(text):
    return re.sub(r'\s+', ' ', text)


def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)


def rm_emoji(text):
    emojis = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE
    )
    return emojis.sub(r'', text)

def rm_contractions(text): #removes contracted form and insert normal form, i.e. he's -> he is
    text=re.sub("isn't",'is not',text)
    text=re.sub("he's",'he is',text)
    text=re.sub("wasn't",'was not',text)
    text=re.sub("there's",'there is',text)
    text=re.sub("couldn't",'could not',text)
    text=re.sub("won't",'will not',text)
    text=re.sub("they're",'they are',text)
    text=re.sub("she's",'she is',text)
    text=re.sub("There's",'there is',text)
    text=re.sub("wouldn't",'would not',text)
    text=re.sub("haven't",'have not',text)
    text=re.sub("That's",'That is',text)
    text=re.sub("you've",'you have',text)
    text=re.sub("He's",'He is',text)
    text=re.sub("what's",'what is',text)
    text=re.sub("weren't",'were not',text)
    text=re.sub("we're",'we are',text)
    text=re.sub("hasn't",'has not',text)
    text=re.sub("you'd",'you would',text)
    text=re.sub("shouldn't",'should not',text)
    text=re.sub("let's",'let us',text)
    text=re.sub("they've",'they have',text)
    text=re.sub("You'll",'You will',text)
    text=re.sub("i'm",'i am',text)
    text=re.sub("we've",'we have',text)
    text=re.sub("it's",'it is',text)
    text=re.sub("don't",'do not',text)
    text=re.sub("that´s",'that is',text)
    text=re.sub("I´m",'I am',text)
    text=re.sub("it’s",'it is',text)
    text=re.sub("she´s",'she is',text)
    text=re.sub("he’s'",'he is',text)
    text=re.sub('I’m','I am',text)
    text=re.sub('I’d','I did',text)
    text=re.sub("he’s'",'he is',text)
    text=re.sub('there’s','there is',text)
    return text

def spell_correction(text):
    # if too slow: return text
    return text
    # https://pypi.org/project/pyspellchecker/
    spell = 0#SpellChecker()
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            candidate = spell.correction(word)
            if candidate is not None:
                corrected_text.append(candidate)
            else:
                corrected_text.append(word)
        else:
            corrected_text.append(word)
    return ' '.join(corrected_text)

def clean_pipetext(text):
    text = text.lower()
    no_link = rm_link(text)
    no_cont = rm_contractions(no_link)
    no_html = rm_html(no_cont)
    space_punct = space_bt_punct(no_html)
    no_punct = rm_punct2(space_punct)
    no_number = rm_number(no_punct)
    no_whitespaces = rm_whitespaces(no_number)
    no_nonasci = rm_nonascii(no_whitespaces)
    no_emoji = rm_emoji(no_nonasci)
    spell_corrected = spell_correction(no_emoji)
    return spell_corrected

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


Let's clean the reviews first:

In [6]:
data['review'] = data['review'].progress_apply(clean_pipetext)

100%|██████████| 50000/50000 [00:28<00:00, 1771.56it/s]


We now tokenize and remove stopwords (i.e. the, a, an, etc.) and lemmatize the words (i.e. running -> run, better -> good, etc.).

In [7]:
!python3 -m nltk.downloader wordnet
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

# preprocessing
def tokenize(text):
    return word_tokenize(text)


def rm_stopwords(text):
    return [i for i in text if i not in stopwords]


def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(t) for t in text]
    # make sure lemmas does not contains stopwords
    return rm_stopwords(lemmas)


def preprocess_pipetext(text):
    tokens = tokenize(text)
    no_stopwords = rm_stopwords(tokens)
    lemmas = lemmatize(no_stopwords)
    return ' '.join(lemmas)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/sh

In [8]:
data['review'] = data['review'].progress_apply(preprocess_pipetext)

100%|██████████| 50000/50000 [03:31<00:00, 236.89it/s]


Let's check the result.

In [9]:
data.head()

Unnamed: 0,review,sentiment,label
0,one reviewer mentioned watching oz episode hoo...,positive,1
1,wonderful little production . filming techniqu...,positive,1
2,thought wonderful way spend time hot summer we...,positive,1
3,basically family little boy jake think zombie ...,negative,0
4,petter mattei love time money visually stunnin...,positive,1


## Embedding

- ANNs cannot process text input
- Input tokens must be mapped to integers using a vocabulary
- In this example, we build a vocabulary manually, but you can also replace this code with an [embedding layer](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html)

In [10]:
# get all processed reviews
reviews = data.review.values
# merge into single variable, separated by whitespaces
words = ' '.join(reviews)
# obtain list of words
words = words.split()
# build vocabulary
counter = Counter(words)
# only keep top 2000 words
vocab = sorted(counter, key=counter.get, reverse=True)[:2000]
int2word = dict(enumerate(vocab, 2))
int2word[0] = '<PAD>'
int2word[1] = '<UNK>'
word2int = {word: id for id, word in int2word.items()}

In [11]:
reviews_enc = [[word2int[word] if word in word2int else word2int['<UNK>'] for word in review.split()] for review in tqdm(reviews, desc='encoding')]

encoding: 100%|██████████| 50000/50000 [00:02<00:00, 22798.08it/s]


Because we have to build batch, we have to pad the reviews to the same length. We will pad the reviews with <PAD> token.
**Because we use RNNs, we need to left pad and not right pad the sequence.**

In [12]:
# left padding sequences
def pad_features(reviews, pad_id, seq_length=128):
    # features = np.zeros((len(reviews), seq_length), dtype=int)
    features = np.full((len(reviews), seq_length), pad_id, dtype=int)

    for i, row in enumerate(reviews):
        start_index = max(0, seq_length - len(row))
        # if seq_length < len(row) then review will be trimmed
        features[i, start_index:] = np.array(row)[:min(seq_length, len(row))]

    return features


seq_length = 128
features = pad_features(reviews_enc, pad_id=word2int['<PAD>'], seq_length=seq_length)

## Split the data

In [13]:
labels = data.label.to_numpy()

# train test split
train_size = .75  # we will use 75% of whole data as train set
val_size = .5  # and we will use 50% of test set as validation set

# stratify will make sure that train and test set have same distribution of labels
train_x, test_x, train_y, test_y = train_test_split(features, labels, test_size=1 - train_size, stratify=labels)

# split test set into validation and test set
val_x, test_x, val_y, test_y = train_test_split(test_x, test_y, test_size=val_size, stratify=test_y)

Define the datasets and dataloaders.

In [14]:
# define batch size
batch_size = 128

wandb.log({'batch_size': batch_size})

# create tensor datasets
train_dataset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_dataset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_dataset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# create dataloaders
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

Define the model.

In [15]:
import torch.nn as nn

class ManyToOneRNN(nn.Module):
    def __init__(self, input_size, emb_size, hidden_size, num_layers, num_classes):
        super(ManyToOneRNN, self).__init__()
        # Embedding layer to convert input indices to dense vectors
        self.embedding = nn.Embedding(input_size, emb_size) #in this case input size is the size of the vocabulary
        # set the hidden size and the number of layers for the RNN
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # we will use a RNN with a final Fully connected layer to predict the output (class positive or negative)
        self.rnn = nn.RNN(emb_size, hidden_size, num_layers, batch_first=True) # (batch, sequence_length, X)
        self.fc = nn.Linear(hidden_size, num_classes)
        # weights initialization for the fully connected layer
        #nn.init.xavier_normal_(self.fc.weight)
    
    def forward(self, x):
        # Embedding layer to convert input indices to dense vectors
        x = self.embedding(x)
        # RNN layer will output the hidden state and the output
        rnn_out, h_n = self.rnn(x)
        # Assuming h_n is a tuple of hidden states from all layers
        # Concatenate the hidden states from all layers (assuming the last layer [-1])
        h_n = h_n[-1].squeeze(0)
        # Pass the concatenated hidden states through the fully connected layer
        out = self.fc(h_n)
        
        return out

Instantiate the model.

In [16]:
#wandb.init()
emb_size = 256  # Dimension of the word embeddings
hidden_size = 128  # Number of features in the hidden state
output_size = 1  # Dimension of the output, e.g., for a binary classification problem

wandb.log({'Embedding Size': emb_size})

model = ManyToOneRNN(input_size=len(word2int),emb_size=emb_size,hidden_size=hidden_size,num_layers=1,num_classes=output_size)
device = 'cuda' if torch.cuda.is_available() else 'cpu'  #we want to move the net on the GPU
model = model.to(device)
if device == 'cuda':
    model = torch.nn.DataParallel(model) # if multiple GPUs use them
    
print(model)

DataParallel(
  (module): ManyToOneRNN(
    (embedding): Embedding(2002, 256)
    (rnn): RNN(256, 128, batch_first=True)
    (fc): Linear(in_features=128, out_features=1, bias=True)
  )
)


Define the training loop.

In [17]:
lr = 0.0001
o = 'a'

if o == 'a':
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.001)
    wandb.log({'optimizer':'Adam'})
elif o == 'r':
    optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, alpha=0.9)
    wandb.log({'optimizer':'RMSprop'})

criterion = nn.BCEWithLogitsLoss()

num_epochs = 50

wandb.log({'lr': lr})
wandb.log({'epochs': num_epochs})

for epoch in range(num_epochs):
    # Set the model to training mode
    model.train()
    # Variable for total loss in each epoch
    total_loss = 0.0
    
    # Iterate through the training data
    for inputs, labels in train_loader:
        inputs = inputs.to(device)  #move data on the GPU
        labels = labels.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        
        # Reshape the labels
        labels = labels.view(-1, 1)  # Change dimensions to [batch_size, 1]
            
        # Compute the loss
        loss = criterion(outputs, labels.float())  # Convert labels to float
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        # Update the total loss
        total_loss += loss.item()
    
    # Calculate the average loss per epoch
    Training_Loss = total_loss / len(train_loader)
    
    # Print the average loss per epoch during training
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {Training_Loss:.4f}')
    
    # Set the model to evaluation mode
    model.eval()
    
    # Variables for total loss and number of correct predictions
    total_loss = 0.0
    correct_predictions = 0

    # Iterate through the validation data
    with torch.no_grad():  # Disable gradient computation during evaluation
        for inputs, labels in valid_loader:
            inputs = inputs.to(device)  #move data on the GPU
            labels = labels.to(device)
            
            # Forward pass
            
            outputs = model(inputs)
            
            # Reshape the labels
            labels = labels.view(-1, 1)  # Change dimensions to [batch_size, 1]
                
            # Compute the loss
            loss = criterion(outputs, labels.float())  # Convert labels to float
            
            # Update the total loss
            total_loss += loss.item()
            
            # Calculate the number of correct predictions
            threshold = 0.5
            predicted_labels = (torch.sigmoid(outputs) > threshold).float()
            correct_predictions += (predicted_labels == labels.float()).sum().item()

    
    # Calculate the average loss per epoch during evaluation
    average_loss = total_loss / len(valid_loader)
    
    # Calculate accuracy
    accuracy = correct_predictions / len(valid_loader.dataset)
    
    # Print evaluation metrics
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {average_loss:.4f}, Validation Accuracy: {accuracy:.4f}')
    
    # Log metrics using WandB
    wandb.log({"Epoch": epoch+1,"Training Loss": Training_Loss, "Validation Loss": average_loss, "Validation Accuracy": accuracy})

Epoch [1/50], Training Loss: 0.6923
Epoch [1/50], Validation Loss: 0.6879, Validation Accuracy: 0.5453
Epoch [2/50], Training Loss: 0.6779
Epoch [2/50], Validation Loss: 0.6744, Validation Accuracy: 0.5598
Epoch [3/50], Training Loss: 0.6092
Epoch [3/50], Validation Loss: 0.5595, Validation Accuracy: 0.7261
Epoch [4/50], Training Loss: 0.5417
Epoch [4/50], Validation Loss: 0.5589, Validation Accuracy: 0.7246
Epoch [5/50], Training Loss: 0.5356
Epoch [5/50], Validation Loss: 0.5322, Validation Accuracy: 0.7512
Epoch [6/50], Training Loss: 0.4911
Epoch [6/50], Validation Loss: 0.5289, Validation Accuracy: 0.7488
Epoch [7/50], Training Loss: 0.4775
Epoch [7/50], Validation Loss: 0.5033, Validation Accuracy: 0.7683
Epoch [8/50], Training Loss: 0.4550
Epoch [8/50], Validation Loss: 0.4992, Validation Accuracy: 0.7733
Epoch [9/50], Training Loss: 0.4436
Epoch [9/50], Validation Loss: 0.4767, Validation Accuracy: 0.7824
Epoch [10/50], Training Loss: 0.4445
Epoch [10/50], Validation Loss: 0.49

Evaluate the model on the test set.

In [18]:
# Set the model to evaluation mode
model.eval()

# Variables for total loss and number of correct predictions
total_loss = 0.0
correct_predictions = 0

# Iterate through the test data
with torch.no_grad():  # Disable gradient computation during evaluation
    for inputs, labels in test_loader:
        inputs = inputs.to(device)  #move data on the GPU
        labels = labels.to(device)
        # Forward pass
        outputs = model(inputs)
        
        # Reshape the model's output
        outputs = outputs.view(-1)  # Change dimensions from [batch_size, 1] to [batch_size]
        
        # Compute the loss
        loss = criterion(outputs, labels.float())  # Convert labels to float
        
        # Update the total loss
        total_loss += loss.item()
        
        # Calculate the number of correct predictions
        threshold = 0.5
        predicted_labels = (torch.sigmoid(outputs) > threshold).float()
        correct_predictions += (predicted_labels == labels.float()).sum().item()
        
# Calculate the average loss for the test set
average_loss = total_loss / len(test_loader)

# Calculate accuracy on the test set
accuracy = correct_predictions / len(test_loader.dataset)

print(f'Test Loss: {average_loss:.4f}, Test Accuracy: {accuracy:.4f}')

# Log metrics using WandB
wandb.log({"Test Loss": average_loss, "Test Accuracy": accuracy})
wandb.finish()

Test Loss: 0.4079, Test Accuracy: 0.8462


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:      Embedding Size ▁
[34m[1mwandb[0m:               Epoch ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
[34m[1mwandb[0m:       Test Accuracy ▁
[34m[1mwandb[0m:           Test Loss ▁
[34m[1mwandb[0m:       Training Loss ██▇▆▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
[34m[1mwandb[0m: Validation Accuracy ▁▁▅▅▆▆▆▇▇▇▇▇▇▇███▇████▇█████████████████
[34m[1mwandb[0m:     Validation Loss ██▅▅▄▄▃▃▃▂▂▃▂▂▂▂▁▂▁▁▂▁▂▁▁▁▁▁▂▁▁▂▁▁▁▁▁▂▂▂
[34m[1mwandb[0m:          batch_size ▁
[34m[1mwandb[0m:              epochs ▁
[34m[1mwandb[0m:                  lr ▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:      Embedding Size 256
[34m[1mwandb[0m:               Epoch 50
[34m[1mwandb[0m:       Test Accuracy 0.84624
[34m[1mwandb[0m:           Test Loss 0.4079
[34m[1mwandb[0m:       Training