In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from datasets import load_dataset, load_from_disk
import os
import sys
import math

In [3]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class IMDBDataset(Dataset):
    def __init__(self, sentences_list, labels_list, tokenizer, max_length=512):
        super(IMDBDataset, self).__init__()

        self.sentences = sentences_list
        self.labels = labels_list
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        sentence = self.sentences[index]
        label = self.labels[index]

        encoding = self.tokenizer(sentence,
                                 truncation=True,
                                 padding='max_length',
                                 max_length=self.max_length,
                                 return_tensors='pt')

        # Calculate the number of tokens in sentence for custom loss
        # Reducing it by 2 because first and last tokens are CLS and SEP
        num_tokens = (encoding['attention_mask'] == 1).sum() - 2
        return {
            "input_ids": encoding['input_ids'].squeeze(0),
            "attention_mask": encoding['attention_mask'].squeeze(0),
            "num_tokens": num_tokens,
            "label": torch.tensor(label)
        }


def get_datasets(sentences_list, labels_list, tokenizer, max_length=512):
    dataset = IMDBDataset(sentences_list, labels_list, tokenizer, max_length)
    return dataset

def get_dataloaders(dataset, batch_size, shuffle=True):
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [4]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model_name = 'distilbert-base-uncased'

def get_tokenizer():
    print(f'Using model {model_name}')
    tokenizer_obj = DistilBertTokenizer.from_pretrained(model_name)
    return tokenizer_obj

def get_pretrained_model(num_labels):
    print(f'Using model {model_name}')
    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    return model


In [5]:
ds = load_dataset("stanfordnlp/imdb")

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Running on {}".format(device))

Running on cuda


### Data Analysis

In [7]:
ds.keys()

dict_keys(['train', 'test', 'unsupervised'])

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [9]:
# huggingface dataset already has train test split of data
# train_size, test_size, val_size = 1000, 100, 100
train_size, test_size, val_size = 25000, 2500, 100

train_df = ds['train'].to_pandas()
train_df = train_df.sample(n=train_size)
all_test_df = ds['test'].to_pandas()

# Create Validation and Test dataframes
test_df = all_test_df.sample(n=test_size)
val_df = all_test_df.sample(n=val_size)

In [10]:
train_df.shape

(25000, 2)

In [11]:
test_df.shape

(2500, 2)

In [12]:
val_df.shape

(100, 2)

In [13]:
train_df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,12500
0,12500


In [14]:
test_df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,1281
1,1219


In [15]:
val_df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,54
1,46


In [16]:
# Looks like the class distribution is good. No need to balance the dataset

In [17]:
train_df.head()

Unnamed: 0,text,label
15743,My name is John Mourby and this is my story ab...,1
14701,Mother Night is one of my favorite novels and ...,1
14391,I can remember this movie from when i was a sm...,1
1211,"She may have an Oscar and a Golden Globe, but ...",0
17440,And a rather Unexpected plot line too-for the ...,1


In [18]:
test_df.head()

Unnamed: 0,text,label
8781,Hollywood had a long love affair with bogus Ar...,0
12457,Why do people make bad movies? Didn't anyone w...,0
14001,Great little ground-breaking movie (in 1955) a...,1
10045,From time to time it's very advisable for the ...,0
11409,"this show is pretty alright and fun to watch, ...",0


In [19]:
val_df.head()

Unnamed: 0,text,label
8213,"This movie clearly has an agenda, which could ...",0
1813,A very carelessly written film. Poor character...,0
13047,"Jim Henson's The Muppet Movie is a charming, f...",1
18919,"I'm not a fan of scratching, but I really dug ...",1
21432,John Carpenter shows how much he loves the 195...,1


In [20]:
def get_sentence_len(row):
    ''' Function to get the length of sentence based on space as delimiter
    Not perfect function but decent for approximation
    '''
    sentence = row['text']
    return len(sentence.split(' '))

In [21]:
train_df['text_len'] = train_df.apply(get_sentence_len, axis=1)

In [22]:
test_df['text_len'] = test_df.apply(get_sentence_len, axis=1)

In [23]:
val_df['text_len'] = val_df.apply(get_sentence_len, axis=1)

In [24]:
train_df.text_len.describe()

Unnamed: 0,text_len
count,25000.0
mean,233.77672
std,173.715418
min,10.0
25%,127.0
50%,174.0
75%,284.0
max,2470.0


In [25]:
test_df.text_len.describe()

Unnamed: 0,text_len
count,2500.0
mean,231.06
std,166.980655
min,8.0
25%,127.0
50%,175.0
75%,286.0
max,1167.0


### Data preprocessing

In [26]:
tokenizer_obj = get_tokenizer()

Using model distilbert-base-uncased


In [27]:
# Initialize pytorch datasets
train_dataset = get_datasets(train_df['text'].to_list(), train_df['label'].to_list(), tokenizer_obj, max_length=512)
test_dataset = get_datasets(test_df['text'].to_list(), test_df['label'].to_list(), tokenizer_obj, max_length=512)
val_dataset = get_datasets(val_df['text'].to_list(), val_df['label'].to_list(), tokenizer_obj, max_length=512)

In [28]:
# Initialize pytorch dataloaders
train_dataloader = get_dataloaders(train_dataset, batch_size=8, shuffle=True)
test_dataloader = get_dataloaders(test_dataset, batch_size=8, shuffle=True)
val_dataloader = get_dataloaders(val_dataset, batch_size=8, shuffle=True)

In [29]:
# check dataloader
data_iter = iter(train_dataloader)
sample = next(data_iter)
print(sample['input_ids'].shape, sample['attention_mask'].shape, sample['label'].shape)

torch.Size([8, 512]) torch.Size([8, 512]) torch.Size([8])


In [30]:
len(train_dataloader), len(test_dataloader), len(val_dataloader)

(3125, 313, 13)

In [31]:
sample

{'input_ids': tensor([[  101,  2304,  4215,  ...,     0,     0,     0],
         [  101,  2034,  2009,  ...,     0,     0,     0],
         [  101,  1037,  2177,  ...,     0,     0,     0],
         ...,
         [  101,  1000,  1996,  ...,     0,     0,     0],
         [  101, 19522,  1006,  ...,     0,     0,     0],
         [  101,  2022,  7420,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'num_tokens': tensor([167,  62, 182, 470, 510, 156, 137, 136]),
 'label': tensor([1, 1, 0, 1, 1, 1, 1, 0])}

### Initialize Hyperparameters

In [32]:
learning_rate = 1e-5
momentum = 0.9
num_epochs = 2
batch_size = 8

In [33]:
num_total_steps = len(train_dataloader)
print(f'Total number of training steps in each epoch: {num_total_steps}')

Total number of training steps in each epoch: 3125


### Training code

In [34]:
class CustomLoss(nn.Module):
    def __init__(self, parameter=None):
        super().__init__()
        self.parameter = parameter

    def forward(self, pred_logits, target, num_tokens):
        loss_fn = nn.BCEWithLogitsLoss()
        pred_logits = pred_logits.view(-1)
        target = target.float()
        loss = loss_fn(pred_logits, target)

        # Simple Custom loss logic
        # Intuition behind is Longer the length of the review(in terms of words/tokens) more information is given by the reviewer
        # Also from the data analysis before, on average a review will have 225+ tokens
        # This loss function penalizes wrong predictions on longer reviews

        if self.parameter is not None:
            loss += torch.sum(self.parameter * (num_tokens)/512)
        return loss

custom_loss = CustomLoss(parameter=0.01)

In [35]:
def train(data_loader, num_epochs, custom_loss_fn=None):
    model = get_pretrained_model(num_labels=1)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    if custom_loss_fn:
        print("Using Custom loss function")
    else:
        print("Using default loss from Distilbert")

    for epoch in range(num_epochs):
      model.train()
      train_loss = 0
      for step_num, input_encoding in enumerate(data_loader):
          input_ids = input_encoding['input_ids']
          attention_masks = input_encoding['attention_mask']
          labels = input_encoding['label']
          num_tokens = input_encoding['num_tokens']

          input_ids = input_ids.to(device)
          attention_masks = attention_masks.to(device)
          labels = labels.to(device)
          num_tokens = num_tokens.to(device)

          preds = model(input_ids, attention_mask=attention_masks, labels=labels.float())

          if custom_loss_fn is not None:
              loss = custom_loss_fn(preds.logits, labels, num_tokens)
          else:
              loss = preds.loss

          train_loss += loss.item()
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

          if (step_num+1)%500 == 0:
              print(f"Epoch: {epoch+1}/{num_epochs}, Step: {step_num+1}/{num_total_steps}, Training Loss so far: {train_loss/((step_num+1)*batch_size)}")

      print(f"Epoch: {epoch+1}/{num_epochs},Total Training Loss: {train_loss/(len(train_dataset))}")

      # Calculate Validation at the end of every epoch
      model.eval()
      val_loss = 0
      with torch.no_grad():
          for step_num, input_encoding in enumerate(val_dataloader):
              input_ids = input_encoding['input_ids']
              attention_masks = input_encoding['attention_mask']
              labels = input_encoding['label']
              num_tokens = input_encoding['num_tokens']

              input_ids = input_ids.to(device)
              attention_masks = attention_masks.to(device)
              labels = labels.to(device)
              num_tokens = num_tokens.to(device)

              preds = model(input_ids, attention_mask=attention_masks, labels=labels.float())

              if custom_loss_fn is not None:
                  val_loss += custom_loss_fn(preds.logits, labels, num_tokens)
              else:
                  val_loss += preds.loss

          print(f"Epoch: {epoch+1}/{num_epochs}, Total Validation Loss: {val_loss.item()/len(val_dataset)}")

    return model


def evaluate(model, data_loader):
    model.eval()
    acc = 0
    with torch.no_grad():
        for step_num, input_encoding in enumerate(test_dataloader):
            input_ids = input_encoding['input_ids']
            attention_masks = input_encoding['attention_mask']
            labels = input_encoding['label']
            num_tokens = input_encoding['num_tokens']

            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device)
            num_tokens = num_tokens.to(device)

            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            preds = torch.argmax(outputs.logits, dim=1)

            acc += (labels == preds).sum().item()

    print(f"Accuracy: {acc/len(test_dataset)}")


In [36]:
# Main driver for training the model

# Default loss by Distilbert
model = train(train_dataloader, num_epochs)
evaluate(model, test_dataloader)

Using model distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using default loss from Distilbert
Epoch: 1/2, Step: 500/3125, Training Loss so far: 0.01618453865277115
Epoch: 1/2, Step: 1000/3125, Training Loss so far: 0.013165959330712212
Epoch: 1/2, Step: 1500/3125, Training Loss so far: 0.011716782977309777
Epoch: 1/2, Step: 2000/3125, Training Loss so far: 0.011008835651598929
Epoch: 1/2, Step: 2500/3125, Training Loss so far: 0.010475481856794795
Epoch: 1/2, Step: 3000/3125, Training Loss so far: 0.010137087587851663
Epoch: 1/2,Total Training Loss: 0.01007238722306909
Epoch: 1/2, Total Validation Loss: 0.0047422066330909726
Epoch: 2/2, Step: 500/3125, Training Loss so far: 0.006032561307125434
Epoch: 2/2, Step: 1000/3125, Training Loss so far: 0.005886572257662919
Epoch: 2/2, Step: 1500/3125, Training Loss so far: 0.005764175744264018
Epoch: 2/2, Step: 2000/3125, Training Loss so far: 0.005684829934060872
Epoch: 2/2, Step: 2500/3125, Training Loss so far: 0.005770949694255978
Epoch: 2/2, Step: 3000/3125, Training Loss so far: 0.00573253157226

In [37]:
# Using custom loss
model = train(train_dataloader, num_epochs, custom_loss)
evaluate(model, test_dataloader)

Using model distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using Custom loss function
Epoch: 1/2, Step: 500/3125, Training Loss so far: 0.0527622605022043
Epoch: 1/2, Step: 1000/3125, Training Loss so far: 0.044707871700637045
Epoch: 1/2, Step: 1500/3125, Training Loss so far: 0.04153955115781476
Epoch: 1/2, Step: 2000/3125, Training Loss so far: 0.03986409278702922
Epoch: 1/2, Step: 2500/3125, Training Loss so far: 0.0385454803770408
Epoch: 1/2, Step: 3000/3125, Training Loss so far: 0.03760046991131579
Epoch: 1/2,Total Training Loss: 0.03725962341696024
Epoch: 1/2, Total Validation Loss: 0.0225205397605896
Epoch: 2/2, Step: 500/3125, Training Loss so far: 0.02427990456856787
Epoch: 2/2, Step: 1000/3125, Training Loss so far: 0.024623101046308875
Epoch: 2/2, Step: 1500/3125, Training Loss so far: 0.024518101801164448
Epoch: 2/2, Step: 2000/3125, Training Loss so far: 0.02372416638210416
Epoch: 2/2, Step: 2500/3125, Training Loss so far: 0.02390312594641
Epoch: 2/2, Step: 3000/3125, Training Loss so far: 0.024065256814161936
Epoch: 2/2,Total T

### Evaluation code