In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from datasets import load_dataset, load_from_disk
import os
import sys
import math

In [3]:
sys.path.append("../src")

In [4]:
from data_preprocessing import get_datasets, get_dataloaders
from model import get_tokenizer, get_pretrained_model

In [5]:
data_directory = '../data/'
ds = None

if not os.path.exists(data_directory):
    ds = load_dataset("stanfordnlp/imdb")
    ds.save_to_disk(data_directory)
else:
    ds = load_from_disk(data_directory)

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Running on {}".format(device))

Running on cpu


### Data Analysis

In [7]:
ds.keys()

dict_keys(['train', 'test', 'unsupervised'])

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [9]:
# huggingface dataset already has train test split of data 

train_df = ds['train'].to_pandas()
test_df = ds['test'].to_pandas()

In [10]:
train_df.shape

(25000, 2)

In [11]:
test_df.shape

(25000, 2)

In [12]:
train_df.label.value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

In [13]:
test_df.label.value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

In [14]:
# Looks like the class distribution is good. No need to balance the dataset

In [15]:
train_df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [16]:
test_df.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [17]:
def get_sentence_len(row):
    ''' Function to get the length of sentence based on space as delimiter
    Not perfect function but decent for approximation
    '''
    sentence = row['text']
    return len(sentence.split(' '))

In [18]:
train_df['text_len'] = train_df.apply(get_sentence_len, axis=1)

In [19]:
test_df['text_len'] = test_df.apply(get_sentence_len, axis=1)

In [20]:
train_df.text_len.describe()

count    25000.000000
mean       233.776720
std        173.715418
min         10.000000
25%        127.000000
50%        174.000000
75%        284.000000
max       2470.000000
Name: text_len, dtype: float64

In [21]:
test_df.text_len.describe()

count    25000.000000
mean       228.515160
std        168.866127
min          4.000000
25%        126.000000
50%        172.000000
75%        277.000000
max       2278.000000
Name: text_len, dtype: float64

### Data preprocessing

In [22]:
tokenizer_obj = get_tokenizer()

Using model distilbert-base-uncased


In [23]:
# Initialize pytorch datasets
train_dataset = get_datasets(train_df['text'].to_list(), train_df['label'].to_list(), tokenizer_obj, max_length=512)
test_dataset = get_datasets(test_df['text'].to_list(), test_df['label'].to_list(), tokenizer_obj, max_length=512)

In [24]:
# Initialize pytorch dataloaders
train_dataloader = get_dataloaders(train_dataset, batch_size=8, shuffle=True)
test_dataloader = get_dataloaders(test_dataset, batch_size=8, shuffle=True)

In [25]:
# check dataloader 
data_iter = iter(train_dataloader)
sample = next(data_iter)
print(sample['input_ids'].shape, sample['attention_mask'].shape, sample['label'].shape)

torch.Size([8, 512]) torch.Size([8, 512]) torch.Size([8])


In [26]:
sample

{'input_ids': tensor([[  101,  2023,  2003,  ...,     0,     0,     0],
         [  101,  2023,  3185,  ...,     0,     0,     0],
         [  101,  1996,  2034,  ...,  2021,  2012,   102],
         ...,
         [  101, 10889,  2204,  ...,     0,     0,     0],
         [  101, 21877, 28210,  ...,     0,     0,     0],
         [  101,  2023,  2003,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'num_tokens': tensor([ 80,  74, 510, 368, 510, 211, 161, 248]),
 'label': tensor([1, 1, 0, 1, 1, 1, 0, 0])}

### Initialize model

In [27]:
model = get_pretrained_model(num_labels=1)

Using model distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


### Initialize Hyperparameters 

In [29]:
learning_rate = 1e-5
momentum = 0.9
num_epochs = 2
batch_size = 8

In [30]:
num_total_steps = math.ceil(len(train_dataloader) // batch_size)
print(f'Total number of training steps in each epoch: {num_total_steps}')

Total number of training steps in each epoch: 390


### Training code

In [37]:
# def custom_loss(outputs, labels):
#     # outputs: raw logits from the model (shape: [batch_size, 1])
#     # labels: binary labels (shape: [batch_size], values: 0 or 1)
#     loss_fn = nn.BCEWithLogitsLoss()
#     outputs = outputs.view(-1)  # ensure shape [batch_size]
#     labels = labels.float()     # convert labels to float
#     loss = loss_fn(outputs, labels)
#     return loss


class CustomLoss(nn.Module):
    def __init__(self, parameter=None):
        super().__init__()
        self.parameter = parameter # Example of a parameter

    def forward(self, pred_logits, target, num_tokens):
        # Implement your custom loss logic here
        loss_fn = nn.BCEWithLogitsLoss()
        pred_logits = pred_logits.view(-1)
        target = target.float()
        loss = loss_fn(pred_logits, target)
        
        if self.parameter is not None:
            loss += torch.sum(self.parameter * (num_tokens)/512)
        return loss

custom_loss = CustomLoss(parameter=0.01)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [40]:
model.train()
for epoch in range(num_epochs):
    for step_num, input_encoding in enumerate(train_dataloader):
        input_ids = input_encoding['input_ids']
        attention_masks = input_encoding['attention_mask']
        labels = input_encoding['label']
        num_tokens = input_encoding['num_tokens']
        
        preds = model(input_ids, attention_mask=attention_masks, labels=labels)
        print(preds.loss)
        loss = custom_loss(preds.logits, labels, num_tokens)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step_num%10 == 0:
            print(f"Epoch: {epoch}/{num_epochs}, Step: {step_num}/{num_total_steps}, Training Loss: {loss.item()}")
            break
            

tensor(0.4009, grad_fn=<MseLossBackward0>)
Epoch: 0/2, Step: 0/390, Training Loss: 0.7412158250808716
tensor(0.3851, grad_fn=<MseLossBackward0>)
Epoch: 1/2, Step: 0/390, Training Loss: 0.7178337574005127


### Evaluation code

In [60]:
with torch.no_grad():
    for step_num, input_encoding in enumerate(test_dataloader):
        input_ids = input_encoding['input_ids']
        attention_masks = input_encoding['attention_mask']
        labels = input_encoding['label']

        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        preds = torch.argmax(outputs.logits, dim=1)

        print(labels)
        print(preds)
        print((labels == preds).sum())

        break

tensor([1, 0, 0, 0, 1, 0, 0, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 0])
tensor(2)
