# Fine-tuning a BERT model for IMDb review classification

In [1]:
import gzip
import shutil
import time
import requests

import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
#import torchtext

# from Hugging Face
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
from triton.language import dtype

## Data preparation
### Loading Dataset

The dataset was already downloaded for the earlier Logistic Regression and RNN-based models, so it is read from that project's folder here.

In [2]:
# Location of the IMDb reviews CSV, relative to this notebook.
path = "../supervised-learning/imdb-review-classification/movie_data.csv"

# Read the full table into memory, then preview a handful of random rows.
df = pd.read_csv(filepath_or_buffer=path)
df.sample(n=5)

Unnamed: 0,review,sentiment
1876,"After seeing Forever Hollywood, it would be na...",0
5569,The mod squad gets started 'after' the formati...,0
31978,I thought the movie was actually pretty good. ...,1
46975,This is a real eye candy. A world made of floa...,1
21490,"Let me get this straight:<br /><br />""Hotshot ...",0


A `sentiment` value of 1 marks a positive review and 0 marks a negative one.  

The dataset is balanced:

In [3]:
# Report the dataset size and the per-class counts.
# The counts come from summing boolean masks, which avoids materialising two
# filtered DataFrames. The second f-string is delimited with double quotes so
# the single-quoted column name inside it parses on Python versions before
# 3.12 (reusing the outer quote inside an f-string is only valid from 3.12).
print(f'Length of dataset: {len(df)}')
print(f"Number of positive and negative reviews: {(df['sentiment'] == 1).sum()}, {(df['sentiment'] == 0).sum()}")

Length of dataset: 50000
Number of positive and negative reviews: 25000, 25000


### Splitting Dataset: Train, Validation and Test subsets

We will use 70% for training, 10% for validation and 20% for testing.

In [4]:
# Step 1: hold out 20% of all rows as the test set.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1310)

# Step 2: the validation set should be 10% of the *full* dataset, so express
# that row count as a fraction of what is left after the test split.
valid_size = len(df) * 0.1
valid_frac_in_train_df = valid_size / len(train_df)
train_df, valid_df = train_test_split(
    train_df,
    test_size=valid_frac_in_train_df,
    random_state=1310,
)

# One line per subset.
print(
    f'Train size: {len(train_df)}',
    f'Valid size: {len(valid_df)}',
    f'Test size: {len(test_df)}',
    sep='\n',
)

Train size: 35000
Valid size: 5000
Test size: 10000


## Tokenizing the dataset

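We will split the review texts into tokens using the tokenizer that ships with the pre-trained model (for DistilBERT this is a WordPiece tokenizer, so rare words are broken into sub-word pieces). Each subset is padded to a common length and reviews that exceed the model's maximum input length are truncated.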

In [5]:
# Tokenizer matching the pre-trained checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode the review texts of each subset with identical settings
# (padding enabled, truncation enabled), in train / valid / test order.
train_encodings, valid_encodings, test_encodings = (
    tokenizer(subset['review'].values.tolist(), padding=True, truncation=True)
    for subset in (train_df, valid_df, test_df)
)

In [6]:
# Display the encoding object of the first training review as a sanity check.
train_encodings[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

### Dataset class and DataLoader

In [9]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    Args:
        encodings: Mapping-like object exposing ``.items()`` whose values can
            be indexed by example position (one entry per example).
        labels: Indexable sequence holding one integer class label per example.
    """

    def __init__(self, encodings, labels):
        super().__init__()
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of tensors.

        The dict holds one tensor per key of ``encodings`` plus the class
        label under the ``'labels'`` key.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Class-index targets must be int64 (torch.long): PyTorch's
        # cross-entropy loss raises a dtype error for int32 targets.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

In [10]:
# Wrap each subset's encodings together with its sentiment labels.
train_dataset, valid_dataset, test_dataset = (
    IMDbDataset(encodings, frame['sentiment'].values)
    for encodings, frame in (
        (train_encodings, train_df),
        (valid_encodings, valid_df),
        (test_encodings, test_df),
    )
)

# Batches of 16 examples; only the training loader shuffles, while validation
# and test keep the dataset order.
train_dataloader, valid_dataloader, test_dataloader = (
    torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=reshuffle)
    for dataset, reshuffle in (
        (train_dataset, True),
        (valid_dataset, False),
        (test_dataset, False),
    )
)

## Fine-tuning pre-trained BERT LLM
### General settings

In [8]:
# Reproducibility: fix torch's global seed and set the cuDNN deterministic flag.
torch.manual_seed(1310)
torch.backends.cudnn.deterministic = True

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Number of epochs for fine-tuning.
num_epochs = 3

### Loading the BERT model

The downstream task we want to fine-tune the BERT model on is **sequence classification**.  

`'distilbert-base-uncased'` is a streamlined, lightweight and uncased version of the BERT base model. It offers a smaller size while maintaining strong performance, making it more computationally efficient for tasks without sacrificing much accuracy.

In [12]:
# Load the pre-trained checkpoint with a sequence-classification head and move
# it to the selected device in one step (`.to()` returns the module itself).
bert_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
).to(device)

# Show the module tree.
bert_model

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 