# Fine-tuning a BERT model for IMDb review classification

In [1]:
import time

import pandas as pd
from sklearn.model_selection import train_test_split

import torch

# from Hugging Face
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

## Data preparation
### Loading Dataset

The dataset was already downloaded for the earlier Logistic Regression and RNN-based models, so it is loaded here from the local CSV file.

In [2]:
path = "../supervised-learning/imdb-review-classification/movie_data.csv"

# Read the reviews and keep a reproducible random half of them (fixed seed),
# renumbering the rows from 0 afterwards.
df = (
    pd.read_csv(path)
    .sample(frac=0.5, random_state=1)
    .reset_index(drop=True)
)

# Show a few random rows as a quick sanity check.
df.sample(5)

Unnamed: 0,review,sentiment
13574,Because of the 1988 Writers Guild of America s...,0
19604,Aside from a few good moments of fairly raw vi...,0
22161,I should no longer be surprised when critics m...,1
1703,Prepare to meet your Messiah - they call him M...,1
376,Greetings again from the darkness. Based on th...,1


A `sentiment` value of 1 marks a positive review and 0 a negative one.  

The dataset is balanced:

In [3]:
# Count the reviews of each class up front. Besides being easier to read, this
# avoids re-using single quotes inside a single-quoted f-string, which is a
# SyntaxError on Python versions older than 3.12.
num_positive = (df['sentiment'] == 1).sum()
num_negative = (df['sentiment'] == 0).sum()

print(f'Length of dataset: {df.shape[0]}')
print(f'Number of positive and negative reviews: {num_positive}, {num_negative}')

Length of dataset: 25000
Number of positive and negative reviews: 12471, 12529


### Splitting Dataset: Train, Validation and Test subsets

We will use 70% for training, 10% for validation and 20% for testing.

In [4]:
# Step 1: hold out 20% of the full dataset for testing.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1310)

# Step 2: the validation set must be 10% of the *full* dataset, so its size is
# converted into a fraction of the rows that remain after step 1.
valid_size = 0.1 * len(df)
valid_frac_in_train_df = valid_size / len(train_df)
train_df, valid_df = train_test_split(
    train_df,
    test_size=valid_frac_in_train_df,
    random_state=1310,
)

print(f'Train size: {len(train_df)}')
print(f'Valid size: {len(valid_df)}')
print(f'Test size: {len(test_df)}')

Train size: 17500
Valid size: 2500
Test size: 5000


## Tokenizing the dataset

We will tokenize the texts into sub-word (WordPiece) tokens using the tokenizer that ships with the pre-trained model, so the inputs match the vocabulary the model was pre-trained on.

In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def _encode_reviews(frame):
    """Tokenize the 'review' column of ``frame`` with padding and truncation enabled."""
    review_texts = frame['review'].values.tolist()
    return tokenizer(review_texts, padding=True, truncation=True)


# Every split goes through exactly the same tokenizer settings.
train_encodings = _encode_reviews(train_df)
valid_encodings = _encode_reviews(valid_df)
test_encodings = _encode_reviews(test_df)

In [6]:
# Inspect the encoding of the first training review; the displayed object lists
# the available attributes (ids, attention_mask, ...) and its number of tokens.
train_encodings[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

### Dataset class and DataLoader

In [7]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with sentiment labels.

    Parameters
    ----------
    encodings : mapping
        Maps each feature name (e.g. ``'input_ids'``, ``'attention_mask'``) to
        a sequence holding one entry per example.
    labels : sequence
        Integer class label of every example, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        super().__init__()
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features and the label of example ``idx`` as a dict of tensors."""
        # one tensor per encoding field (input tokens, attention mask, ...)
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Class indices are emitted as int64 (torch.long): this is the dtype the
        # cross-entropy loss expects for its targets, so consumers of the
        # dataset do not need to cast the labels themselves.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

In [8]:
batch_size = 8

# Wrap every split (encodings + sentiment labels) in a torch Dataset.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_df['sentiment'].values)
valid_dataset = IMDbDataset(encodings=valid_encodings, labels=valid_df['sentiment'].values)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_df['sentiment'].values)

# Only the training batches are shuffled; validation and test keep their order.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False
)

## Fine-tuning pre-trained BERT model
### General settings

In [9]:
# Reproducibility: fix the torch seed and ask cuDNN for deterministic kernels.
torch.manual_seed(1310)
torch.backends.cudnn.deterministic = True

# Train on the first GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

num_epochs = 3

### Loading the BERT model

The downstream task we want to fine-tune the BERT model on is **sequence classification**.  

`'distilbert-base-uncased'` is a distilled, lightweight, uncased version of the BERT base model. It is smaller and faster than BERT while retaining most of its accuracy, which makes fine-tuning considerably cheaper computationally.

In [10]:
# Load the pre-trained DistilBERT checkpoint with a sequence-classification
# head and move it to the selected device in one go.
bert_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
).to(device)

# Display the architecture.
bert_model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### Defining accuracy function and optimizer

In [11]:
def accuracy(model, dataloader, device):
    """Compute the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to evaluation mode for the duration of the call and
    its previous training/evaluation mode is restored afterwards, so the result
    does not depend on the mode the caller left the model in.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask=...)``; its output must
        expose the per-class scores under the ``'logits'`` key.
    dataloader : iterable
        Yields batches as dicts holding the ``'input_ids'``,
        ``'attention_mask'`` and ``'labels'`` tensors.
    device : torch.device or str
        Device every batch is moved to before the forward pass.

    Returns
    -------
    torch.Tensor
        0-dim float tensor with the accuracy as a percentage (0 to 100).

    Raises
    ------
    ValueError
        If ``dataloader`` yields no examples.
    """
    was_training = model.training
    model.eval()

    # Tensor accumulator living on ``device``: avoids a host sync per batch and
    # keeps the return type a tensor.
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0

    try:
        with torch.no_grad():
            # compute accuracy by batches for RAM or VRAM limitations
            for batch in dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs['logits']

                # predicted class = index of the largest logit
                pred_labels = torch.argmax(logits, 1)
                num_examples += labels.size(0)
                correct_pred += (pred_labels == labels).sum()
    finally:
        # hand the model back in the mode it was received in
        model.train(was_training)

    if num_examples == 0:
        raise ValueError('Cannot compute accuracy: the dataloader yielded no examples.')

    return correct_pred.float() / num_examples * 100

In [12]:
# AdamW over every parameter of the model, i.e. the whole network is fine-tuned.
learning_rate = 5e-5
optimizer = torch.optim.AdamW(params=bert_model.parameters(), lr=learning_rate)

### Training

In [13]:
start_time = time.time()

for epoch in range(num_epochs):
    # Training phase: put the model in training mode (enables dropout).
    bert_model.train()
    for batch_idx, batch in enumerate(train_dataloader):
        # move the batch to the training device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # the loss expects int64 class indices
        labels = batch['labels'].to(device).long()

        # reset gradients for each step
        optimizer.zero_grad()

        # 1. Forward: passing the labels makes the model return the loss
        outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']

        # 2. Backward and take step
        loss.backward()
        optimizer.step()

        # Monitoring
        if batch_idx % 250 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs} .. '
                  f'Batch {batch_idx}/{len(train_dataloader)} .. '
                  f'Loss: {loss.item():.4f}')

    # Evaluation phase: disable dropout before measuring accuracy.
    # `accuracy` already runs under torch.no_grad(), so no extra wrapper is needed.
    bert_model.eval()
    print(f'Training accuracy: {accuracy(bert_model, train_dataloader, device):.2f}%\n'
          f'Validation accuracy: {accuracy(bert_model, valid_dataloader, device):.2f}%')

    # cumulative wall-clock time since the start of training
    print(f'Time taken: {(time.time() - start_time)/60:.2f} minutes')

print(f'Total time taken: {(time.time() - start_time)/60:.2f} minutes')

Epoch 1/3 .. Batch 0/2188 .. Loss: 0.6782
Epoch 1/3 .. Batch 250/2188 .. Loss: 0.2148
Epoch 1/3 .. Batch 500/2188 .. Loss: 0.1224
Epoch 1/3 .. Batch 750/2188 .. Loss: 0.6774
Epoch 1/3 .. Batch 1000/2188 .. Loss: 0.1526
Epoch 1/3 .. Batch 1250/2188 .. Loss: 0.2069
Epoch 1/3 .. Batch 1500/2188 .. Loss: 0.1844
Epoch 1/3 .. Batch 1750/2188 .. Loss: 0.4709
Epoch 1/3 .. Batch 2000/2188 .. Loss: 0.0547
Training accuracy: 96.23%
Validation accuracy: 91.60%
Time taken: 63.75 minutes
Epoch 2/3 .. Batch 0/2188 .. Loss: 0.0077
Epoch 2/3 .. Batch 250/2188 .. Loss: 0.0832
Epoch 2/3 .. Batch 500/2188 .. Loss: 0.0517
Epoch 2/3 .. Batch 750/2188 .. Loss: 0.0131
Epoch 2/3 .. Batch 1000/2188 .. Loss: 0.1785
Epoch 2/3 .. Batch 1250/2188 .. Loss: 0.0117
Epoch 2/3 .. Batch 1500/2188 .. Loss: 0.0265
Epoch 2/3 .. Batch 1750/2188 .. Loss: 0.0650
Epoch 2/3 .. Batch 2000/2188 .. Loss: 0.0398
Training accuracy: 97.47%
Validation accuracy: 90.52%
Time taken: 128.29 minutes
Epoch 3/3 .. Batch 0/2188 .. Loss: 0.1813

In [16]:
model_save_path = "./finetuned_imdb_bert"

# Store the fine-tuned model and its tokenizer side by side, so that both can
# later be reloaded from the same directory.
for artifact in (bert_model, tokenizer):
    artifact.save_pretrained(model_save_path)

print(f'Model and tokenizer saved to {model_save_path}')

Model and tokenizer saved to ./finetuned_imdb_bert


### Evaluating

In [17]:
# Explicitly switch to evaluation mode (disables dropout) instead of relying on
# the mode the training cell happened to leave the model in.
bert_model.eval()
print(f'Test accuracy: {accuracy(bert_model, test_dataloader, device):.2f}%')

Test accuracy: 91.28%


The fine-tuned DistilBERT model achieved a test accuracy of 91.28%, in line with the validation accuracy of roughly 91% logged during training (91.60% after the first epoch, 90.52% after the second), demonstrating strong sentiment classification performance. While accuracy could likely improve with training on the full dataset, this approach prioritized efficiency, as training on the complete data would have been more computationally expensive.