# Hyperparameter tuning of DistillBERT with Classification Head

This notebook aims to improve on the limitation of the paper,
https://doi.org/10.48550/arXiv.1905.05583, where the authors did not conduct hyperparameter tuning for their neural networks. For our project, we attempt to choose values for TRAIN_BATCH_SIZE and LEARNING_RATE variables used in 'DistillBERT_finetuning_2_target.ipynb' notebook, since learning rate affects convergence while training batch size affects generalisation of model. We used a subset of the training data, and utilized Bayesian Optimisation.

In [1]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [2]:
# Import libraries
from google.colab import drive
import numpy as np
import datetime
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader, SequentialSampler
from transformers import DistilBertModel, DistilBertTokenizer, AdamW
import re

# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Import data and extract mini-set (500 rows) for hyperparametertuning. train_split was created with shuffling so no need shuffle again.
drive.mount('/content/drive')
train_split = pd.read_csv('/content/drive/MyDrive/train_split.csv')

Mounted at /content/drive


In [8]:
# Extracting subset of train data
mini_set = train_split[0:500]
val_set = train_split[500:551]

# Convert to binary classification
def good_bad(row):
  if row < 5:
    return 0
  else:
    return 1

mini_set['rating'] = mini_set['rating'].apply(good_bad)
val_set['rating'] = val_set['rating'].apply(good_bad)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_set['rating'] = mini_set['rating'].apply(good_bad)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_set['rating'] = val_set['rating'].apply(good_bad)


In [22]:
from bayes_opt import BayesianOptimization

# Model and custom dataset classes similar to main notebook
# Define Custom Dataset
class CustomDataset(Dataset):
    ''' Custom dataset class defined to create '''

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.Content = dataframe.Content.to_numpy()
        self.targets = dataframe.rating.to_numpy()
        self.max_len = max_len

    # __len__ and __getitem__ methods to create map-style dataset to be interfaced by torch DataLoader method
    def __len__(self):
        return len(self.Content)

    def __getitem__(self, index):
        # Data preprocessing code to convert to lower-cased, remove trailing whitespace, html tags and urls
        Content = str(self.Content[index]).lower()
        Content = re.sub(r'<[^>]+>', '', Content)
        Content = re.sub(r'https://\S+|www\.\S+', '', Content)
        Content = re.sub(r'br\s', '', Content)
        Content = " ".join(Content.split())

        rating = self.targets[index]

        # Tokenisation of text
        inputs = self.tokenizer.encode_plus(
            Content,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            pad_to_max_length=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(rating, dtype=torch.int)
        }

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 2)

    # Note: DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tensor of shape (batch_size, sequence_length, hidden_size=768)
    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

# Function to calcuate the accuracy of the model
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

# Training Parameters
MAX_LEN = 512
EPOCHS = 5
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create validation set (fixed for all experiments)
test_params = {'batch_size': 1,
                'shuffle': False,
                'sampler': SequentialSampler(val_set),
                'num_workers': 0
                }
val_data = CustomDataset(val_set, tokenizer, MAX_LEN)
testing_loader = DataLoader(val_data, **test_params)


def train(lr_exponent_val, train_batch):
    # Ensure train_batch and lr_exponent_val are discrete
    train_batch = int(train_batch)
    lr_exponent_val = int(lr_exponent_val)
    lr = 1*10**-(lr_exponent_val)

    # Create Dataset and Dataloader
    paramtune_set = CustomDataset(mini_set, tokenizer, MAX_LEN)
    train_params = {'batch_size': train_batch,
                    'shuffle': True,
                    'num_workers': 0
                    }
    paramtune_loader = DataLoader(paramtune_set, **train_params)

    # Initialize model
    model = DistillBERTClass()
    model.to(device)

    # Creating the loss function and optimizer
    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)

    # Training loop over mini_set
    for epoch in range(EPOCHS):
        model.train()
        for _,data in enumerate(paramtune_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)

            outputs = model(ids, mask)
            loss = loss_function(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluating model accuracy over test set
    model.eval()
    n_correct,nb_val_examples = 0,0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask)
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)
            nb_val_examples+=targets.size(0)

    run_accu = (n_correct*100)/nb_val_examples

    return run_accu

# Parameters to tune (learning rate and train batch size)
pbounds = {
    'lr_exponent_val': (3, 7),
    'train_batch': (2,9)
    }

optimizer = BayesianOptimization(
    f=train,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)

# Run Bayesian Optimisation Algorithm
optimizer.maximize(init_points=15, n_iter=15)

|   iter    |  target   | lr_exp... | train_... |
-------------------------------------------------




| [0m1        [0m | [0m86.27    [0m | [0m4.668    [0m | [0m7.042    [0m |
| [0m2        [0m | [0m37.25    [0m | [0m3.0      [0m | [0m4.116    [0m |
| [0m3        [0m | [0m37.25    [0m | [0m3.587    [0m | [0m2.646    [0m |
| [0m4        [0m | [0m37.25    [0m | [0m3.745    [0m | [0m4.419    [0m |
| [0m5        [0m | [0m74.51    [0m | [0m4.587    [0m | [0m5.772    [0m |
| [95m6        [0m | [95m88.24    [0m | [95m4.677    [0m | [95m6.797    [0m |
| [0m7        [0m | [0m37.25    [0m | [0m3.818    [0m | [0m8.147    [0m |
| [0m8        [0m | [0m37.25    [0m | [0m3.11     [0m | [0m6.693    [0m |
| [95m9        [0m | [95m92.16    [0m | [95m4.669    [0m | [95m5.911    [0m |
| [0m10       [0m | [0m37.25    [0m | [0m3.562    [0m | [0m3.387    [0m |
| [0m11       [0m | [0m74.51    [0m | [0m6.203    [0m | [0m8.778    [0m |
| [0m12       [0m | [0m80.39    [0m | [0m4.254    [0m | [0m6.846    [0m |
| [0m13



| [0m16       [0m | [0m88.24    [0m | [0m5.241    [0m | [0m6.152    [0m |




| [95m17       [0m | [95m96.08    [0m | [95m5.516    [0m | [95m7.357    [0m |




| [0m18       [0m | [0m92.16    [0m | [0m5.361    [0m | [0m8.045    [0m |




| [0m19       [0m | [0m54.9     [0m | [0m6.337    [0m | [0m6.891    [0m |




| [0m20       [0m | [0m94.12    [0m | [0m5.249    [0m | [0m8.98     [0m |




| [0m21       [0m | [0m50.98    [0m | [0m7.0      [0m | [0m2.0      [0m |




KeyboardInterrupt: 

From the results above, we can see that evaluation accuracy is above 90% for trials with learning rate 1e-6 (i.e. lr_exponent_val = 6). For this context, batch size seems to affect accuracy insignificantly.