# BERT Finetuning with Transformers

## Understanding the Basics

In [8]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 3.5MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 71.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 35.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.38-cp36-none-any.whl size=884629 sha256=54f6d8b9a27f2adc

In [9]:
import torch
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

In [0]:
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [0]:
sentence = 'he likes to play'
# Step 1: Tokenize
tokens = tokenizer.tokenize(sentence)
# Step 2: Add [CLS] and [SEP]
tokens = ['[CLS]'] + tokens + ['[SEP]']
# Step 3: Pad tokens
padded_tokens = tokens + ['[PAD]' for _ in range(20 - len(tokens))]
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
# Step 4: Segment ids
seg_ids = [0 for _ in range(len(padded_tokens))] #Optional!
# Step 5: Get BERT vocabulary index for each token
token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)

In [0]:
# Convert to pytorch tensors
token_ids = torch.tensor(token_ids).unsqueeze(0)
attn_mask = torch.tensor(attn_mask).unsqueeze(0)
seg_ids = torch.tensor(seg_ids).unsqueeze(0)

# Feed them to bert
hidden_reps, cls_head = bert_model(token_ids, attention_mask = attn_mask,\
                                  token_type_ids = seg_ids)
print(hidden_reps.shape)
print(cls_head.shape)

In [0]:
# Checking that [CLS] representation from hidden_repr is not equal to [CLS] representation from cls_head
torch.all(hidden_reps[0][0].eq(cls_head[0]))

## Dataset Class and Data Loaders

In [4]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9681 sha256=afe72cb3ceea1208fb54eb190499b1b402871a280d49857b34d91d171cff0ffd
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [10]:
import wget
import os

print('Downloading dataset...')

# The URL for the dataset zip file.
url = 'https://raw.githubusercontent.com/theneuralbeing/bert-finetuning-webinar/master/data.zip'

# Download the file and unzip it (if we haven't already)
if not os.path.exists('./data.zip'):
    wget.download(url, './data.zip')
    !unzip -q data.zip

Downloading dataset...


In [0]:
from torch.utils.data import Dataset, DataLoader

In [0]:
class LoadDataset(Dataset):

    def __init__(self, filename, maxlen):

        # Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter=',')

        # Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Define the Maxlength for padding/truncating
        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        # Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'review']
        label = self.df.loc[index, 'sentiment']

        # Tokenize the sentence
        tokens = self.tokenizer.tokenize(sentence)

        # Inserting the CLS and SEP token at the beginning and end of the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        
        # Padding/truncating the sentences to the maximum length
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]']
        
        # Convert the sequence to ids with BERT Vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        
        # Converting the list to a pytorch tensor
        tokens_ids_tensor = torch.tensor(tokens_ids)

        # Obtaining the attention mask
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, label

In [0]:
# Creating instances of training and validation set
train_set = LoadDataset(filename = 'train.csv', maxlen = 64)
val_set = LoadDataset(filename = 'validation.csv', maxlen = 64)

In [0]:
# Creating intsances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size = 32, num_workers = 5)
val_loader = DataLoader(val_set, batch_size = 32, num_workers = 5)

## Building the Model

In [0]:
from torch import nn

In [0]:
class SentimentClassifier(nn.Module):

    def __init__(self, freeze_bert = True):
        super(SentimentClassifier, self).__init__()

        # Instantiating the BERT model object 
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        # Defining layers like dropout and linear
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        # Getting contextualized representations from BERT Layer
        cont_reps, _ = self.bert_layer(seq, attention_mask = attn_masks)

        # Obtaining the representation of [CLS] head
        cls_rep = cont_reps[:, 0]

        # Feeding cls_rep to the classifier layer
        logits = self.classifier(cls_rep)

        return logits

In [0]:
model = SentimentClassifier()

## Training

In [0]:
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss

criterion = BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr = 2e-5)

In [24]:
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

print(device)

cuda


In [0]:
# Defining a function for calculating accuracy
def logits_accuracy(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    preds = (probs > 0.5).long()
    acc = (preds.squeeze() == labels).float().mean()
    return acc

In [0]:
# Defining an evaluation function for training 
def evaluate(net, criterion, val_loader, device):
  
    losses, accuracies = 0, 0
    
    # Setting model to evaluation mode
    net.eval()

    count = 0
    for (seq, attn_masks, labels) in val_loader:
        count += 1

        # Move inputs and targets to device
        seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

        # Get logit predictions
        val_logits = net(seq, attn_masks)

        # Calculate loss
        val_loss = criterion(val_logits.squeeze(-1), labels.float())
        losses += val_loss.item()

        # Calculate validation accuracy
        accuracies += logits_accuracy(val_logits, labels)

    return losses / count, accuracies / count

In [0]:
from time import time

In [0]:
def train(net, criterion, optimizer, train_loader, val_loader, device, epochs=4, print_every=100):
    
    # Move model to device
    net.to(device)
    # Setting model to training mode
    net.train()

    print('========== ========== STARTING TRAINING ========== ==========')

    for epoch in range(epochs):

        print('\n\n========== EPOCH {} =========='.format(epoch))        
        t1 = time()

        for i, (seq, attn_masks, labels) in enumerate(train_loader):

            # Clear gradients
            optimizer.zero_grad()  

            # Moving tensors to device
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            # Obtaining the logits from the model
            logits = net(seq,attn_masks)

            # Calculating the loss
            loss = criterion(logits.squeeze(-1), labels.float())

            # Backpropagating the gradients
            loss.backward()

            # Clipping gradients to tackle exploding gradients
            nn.utils.clip_grad_norm_(net.parameters(), 1)

            # Optimization step
            optimizer.step()

            if (i + 1) % print_every == 0:
                print("Iteration {} ==== Loss: {}".format(i+1, loss.item()))

        t2 = time()
        print('Time Taken for Epoch: {}'.format(t2-t1))
        print('\n========== Validating ==========')
        mean_val_loss, mean_val_acc = evaluate(net, criterion, val_loader, device)
        print("Validation Loss: {}\nValidation Accuracy: {}".format(mean_val_loss, mean_val_acc))


In [30]:
%%time
# starting training
train(model, criterion, optimizer, train_loader, val_loader, device, print_every=100)



Iteration 100 ==== Loss: 0.4057803750038147
Iteration 200 ==== Loss: 0.6811727285385132
Iteration 300 ==== Loss: 0.39617377519607544
Iteration 400 ==== Loss: 0.3085844814777374
Iteration 500 ==== Loss: 0.3079783320426941
Iteration 600 ==== Loss: 0.3183799684047699
Iteration 700 ==== Loss: 0.4045353829860687
Time Taken for Epoch: 212.97628712654114

Validation Loss: 0.37708764637598907
Validation Accuracy: 0.8322410583496094


Iteration 100 ==== Loss: 0.1286257952451706
Iteration 200 ==== Loss: 0.5091476440429688
Iteration 300 ==== Loss: 0.25829842686653137
Iteration 400 ==== Loss: 0.11709516495466232
Iteration 500 ==== Loss: 0.14344905316829681
Iteration 600 ==== Loss: 0.09374682605266571
Iteration 700 ==== Loss: 0.2250899374485016
Time Taken for Epoch: 209.75870776176453

Validation Loss: 0.4195776296507977
Validation Accuracy: 0.8349184989929199


Iteration 100 ==== Loss: 0.005587403196841478
Iteration 200 ==== Loss: 0.19125589728355408
Iteration 300 ==== Loss: 0.008362794294953346

In [0]:
# Saving our model
import os

save_path = 'checkpoints'

if not os.path.isdir(save_path):
    os.mkdir(save_path)

torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
}, os.path.join(save_path,'model.pth'))

In [42]:
ls checkpoints/

model.pth
