# BERT Fine-Tuning

Reference notebook: https://github.com/rasbt/machine-learning-book/blob/main/ch16/ch16-part3-bert.ipynb

In [1]:
try:
    import transformers
except:
    !pip install transformers==4.9.1 -q
    import transformers

import gzip
import shutil
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

[K     |████████████████████████████████| 2.6 MB 42.4 MB/s 
[K     |████████████████████████████████| 880 kB 54.0 MB/s 
[K     |████████████████████████████████| 3.3 MB 59.9 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [5]:
# Seed PyTorch's random number generator and request deterministic cuDNN kernels.
RANDOM_SEED = 123
torch.backends.cudnn.deterministic = True
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f0bd29ae570>

In [6]:
# Number of passes over the training set.
NUM_EPOCHS = 3
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
# Download the gzipped movie review CSV and unpack it into the working directory.
url = ("https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz")
filename = url.split("/")[-1]

# Fetch first and fail loudly on an HTTP error, so that an error page is never
# written to disk and later handed to gzip. The timeout keeps a stalled
# connection from hanging the cell indefinitely.
r = requests.get(url, timeout=60)
r.raise_for_status()

with open(filename, "wb") as f:
    f.write(r.content)

# filename[:-3] drops the trailing ".gz" to obtain the name of the unpacked CSV.
with gzip.open(filename, "rb") as f_in:
    with open(filename[:-3], "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

In [8]:
# Load the unpacked CSV (the archive name without its 3-character ".gz"
# suffix) and preview the first rows.
csv_filename = filename[:-3]
df = pd.read_csv(csv_filename)

df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [9]:
# Positional split of the review table: rows [0, ind0) are used for training,
# rows [ind0, ind1) for validation and rows [ind1, end) for testing.
ind0 = 35000
ind1 = 40000

train_df = df.iloc[:ind0]
valid_df = df.iloc[ind0:ind1]
test_df = df.iloc[ind1:]

train_texts, train_labels = train_df["review"].values, train_df["sentiment"].values
valid_texts, valid_labels = valid_df["review"].values, valid_df["sentiment"].values
test_texts, test_labels = test_df["review"].values, test_df["sentiment"].values

In [10]:
# Tokenize every split with the pretrained DistilBERT tokenizer, with
# truncation and padding enabled for each call.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
tokenizer_kwargs = {"truncation": True, "padding": True}

train_encodings = tokenizer(list(train_texts), **tokenizer_kwargs)
valid_encodings = tokenizer(list(valid_texts), **tokenizer_kwargs)
test_encodings = tokenizer(list(test_texts), **tokenizer_kwargs)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [20]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under 'labels'."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [21]:
# Wrap each split in a dataset and a loader. Only the training loader
# shuffles; the validation and test loaders keep the original example order.
batch_size = 16

train_dataset = IMDbDataset(train_encodings, train_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = IMDbDataset(valid_encodings, valid_labels)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

test_dataset = IMDbDataset(test_encodings, test_labels)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [22]:
# Load the pretrained DistilBERT checkpoint as a sequence classifier, move it
# to the selected device and put it in training mode.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased").to(DEVICE)
model.train()

# Adam over all model parameters.
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [23]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` on ``data_loader`` in percent.

    The model is switched to evaluation mode for the duration of the call and
    its previous training/eval mode is restored afterwards, so the result does
    not depend on the mode the caller left the model in.

    Args:
        model: A ``torch.nn.Module`` that is called as
            ``model(input_ids, attention_mask=...)`` and whose output exposes
            the class scores under ``outputs['logits']``.
        data_loader: Iterable of batches; each batch maps 'input_ids',
            'attention_mask' and 'labels' to tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor holding the percentage of correctly predicted
        examples. If ``data_loader`` yields no examples, the accuracy is
        undefined and a NaN tensor is returned.
    """
    was_training = model.training
    model.eval()
    correct_pred, num_examples = 0, 0
    try:
        with torch.no_grad():
            for batch in data_loader:
                ### Prepare data
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs['logits']
                # Predicted class = index of the largest logit along dim 1.
                predicted_labels = torch.argmax(logits, 1)

                num_examples += labels.size(0)
                correct_pred += (predicted_labels == labels).sum()
    finally:
        # Restore whatever mode the caller had set, even if a batch failed.
        model.train(was_training)

    if num_examples == 0:
        # No batches: correct_pred is still the int 0, which has no .float().
        return torch.tensor(float('nan'), device=device)

    return correct_pred.float() / num_examples * 100

In [None]:
# Fine-tuning loop: one optimizer step per training batch, a loss printout
# every 250 batches, train/validation accuracy after each epoch, and the
# test accuracy once all epochs are done.
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # Move the batch tensors to the training device.
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # Forward pass; the output exposes both the loss and the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        logits = outputs['logits']

        # Backward pass and parameter update.
        optim.zero_grad()
        loss.backward()
        optim.step()

        # Periodic progress report.
        if batch_idx % 250 == 0:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                  f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                  f'Loss: {loss:.4f}')

    # End-of-epoch evaluation with gradients disabled.
    model.eval()
    with torch.no_grad():
        train_acc = compute_accuracy(model, train_loader, DEVICE)
        valid_acc = compute_accuracy(model, valid_loader, DEVICE)
    print(f'Training accuracy: '
          f'{train_acc:.2f}%'
          f'\nValid accuracy: '
          f'{valid_acc:.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 0001/0003 | Batch 0000/2188 | Loss: 0.7090
Epoch: 0001/0003 | Batch 0250/2188 | Loss: 0.1802
Epoch: 0001/0003 | Batch 0500/2188 | Loss: 0.1789
Epoch: 0001/0003 | Batch 0750/2188 | Loss: 0.2818
Epoch: 0001/0003 | Batch 1000/2188 | Loss: 0.1127
Epoch: 0001/0003 | Batch 1250/2188 | Loss: 0.0964
Epoch: 0001/0003 | Batch 1500/2188 | Loss: 0.3175
Epoch: 0001/0003 | Batch 1750/2188 | Loss: 0.0603
Epoch: 0001/0003 | Batch 2000/2188 | Loss: 0.1449
Training accuracy: 94.76%
Valid accuracy: 90.60%
Time elapsed: 38.48 min
Epoch: 0002/0003 | Batch 0000/2188 | Loss: 0.0192
Epoch: 0002/0003 | Batch 0250/2188 | Loss: 0.0328
Epoch: 0002/0003 | Batch 0500/2188 | Loss: 0.3014
Epoch: 0002/0003 | Batch 0750/2188 | Loss: 0.0745
Epoch: 0002/0003 | Batch 1000/2188 | Loss: 0.0340
Epoch: 0002/0003 | Batch 1250/2188 | Loss: 0.0357
Epoch: 0002/0003 | Batch 1500/2188 | Loss: 0.0955
Epoch: 0002/0003 | Batch 1750/2188 | Loss: 0.0398
Epoch: 0002/0003 | Batch 2000/2188 | Loss: 0.0141
Training accuracy: 98.12%
V