* changed by nov05 on 2024-11-28  
* Udacity AWS MLE Nanodegree (ND189)  
  Course 4, 3.15 Excercise: Fine-Tuning BERT    
* local env `conda activate cuda_py310` with cuda enabled  
* [CoLA dataset on KaggleHub](https://www.kaggle.com/datasets/krazy47/cola-the-corpus-of-linguistic-acceptability)  
  The Corpus of Linguistic Acceptability   

In [4]:
## use the repo root folder as working directory
## training data is in data\
%cd ..
%pwd

d:\github\udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter


'd:\\github\\udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter'

## Solution: Fine-tune BERT model

In [7]:
import os
# import sys
# import json
from tqdm import tqdm
import wandb

import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
# import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer # type: ignore
from transformers import get_linear_schedule_with_warmup # type: ignore
from sklearn.model_selection import train_test_split # type: ignore

## log training process with W&B if uncommented
# os.environ['WANDB_MODE'] = 'disabled'

In [None]:
class Config:
    def __init__(self):
        # self.wandb = True
        self.device = torch.device('cpu')
        self.max_len = 64 ## this is the max length of the sentence
        self.epochs = 30
        self.batch_size = 32
        self.opt_lr = 2e-5
        self.opt_weight_decay = 1e-3

config = Config()
config.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"👉 Running on device type: {config.device}")

👉 Running on device type: cuda:0


In [5]:
df = pd.read_csv(
   r"cd0387_common_model_arch_types_fine_tuning\cola_public\raw\in_domain_train.tsv",
   sep="\t",
   header=None,
   usecols=[1, 3],
   names=["label", "sentence"],
)
sentences = df.sentence.values
labels = df.label.values
print(sentences.shape, labels.shape)
df.sample(3)

(8551,) (8551,)


Unnamed: 0,label,sentence
6550,0,Talkative and a bully entered.
8541,1,Where has he put the cake?
7311,1,I know that she runs.


In [6]:
train_df, test_df = train_test_split(df)
train_df.to_csv(r"data\cola\train.csv", index=False)
test_df.to_csv(r"data\cola\test.csv", index=False)

In [None]:
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def get_train_data_loader(batch_size):
    dataset = pd.read_csv(os.path.join("data", "cola", "train.csv"))
    sentences = dataset.sentence.values
    labels = dataset.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids.append(encoded_sent)

    # pad shorter sentences
    input_ids_padded = []
    for id in input_ids:
        while len(id) < config.max_len:
            id.append(0)
        input_ids_padded.append(id)
    input_ids = input_ids_padded

    # mask; 0: added, 1: otherwise
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)

    # convert to PyTorch data types.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(labels)
    train_masks = torch.tensor(attention_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return train_dataloader


def get_test_data_loader(test_batch_size):
    dataset = pd.read_csv(os.path.join("data", "cola", "test.csv"))
    sentences = dataset.sentence.values
    labels = dataset.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids.append(encoded_sent)

    # pad shorter sentences
    input_ids_padded = []
    for id in input_ids:
        while len(id) < config.max_len:
            id.append(0)
        input_ids_padded.append(id)
    input_ids = input_ids_padded

    # mask; 0: added, 1: otherwise
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)

    # convert to PyTorch data types.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(labels)
    train_masks = torch.tensor(attention_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=test_batch_size)

    return train_dataloader


def train():
    train_loader = get_train_data_loader(config.batch_size)
    test_loader = get_test_data_loader(config.batch_size)

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=2,  # The number of output labels--2 for binary classification.
        output_attentions=False,  # Whether the model returns attentions weights
        output_hidden_states=False,  # Whether the model returns all hidden-states
    )
    for param in model.bert.parameters():
        param.requires_grad = False  # Freeze all BERT layers
    for param in model.bert.encoder.layer[-1].parameters():
        param.requires_grad = True  # Unfreeze the last layer

    model = model.to(config.device)
    ## set up optimizer
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.bert.encoder.layer[-1].named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,  # Apply weight decay to weights
        },
        {
            "params": [p for n, p in model.bert.encoder.layer[-1].named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,  # No weight decay for biases and LayerNorms
        },
        {
            "params": [p for n, p in model.classifier.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,  # Apply weight decay to classifier weights
        },
        {
            "params": [p for n, p in model.classifier.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,  # No weight decay for classifier biases and LayerNorms
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(1e3),  # You can start with 0 warmup steps, or adjust this
        num_training_steps=len(train_loader)*config.epochs
    )
    total_steps = 0

    for epoch in tqdm(range(config.epochs)):
        print(f"👉 Train Epoch {epoch}:")
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_loader):
            total_steps += 1
            b_input_ids = batch[0].to(config.device)
            b_input_mask = batch[1].to(config.device)
            b_labels = batch[2].to(config.device)
            model.zero_grad()

            outputs = model(b_input_ids, token_type_ids=None, 
                attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]

            total_loss += loss.item()
            wandb.log({"train_loss": loss.item()}, step=total_steps)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            if step%10==0:
                print(
                    f"Step {total_steps}: " 
                    f"[{step*len(batch[0])}/{len(train_loader.sampler)} " 
                    f"({(100.0*step/len(train_loader)):.0f}%)] " 
                    f"Loss: {loss.item():.6f}"
                )
        scheduler.step()
        eval_accuracy = test(model, test_loader)
        wandb.log({f"eval_accuracy_epoch (%)": eval_accuracy*100}, step=total_steps)
    return model


def test(model, test_loader):
    model.eval()
    _, eval_accuracy = 0, 0
    with torch.no_grad():
        for batch in test_loader:
            b_input_ids = batch[0].to(config.device)
            b_input_mask = batch[1].to(config.device)
            b_labels = batch[2].to(config.device)
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to("cpu").numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy
    eval_accuracy /= len(test_loader.dataset)
    print("🟢 Test Accuracy: ", eval_accuracy)
    return eval_accuracy


wandb.init(
    # set the wandb project where this run will be logged
    project="udacity-awsmle-bert-cola",
    config=config
)
train()

Loading BERT tokenizer...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnov05[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777777626, max=1.0…

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/30 [00:00<?, ?it/s]

👉 Train Epoch 0:


  3%|▎         | 1/30 [01:16<36:59, 76.55s/it]

🟢 Test Accuracy:  0.022436272217025256
👉 Train Epoch 1:


  7%|▋         | 2/30 [03:43<54:55, 117.68s/it]

🟢 Test Accuracy:  0.023931648917032453
👉 Train Epoch 2:


 10%|█         | 3/30 [05:09<46:30, 103.36s/it]

🟢 Test Accuracy:  0.024028342448010363
👉 Train Epoch 3:


 13%|█▎        | 4/30 [08:51<1:05:02, 150.10s/it]

🟢 Test Accuracy:  0.024246465064402388
👉 Train Epoch 4:


 17%|█▋        | 5/30 [10:12<52:15, 125.42s/it]  

🟢 Test Accuracy:  0.024644482622148665
👉 Train Epoch 5:


 20%|██        | 6/30 [12:59<55:48, 139.54s/it]

🟢 Test Accuracy:  0.02448819889184716
👉 Train Epoch 6:


 23%|██▎       | 7/30 [15:36<55:42, 145.33s/it]

🟢 Test Accuracy:  0.02484236705763834
👉 Train Epoch 7:


 27%|██▋       | 8/30 [17:02<46:19, 126.34s/it]

🟢 Test Accuracy:  0.02467821292365259
👉 Train Epoch 8:


 27%|██▋       | 8/30 [19:11<52:47, 143.99s/it]


KeyboardInterrupt: 