<a href="https://colab.research.google.com/github/nov05/Google-Colaboratory/blob/master/20241129_finetune_bert_solution_train-failed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* changed by nov05 on 2024-11-28  
* Udacity AWS MLE Nanodegree (ND189)  
  Course 4, 3.15 Excercise: Fine-Tuning BERT    
* local env `conda activate cuda_py310` with cuda enabled  
* [CoLA dataset on KaggleHub](https://www.kaggle.com/datasets/krazy47/cola-the-corpus-of-linguistic-acceptability)  
  The Corpus of Linguistic Acceptability   

In [1]:
!git clone https://github.com/nov05/udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter.git

Cloning into 'udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter'...
remote: Enumerating objects: 217, done.[K
remote: Counting objects: 100% (195/195), done.[K
remote: Compressing objects: 100% (132/132), done.[K
remote: Total 217 (delta 76), reused 170 (delta 56), pack-reused 22 (from 1)[K
Receiving objects: 100% (217/217), 1.40 MiB | 16.83 MiB/s, done.
Resolving deltas: 100% (80/80), done.


In [2]:
## use the repo root folder as working directory
## training data is in data\
%cd /content/udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter
!pwd

/content/udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter
/content/udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter


## Solution: Fine-tune BERT model

In [3]:
import os
# import sys
# import json
from tqdm import tqdm
import wandb

import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
# import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer # type: ignore
from transformers import get_linear_schedule_with_warmup # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.utils.class_weight import compute_class_weight # type: ignore

## log training process with W&B if uncommented
# os.environ['WANDB_MODE'] = 'disabled'

In [4]:
class Config:
    def __init__(self):
        # self.wandb = True
        self.device = torch.device('cpu')
        self.max_len = 64 ## this is the max length of the sentence
        self.epochs = 30
        self.batch_size = 32
        self.opt_lr = 2e-5
        self.opt_weight_decay = 1e-6
        self.unfreeze_top_layers = True
        self.use_class_weights = True

config = Config()
config.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"👉 Running on device type: {config.device}")

👉 Running on device type: cuda:0


In [5]:
df = pd.read_csv(
   r"/content/udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter/cd0387_common_model_arch_types_fine_tuning/cola_public/raw/in_domain_train.tsv",
   sep="\t",
   header=None,
   usecols=[1, 3],
   names=["label", "sentence"],
)
sentences = df.sentence.values
labels = df.label.values
print(sentences.shape, labels.shape)
df.sample(3)

(8551,) (8551,)


Unnamed: 0,label,sentence
5667,1,The dastardly surgeon stole the physician's lu...
7193,0,"Will put, this girl in the red coat will put a..."
4927,1,He bought a refrigerator in which to put the b...


In [6]:
## there is some imbalance in the training dataset
sum(labels)/len(labels)

0.704362062916618

In [7]:
!mkdir -p data
!mkdir -p data/cola
train_df, test_df = train_test_split(df, stratify=labels)
train_df.to_csv(r"data/cola/train.csv", index=False)
test_df.to_csv(r"data/cola/test.csv", index=False)

In [8]:
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat==labels_flat) / len(labels_flat)


def get_train_data_loader(batch_size):
    dataset = pd.read_csv(os.path.join("data", "cola", "train.csv"))
    sentences = dataset.sentence.values
    labels = dataset.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids.append(encoded_sent)

    # pad shorter sentences
    input_ids_padded = []
    for id in input_ids:
        while len(id) < config.max_len:
            id.append(0)
        input_ids_padded.append(id)
    input_ids = input_ids_padded

    # mask; 0: added, 1: otherwise
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)

    # convert to PyTorch data types.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(labels)
    train_masks = torch.tensor(attention_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return train_dataloader


def get_test_data_loader(test_batch_size):
    dataset = pd.read_csv(os.path.join("data", "cola", "test.csv"))
    sentences = dataset.sentence.values
    labels = dataset.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids.append(encoded_sent)

    # pad shorter sentences
    input_ids_padded = []
    for id in input_ids:
        while len(id)<config.max_len:
            id.append(0)
        input_ids_padded.append(id)
    input_ids = input_ids_padded

    # mask; 0: added, 1: otherwise
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        att_mask = [int(token_id>0) for token_id in sent]
        attention_masks.append(att_mask)

    # convert to PyTorch data types.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(labels)
    train_masks = torch.tensor(attention_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=test_batch_size)

    return train_dataloader


def train():
    train_loader = get_train_data_loader(config.batch_size)
    test_loader = get_test_data_loader(config.batch_size)

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=2,  # The number of output labels--2 for binary classification.
        output_attentions=False,  # Whether the model returns attentions weights
        output_hidden_states=False,  # Whether the model returns all hidden-states
    )
    for param in model.bert.parameters():
        param.requires_grad = False  # Freeze all BERT layers
    if config.unfreeze_top_layers:
        for param in model.bert.encoder.layer[-5:-1].parameters():
            param.requires_grad = True  # Unfreeze the last layer
    model = model.to(config.device)
    ## set up loss function
    if config.use_class_weights:
        class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=labels)
        class_weights = torch.tensor(class_weights, dtype=torch.float).to(config.device)
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    ## set up optimizer
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.classifier.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,  # Apply weight decay to classifier weights
        },
        {
            "params": [p for n, p in model.classifier.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,  # No weight decay for classifier biases and LayerNorms
        },
    ]
    if config.unfreeze_top_layers:
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in model.bert.encoder.layer[-5:-1].named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,  # Apply weight decay to weights
            },
            {
                "params": [p for n, p in model.bert.encoder.layer[-5:-1].named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,  # No weight decay for biases and LayerNorms
            }
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.opt_lr)
    # optimizer = AdamW(model.parameters(), lr=config.opt_lr, weight_decay=config.opt_weight_decay)
    if config.unfreeze_top_layers:
        num_warmup_steps = int(1e3)
    else:
        num_warmup_steps = 0
    # num_warmup_steps = int(1e3)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=len(train_loader)*config.epochs
    )
    total_steps = 0

    for epoch in tqdm(range(config.epochs)):
        print(f"👉 Train Epoch {epoch}:")
        loss_epoch = 0
        model.train()
        for step, batch in enumerate(train_loader):
            total_steps += 1
            b_input_ids = batch[0].to(config.device)
            b_input_mask = batch[1].to(config.device)
            b_labels = batch[2].to(config.device)
            model.zero_grad()

            outputs = model(
                b_input_ids,  ## Shape: (batch_size, sequence_length)
                token_type_ids=None, ## Shape: (batch_size, sequence_length)
                attention_mask=b_input_mask, ## Shape: (batch_size, sequence_length)
                labels=b_labels) ## Shape: (batch_size,)

            if config.use_class_weights:
                logits = outputs.logits  ## same with outputs[1]
                loss = loss_fn(logits.view(-1, 2), b_labels.view(-1))
            else:
                loss = outputs.loss  ## same with outputs[0]
            wandb.log({"train_loss": loss.item()}, step=total_steps)
            loss_epoch += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
            optimizer.step()
            if step%10==0:
                print(
                    f"Step {total_steps}: "
                    f"[{step*len(batch[0])}/{len(train_loader.sampler)} "
                    f"({(100.0*step/len(train_loader)):.0f}%)] "
                    f"Loss: {loss.item():.6f}"
                )
        scheduler.step()
        wandb.log({"train_loss_epoch": loss_epoch/config.batch_size}, step=total_steps)
        eval_accuracy = test(model, test_loader)
        wandb.log({f"eval_accuracy_epoch (%)": eval_accuracy*100}, step=total_steps)
    return model


def test(model, test_loader):
    model.eval()
    _, eval_accuracy = 0, 0
    with torch.no_grad():
        for batch in test_loader:
            b_input_ids = batch[0].to(config.device)
            b_input_mask = batch[1].to(config.device)
            b_labels = batch[2].to(config.device)
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to("cpu").numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy
    eval_accuracy /= len(test_loader.dataset)
    print("🟢 Test Accuracy (%): ", eval_accuracy*100)
    return eval_accuracy


wandb.init(
    # set the wandb project where this run will be logged
    project="udacity-awsmle-bert-cola",
    config=config
)
train()

Loading BERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/30 [00:00<?, ?it/s]

👉 Train Epoch 0:


  3%|▎         | 1/30 [00:42<20:35, 42.60s/it]

🟢 Test Accuracy (%):  0.926458947974383
👉 Train Epoch 1:


  7%|▋         | 2/30 [01:25<19:54, 42.66s/it]

🟢 Test Accuracy (%):  0.9257843419443045
👉 Train Epoch 2:


 10%|█         | 3/30 [02:09<19:24, 43.14s/it]

🟢 Test Accuracy (%):  0.9261216449593438
👉 Train Epoch 3:


 13%|█▎        | 4/30 [02:52<18:41, 43.13s/it]

🟢 Test Accuracy (%):  0.9254470389292655
👉 Train Epoch 4:


 17%|█▋        | 5/30 [03:35<18:00, 43.24s/it]

🟢 Test Accuracy (%):  0.926458947974383
👉 Train Epoch 5:


 20%|██        | 6/30 [04:18<17:17, 43.22s/it]

🟢 Test Accuracy (%):  0.9251097359142261
👉 Train Epoch 6:


 23%|██▎       | 7/30 [05:02<16:35, 43.26s/it]

🟢 Test Accuracy (%):  0.9274708570195007
👉 Train Epoch 7:


 27%|██▋       | 8/30 [05:45<15:52, 43.30s/it]

🟢 Test Accuracy (%):  0.9239853925307622
👉 Train Epoch 8:


 30%|███       | 9/30 [06:28<15:09, 43.33s/it]

🟢 Test Accuracy (%):  0.9289325034180038
👉 Train Epoch 9:


 33%|███▎      | 10/30 [07:12<14:26, 43.31s/it]

🟢 Test Accuracy (%):  0.9388267251924876
👉 Train Epoch 10:


 37%|███▋      | 11/30 [07:55<13:42, 43.28s/it]

🟢 Test Accuracy (%):  1.028436892854573
👉 Train Epoch 11:


 40%|████      | 12/30 [08:38<12:59, 43.28s/it]

🟢 Test Accuracy (%):  1.22216125782543
👉 Train Epoch 12:


 43%|████▎     | 13/30 [09:21<12:15, 43.29s/it]

🟢 Test Accuracy (%):  1.5420369504209541
👉 Train Epoch 13:


 47%|████▋     | 14/30 [10:05<11:32, 43.31s/it]

🟢 Test Accuracy (%):  1.7112506296322945
👉 Train Epoch 14:


 50%|█████     | 15/30 [10:48<10:49, 43.32s/it]

🟢 Test Accuracy (%):  1.859326653234511
👉 Train Epoch 15:


 53%|█████▎    | 16/30 [11:31<10:06, 43.32s/it]

🟢 Test Accuracy (%):  1.9643403252500538
👉 Train Epoch 16:


 57%|█████▋    | 17/30 [12:15<09:23, 43.33s/it]

🟢 Test Accuracy (%):  2.06800478520544
👉 Train Epoch 17:


 60%|██████    | 18/30 [12:58<08:40, 43.33s/it]

🟢 Test Accuracy (%):  2.114552601280852
👉 Train Epoch 18:


 63%|██████▎   | 19/30 [13:41<07:56, 43.32s/it]

🟢 Test Accuracy (%):  2.1499694178599698
👉 Train Epoch 19:


 67%|██████▋   | 20/30 [14:25<07:13, 43.34s/it]

🟢 Test Accuracy (%):  2.179652083183421
👉 Train Epoch 20:


 70%|███████   | 21/30 [15:08<06:30, 43.34s/it]

🟢 Test Accuracy (%):  2.2077606677700223
👉 Train Epoch 21:


 73%|███████▎  | 22/30 [15:52<05:46, 43.34s/it]

🟢 Test Accuracy (%):  2.2591431603943297
👉 Train Epoch 22:


 77%|███████▋  | 23/30 [16:35<05:03, 43.33s/it]

🟢 Test Accuracy (%):  2.2803932503418003
👉 Train Epoch 23:


 80%|████████  | 24/30 [17:18<04:20, 43.33s/it]

🟢 Test Accuracy (%):  2.3114251277254088
👉 Train Epoch 24:


 83%|████████▎ | 25/30 [18:01<03:36, 43.32s/it]

🟢 Test Accuracy (%):  2.324130207958552
👉 Train Epoch 25:


 87%|████████▋ | 26/30 [18:45<02:53, 43.31s/it]

🟢 Test Accuracy (%):  2.338409368928546
👉 Train Epoch 26:


 90%|█████████ | 27/30 [19:28<02:09, 43.31s/it]

🟢 Test Accuracy (%):  2.3635946607181406
👉 Train Epoch 27:


 93%|█████████▎| 28/30 [20:11<01:26, 43.32s/it]

🟢 Test Accuracy (%):  2.355949125710585
👉 Train Epoch 28:


 97%|█████████▋| 29/30 [20:55<00:43, 43.31s/it]

🟢 Test Accuracy (%):  2.3820338922069513
👉 Train Epoch 29:


100%|██████████| 30/30 [21:38<00:00, 43.28s/it]

🟢 Test Accuracy (%):  2.387768043462618





BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

```python
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)
```

* [Unfreezing all layers of BERT giving good results than freezing and adding custom Forward layer for Fine-Tuning](https://www.reddit.com/r/MLQuestions/comments/1d07qlz/unfreezing_all_layers_of_bert_giving_good_results/)   
* [I just can't fine tune BERT over 40% accuracy for text-classification task](https://www.reddit.com/r/MachineLearning/comments/1bx5r8r/d_i_just_cant_fine_tune_bert_over_40_accuracy_for/)  