<a href="https://colab.research.google.com/github/nov05/Google-Colaboratory/blob/master/20241129_finetune_bert_solution_2_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* changed by nov05 on 2024-11-28  
* Udacity AWS MLE Nanodegree (ND189)  
  Course 4, 3.15 Excercise: Fine-Tuning BERT    
* local env `conda activate cuda_py310` with cuda enabled  
* [CoLA dataset on KaggleHub](https://www.kaggle.com/datasets/krazy47/cola-the-corpus-of-linguistic-acceptability)  
  The Corpus of Linguistic Acceptability   

In [8]:
!mkdir -p cola_public
!mkdir -p cola_public/raw
!wget https://raw.githubusercontent.com/nov05/udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter/refs/heads/main/cd0387_common_model_arch_types_fine_tuning/cola_public/raw/in_domain_train.tsv -O cola_public/raw/in_domain_train.tsv

--2024-11-29 13:12:54--  https://raw.githubusercontent.com/nov05/udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter/refs/heads/main/cd0387_common_model_arch_types_fine_tuning/cola_public/raw/in_domain_train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 428578 (419K) [text/plain]
Saving to: ‘cola_public/raw/in_domain_train.tsv’


2024-11-29 13:12:54 (12.1 MB/s) - ‘cola_public/raw/in_domain_train.tsv’ saved [428578/428578]



## Solution: Fine-tune BERT model

In [9]:
import os
# import sys
# import json
from tqdm import tqdm
import wandb
import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
# import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer # type: ignore
from transformers import get_linear_schedule_with_warmup # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.utils.class_weight import compute_class_weight # type: ignore

## log training process with W&B if uncommented
# os.environ['WANDB_MODE'] = 'disabled'

In [10]:
class Config:
    def __init__(self):
        self.wandb = True
        self.device = torch.device('cpu')
        self.max_len = 64 ## this is the max length of the sentence
        self.epochs = 15
        self.batch_size = 64
        self.opt_lr = 2e-5

config = Config()
config.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"👉 Running on device type: {config.device}")

👉 Running on device type: cuda:0


In [11]:
df = pd.read_csv(
   r"/content/cola_public/raw/in_domain_train.tsv",
   sep="\t",
   header=None,
   usecols=[1, 3],
   names=["label", "sentence"],
)
sentences = df.sentence.values
labels = df.label.values
print(df.shape)
df.sample(3)

(8551, 2)


Unnamed: 0,label,sentence
1507,1,Do you believe that somebody was looking for s...
4444,0,Americans have paying income tax ever since 1913.
6507,1,"Almost every cat likes mice, but Felix doesn't."


In [12]:
!mkdir -p data
!mkdir -p data/cola
train_df, test_df = train_test_split(df, stratify=labels)
train_df.to_csv(r"data/cola/train.csv", index=False)
test_df.to_csv(r"data/cola/test.csv", index=False)

In [13]:
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat==labels_flat) / len(labels_flat)


def get_train_data_loader(batch_size):
    dataset = pd.read_csv(os.path.join("data", "cola", "train.csv"))
    sentences = dataset.sentence.values
    labels = dataset.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids.append(encoded_sent)

    # pad shorter sentences
    input_ids_padded = []
    for id in input_ids:
        while len(id) < config.max_len:
            id.append(0)
        input_ids_padded.append(id)
    input_ids = input_ids_padded

    # mask; 0: added, 1: otherwise
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)

    # convert to PyTorch data types.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(labels)
    train_masks = torch.tensor(attention_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return train_dataloader


def get_test_data_loader(test_batch_size):
    dataset = pd.read_csv(os.path.join("data", "cola", "test.csv"))
    sentences = dataset.sentence.values
    labels = dataset.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids.append(encoded_sent)

    # pad shorter sentences
    input_ids_padded = []
    for id in input_ids:
        while len(id)<config.max_len:
            id.append(0)
        input_ids_padded.append(id)
    input_ids = input_ids_padded

    # mask; 0: added, 1: otherwise
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        att_mask = [int(token_id>0) for token_id in sent]
        attention_masks.append(att_mask)

    # convert to PyTorch data types.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(labels)
    train_masks = torch.tensor(attention_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=test_batch_size)

    return train_dataloader


def train():
    train_loader = get_train_data_loader(config.batch_size)
    test_loader = get_test_data_loader(config.batch_size)
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=2,  # The number of output labels--2 for binary classification.
        output_attentions=False,  # Whether the model returns attentions weights
        output_hidden_states=False,  # Whether the model returns all hidden-states
    )
    model = model.to(config.device)
    optimizer = AdamW(model.parameters(), lr=config.opt_lr)
    total_steps = 0

    for epoch in tqdm(range(config.epochs)):
        print(f"👉 Train Epoch {epoch}:")
        loss_epoch = 0
        model.train()
        for step, batch in enumerate(train_loader):
            total_steps += 1
            b_input_ids = batch[0].to(config.device)
            b_input_mask = batch[1].to(config.device)
            b_labels = batch[2].to(config.device)
            model.zero_grad()

            outputs = model(
                b_input_ids,  ## Shape: (batch_size, sequence_length)
                token_type_ids=None, ## Shape: (batch_size, sequence_length)
                attention_mask=b_input_mask, ## Shape: (batch_size, sequence_length)
                labels=b_labels) ## Shape: (batch_size,)

            loss = outputs.loss  ## same with outputs[0]
            wandb.log({"train_loss": loss.item()}, step=total_steps)
            loss_epoch += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
            optimizer.step()
            if step%10==0:
                print(
                    f"Step {total_steps}: "
                    f"[{step*len(batch[0])}/{len(train_loader.sampler)} "
                    f"({(100.0*step/len(train_loader)):.0f}%)] "
                    f"Loss: {loss.item():.6f}"
                )
        wandb.log({"train_loss_epoch": loss_epoch/config.batch_size}, step=total_steps)
        eval_accuracy = test(model, test_loader)
        wandb.log({f"eval_accuracy_epoch (%)": eval_accuracy*100}, step=total_steps)
    return model


def test(model, test_loader):
    model.eval()
    eval_accuracy_steps = 0
    total_steps = 0
    with torch.no_grad():
        for batch in test_loader:
            total_steps += 1
            b_input_ids = batch[0].to(config.device)
            b_input_mask = batch[1].to(config.device)
            b_labels = batch[2].to(config.device)
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
            logits = outputs.logits.detach().cpu().numpy()  ## differs from the original
            label_ids = b_labels.to("cpu").numpy()
            eval_accuracy_steps += flat_accuracy(logits, label_ids)
    eval_accuracy = eval_accuracy_steps / total_steps
    print("\n🟢 Test Accuracy (%): ", eval_accuracy*100)
    return eval_accuracy

Loading BERT tokenizer...


In [14]:
%%time
wandb.init(
    # set the wandb project where this run will be logged
    project="udacity-awsmle-bert-cola",
    config=config
)
train()
## 15 epochs, 16:49 mins

VBox(children=(Label(value='0.270 MB of 0.270 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_loss,▆█▅▄▇▆▄▇▇▇▄▆█▅▆▅▅▅▆▃▂▄▅▆▂▄▅▃▆▆▂▂▄▅▃▄▂▁▁▄
train_loss_epoch,▁

0,1
train_loss,0.42
train_loss_epoch,0.82879


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/15 [00:00<?, ?it/s]

👉 Train Epoch 0:


  7%|▋         | 1/15 [01:05<15:23, 65.94s/it]


🟢 Test Accuracy (%):  79.05472285067874
👉 Train Epoch 1:


 13%|█▎        | 2/15 [02:11<14:12, 65.56s/it]


🟢 Test Accuracy (%):  80.88588800904978
👉 Train Epoch 2:


 20%|██        | 3/15 [03:16<13:06, 65.56s/it]


🟢 Test Accuracy (%):  81.12273755656109
👉 Train Epoch 3:


 27%|██▋       | 4/15 [04:22<12:00, 65.51s/it]


🟢 Test Accuracy (%):  81.00608031674209
👉 Train Epoch 4:


 33%|███▎      | 5/15 [05:27<10:55, 65.53s/it]


🟢 Test Accuracy (%):  80.88942307692308
👉 Train Epoch 5:


 40%|████      | 6/15 [06:33<09:49, 65.50s/it]


🟢 Test Accuracy (%):  80.66317873303169
👉 Train Epoch 6:


 47%|████▋     | 7/15 [07:38<08:43, 65.48s/it]


🟢 Test Accuracy (%):  80.43339932126698
👉 Train Epoch 7:


 53%|█████▎    | 8/15 [08:44<07:38, 65.44s/it]


🟢 Test Accuracy (%):  81.11920248868778
👉 Train Epoch 8:


 60%|██████    | 9/15 [09:49<06:32, 65.46s/it]


🟢 Test Accuracy (%):  80.84346719457014
👉 Train Epoch 9:


 67%|██████▋   | 10/15 [10:54<05:27, 65.44s/it]


🟢 Test Accuracy (%):  81.05203619909503
👉 Train Epoch 10:


 73%|███████▎  | 11/15 [12:00<04:21, 65.43s/it]


🟢 Test Accuracy (%):  81.83328619909503
👉 Train Epoch 11:


 80%|████████  | 12/15 [13:05<03:16, 65.44s/it]


🟢 Test Accuracy (%):  80.46167986425338
👉 Train Epoch 12:


 87%|████████▋ | 13/15 [14:11<02:10, 65.44s/it]


🟢 Test Accuracy (%):  81.64946266968326
👉 Train Epoch 13:


 93%|█████████▎| 14/15 [15:16<01:05, 65.40s/it]


🟢 Test Accuracy (%):  80.89649321266968
👉 Train Epoch 14:


100%|██████████| 15/15 [16:21<00:00, 65.46s/it]


🟢 Test Accuracy (%):  81.44089366515837
CPU times: user 16min 18s, sys: 1.89 s, total: 16min 20s
Wall time: 16min 32s





BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
wandb.finish()

0,1
eval_accuracy_epoch (%),▁▆▆▆▆▅▄▆▆▆█▅█▆▇
train_loss,██▅▇▃▄▃▃▃▃▁▂▂▁▁▂▁▂▁▁▂▁▁▂▁▁▁▂▂▁▁▁▁▁▁▁▂▁▁▁
train_loss_epoch,█▆▄▃▂▂▂▂▂▁▁▁▁▁▁

0,1
eval_accuracy_epoch (%),81.44089
train_loss,0.00013
train_loss_epoch,0.02814


```python
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)
```

* [Unfreezing all layers of BERT giving good results than freezing and adding custom Forward layer for Fine-Tuning](https://www.reddit.com/r/MLQuestions/comments/1d07qlz/unfreezing_all_layers_of_bert_giving_good_results/)   
* [I just can't fine tune BERT over 40% accuracy for text-classification task](https://www.reddit.com/r/MachineLearning/comments/1bx5r8r/d_i_just_cant_fine_tune_bert_over_40_accuracy_for/)  