### Finetuning DistilBERT Classifier
Finetune the output layers of the pretrained transformer

In [2]:
import pandas as pd
import numpy as np
import torch
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the train/val/test CSV splits from local files; the paths are relative,
# so they resolve against the notebook's working directory.
imdb_dataset = datasets.load_dataset(
    'csv',
    data_files=dict(
        train='dataset/aclImdb/train.csv',
        val='dataset/aclImdb/val.csv',
        test='dataset/aclImdb/test.csv',
    ),
)

imdb_dataset

FileNotFoundError: Unable to find '/Users/tu/build/python/deeplearning-fundamental/dataset/aclImdb/train.csv'

#### Tokenization

In [20]:
from transformers import AutoTokenizer

In [8]:
# Load the tokenizer that belongs to the DistilBERT checkpoint, then report
# its maximum input length and vocabulary size (one value per output line).
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
print(
    f'Input max lengh {tokenizer.model_max_length}',
    f'vocabulary size: {tokenizer.vocab_size}',
    sep='\n',
)

Input max lengh 512
vocabulary size: 30522


In [9]:
def tokenize_batch(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with the notebook-level tokenizer.

    Truncation and padding are both switched on. The tokenizer's result is
    returned unchanged so it can be merged into the dataset by ``map``.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [10]:
# Tokenize every split. Per the `datasets` documentation, batch_size=None
# passes each split to tokenize_batch as one single batch.
imdb_dataset_tokenized = imdb_dataset.map(
    tokenize_batch,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [11]:
imdb_dataset_tokenized
# Expected display after tokenization:
# => DatasetDict({
#     train: Dataset({
#         features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
#         num_rows: 35000
#     })
#     val: ...
#     test: ...
# })

# `input_ids` and `attention_mask` are the columns added by the tokenizer;
# `input_ids` holds the token ids that correspond to each `text`.

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 35000
    })
    val: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [13]:
# Peek at the first tokenized training example. next(iter(...)) pulls exactly
# one row instead of starting a for-loop only to break out of it immediately,
# and indexing with ['train'] raises a clear KeyError if the split is missing.
rows = next(iter(imdb_dataset_tokenized['train']))
rows

{'index': 0,
 'text': 'When we started watching this series on cable, I had no idea how addictive it would be. Even when you hate a character, you hold back because they are so beautifully developed, you can almost understand why they react to frustration, fear, greed or temptation the way they do. It\'s almost as if the viewer is experiencing one of Christopher\'s learning curves.<br /><br />I can\'t understand why Adriana would put up with Christopher\'s abuse of her, verbally, physically and emotionally, but I just have to read the newspaper to see how many women can and do tolerate such behavior. Carmella has a dream house, endless supply of expensive things, but I\'m sure she would give it up for a loving and faithful husband - or maybe not. That\'s why I watch.<br /><br />It doesn\'t matter how many times you watch an episode, you can find something you missed the first five times. We even watch episodes out of sequence (watch season 1 on late night with commercials but all the l

In [14]:
# Switch the dataset's output format to 'torch' for the three columns the
# training loop reads from each batch. This changes the dataset in place.
imdb_dataset_tokenized.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

In [15]:
import os

# Turn off tokenizer parallelism via its environment switch before the
# multi-worker DataLoaders are created below.
os.environ.update({'TOKENIZERS_PARALLELISM': 'false'})

#### Setup DataLoaders

In [16]:
from torch.utils.data import Dataset, DataLoader

In [17]:
class ImdbDataset(Dataset):
    """Map-style torch dataset exposing one split of a dataset dictionary."""

    def __init__(self, dataset_dict, partition_key):
        """Keep the split stored under ``partition_key`` in ``dataset_dict``."""
        selected_split = dataset_dict[partition_key]
        self.partition = selected_split

    def __len__(self):
        """Return the row count reported by the split's ``num_rows`` attribute."""
        row_count = self.partition.num_rows
        return row_count

    def __getitem__(self, index):
        """Return whatever the underlying split yields for ``index``."""
        item = self.partition[index]
        return item

In [18]:
# Wrap each tokenized split in the torch Dataset adapter (order: train, val, test).
train_dataset, val_dataset, test_dataset = (
    ImdbDataset(imdb_dataset_tokenized, split_name)
    for split_name in ('train', 'val', 'test')
)

In [19]:
batch_size = 12
# Each loader must wrap its own split: the validation and test loaders
# previously pointed at train_dataset, so the reported metrics would have been
# computed on training data. Only the training loader shuffles.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, num_workers=4)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=4)

#### Initialize DistilBERT

In [21]:
from transformers import AutoModelForSequenceClassification

In [22]:
# Load the pretrained DistilBERT checkpoint with a sequence-classification
# head configured for two labels, then display the module structure.
distil_bert = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)
distil_bert

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Freeze the whole model except the last two layers (`pre_classifier` and `classifier`).

In [23]:
# Step 1: switch off gradients for every parameter of the model.
for param in distil_bert.parameters():
    param.requires_grad_(False)

# Step 2: switch gradients back on for the two output layers only.
for head in (distil_bert.pre_classifier, distil_bert.classifier):
    for param in head.parameters():
        param.requires_grad_(True)

#### Finetuning

In [24]:
import lightning
import torchmetrics

In [28]:
class LightningTransformer(lightning.LightningModule):
    """Lightning wrapper that trains and evaluates a sequence classifier.

    The wrapped ``torch_model`` is called as
    ``torch_model(input_ids, attention_mask=..., labels=...)`` and its output
    is indexed with ``'loss'`` and ``'logits'``.

    Args:
        torch_model: The model to fine-tune.
        learning_rate: Learning rate handed to the Adam optimizer.
        num_classes: Number of target classes used by the accuracy metrics.
            Defaults to 2, the value that was previously hard-coded.
    """

    def __init__(self, torch_model, learning_rate=5e-5, num_classes=2):
        super().__init__()

        self.model = torch_model
        self.learning_rate = learning_rate

        # Separate metric objects so validation and test statistics never mix.
        self.val_acc = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)
        self.test_acc = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model.

        ``labels`` is optional so the module can also be called for inference
        without targets; it is forwarded to the wrapped model unchanged.
        """
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _forward_batch(self, batch):
        """Unpack a batch dict (``input_ids``, ``attention_mask``, ``label``) and run ``forward``."""
        return self.forward(
            batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['label'],
        )

    def _log_accuracy(self, outputs, batch, metric, name):
        """Update ``metric`` with the argmax predictions and log it under ``name``."""
        predicted_labels = torch.argmax(outputs['logits'], 1)
        metric(predicted_labels, batch['label'])
        self.log(name, metric, prog_bar=True)

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for one batch."""
        outputs = self._forward_batch(batch)
        self.log('train_loss', outputs['loss'])
        return outputs['loss']

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy for one batch."""
        outputs = self._forward_batch(batch)
        self.log('val_loss', outputs['loss'], prog_bar=True)
        self._log_accuracy(outputs, batch, self.val_acc, 'val_acc')

    def test_step(self, batch, batch_idx):
        """Log test accuracy for one batch (the test loss is not logged)."""
        outputs = self._forward_batch(batch)
        self._log_accuracy(outputs, batch, self.test_acc, 'test_acc')

    def configure_optimizers(self):
        """Create an Adam optimizer over this module's parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

In [29]:
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger


# Keep only the single best checkpoint, ranked by the highest logged "val_acc".
callbacks = [ModelCheckpoint(monitor="val_acc", mode="max", save_top_k=1)]
# CSV logger rooted at "lightning_logs/" with the experiment name "distil-bert".
logger = CSVLogger(name="distil-bert", save_dir="lightning_logs/")

In [30]:
lightning_model = LightningTransformer(torch_model=distil_bert)

# Trainer settings collected in one place: three epochs on CPU with bf16
# mixed precision, logging every 10 steps.
trainer_options = dict(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="cpu",
    precision="bf16-mixed",
    # devices=[0],  (left disabled, as in the original configuration)
    logger=logger,
    log_every_n_steps=10,
)
trainer = lightning.Trainer(**trainer_options)

trainer.fit(
    model=lightning_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name     | Type                                | Params
-----------------------------------------------------------------
0 | model    | DistilBertForSequenceClassification | 67.0 M
1 | val_acc  | MulticlassAccuracy                  | 0     
2 | test_acc | MulticlassAccuracy                  | 0     
-----------------------------------------------------------------
592 K     Trainable params
66.4 M    Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]