<h1>Fine-tuning Transformers</h1>

<h2>1. Set up dataset input </h2>

In [None]:
!pip install datasets
from datasets import load_dataset

emotions = load_dataset('emotion')

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Switch the output format of every split to pandas, so that slicing a split
# yields a DataFrame instead of a dict of Python lists.
emotions.set_format(type="pandas")

# Materialise the whole train split for exploration and preview the first rows.
df = emotions["train"][:]
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [None]:
def label_int2str(row):
  """Map an integer class id to its label name using the train split's 'label' feature."""
  label_feature = emotions["train"].features["label"]
  return label_feature.int2str(row)


# Add a human-readable companion column next to the integer labels.
df["label_name"] = df["label"].apply(label_int2str)
df.head()

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


In [None]:
# Undo the earlier set_format(type = 'pandas') so the splits go back to their
# default output format before tokenization.
emotions.reset_format()

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and the model loaded later on.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batches):
  """Tokenize the 'text' field of a batch, padding within the batch and truncating long inputs."""
  texts = batches["text"]
  return tokenizer(texts, padding=True, truncation=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Sanity check: tokenize the first two training examples and inspect the padded ids and attention masks.
print(tokenize(emotions['train'][:2]))

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [None]:
# Tokenize every split. With batched = True and batch_size = None each split is
# handed to tokenize() as one batch, so padding = True pads all examples of a
# split to the same length.
emotion_encoded = emotions.map(tokenize, batched = True, batch_size = None)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Confirm that map() added the tokenizer outputs as new columns next to 'text' and 'label'.
emotion_encoded['train'].column_names

['text', 'label', 'input_ids', 'attention_mask']

<h2>2. Loading a pretrained model</h2>

In [None]:
import torch
import numpy as np
import pandas as pd

from transformers import AutoModelForSequenceClassification
# AutoModelForSequenceClassification has a classification head on top of the body, unlike the AutoModel used in the previous case.


In [None]:
# Number of output classes for the classification head.
num_labels = 6
# Use the GPU when one is available, otherwise fall back to the CPU.
# ('gpu' is not a valid torch device type, so the fallback must be 'cpu'.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained body with a freshly initialised classification head and move it to the device.
model = (AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = num_labels).to(device))

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<h2>3. Defining the performance metrics </h2>

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
  """Compute accuracy and weighted F1 for a prediction object.

  `pred` must expose `label_ids` (true labels) and `predictions` (per-class
  scores); the predicted class is the argmax over the last axis.
  Returns a dict with the keys 'accuracy' and 'f1'.
  """
  y_true = pred.label_ids
  y_pred = pred.predictions.argmax(-1)
  return {
      'accuracy': accuracy_score(y_true, y_pred),
      'f1': f1_score(y_true, y_pred, average = 'weighted'),
  }

Setting up: <br>
<ol>
<li>Log in to the Hugging Face Hub so that the fine-tuned model can be pushed to our account and shared with the community.</li>
<li>Define all the hyperparameters for the training phase.</li>
</ol>


<h2>4. Training the Model</h2>

In [None]:
from huggingface_hub import notebook_login
# Interactive login to the Hugging Face Hub; the training arguments below set
# push_to_hub = True, so the session has to be authenticated first.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import TrainingArguments, Trainer

batch_size = 64
# Number of full batches in the train split, i.e. log roughly once per epoch.
logging_steps = len(emotion_encoded['train']) // batch_size
# Used as the local output directory for this run.
model_name = f'{model_ckpt}-finetuned-emotion'

training_args = TrainingArguments(
    output_dir = model_name,
    # Optimisation settings
    num_train_epochs = 2,
    learning_rate = 2e-5,
    weight_decay = 0.01,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    # Evaluate on the validation set at the end of every epoch
    eval_strategy = 'epoch',
    # Logging / reporting
    logging_steps = logging_steps,
    log_level = 'error',
    disable_tqdm = False,
    # Upload the run to the Hugging Face Hub
    push_to_hub = True,
)

In [None]:
# Build the Trainer from the model, the hyperparameters, the metric function and
# the tokenized train/validation splits.
# The tokenizer is passed as `processing_class`: the older `tokenizer` keyword of
# Trainer is deprecated and emits a warning on construction.
trainer = Trainer(model = model,
                  args = training_args,
                  compute_metrics = compute_metrics,
                  train_dataset = emotion_encoded['train'],
                  eval_dataset = emotion_encoded['validation'],
                  processing_class = tokenizer)
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(model = model,
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnguyenvietkhoa1409[0m ([33mnguyenvietkhoa1409-university-of-economics[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8104,0.308186,0.9115,0.911189
2,0.2466,0.214689,0.929,0.928754


TrainOutput(global_step=500, training_loss=0.5285234756469727, metrics={'train_runtime': 265.4258, 'train_samples_per_second': 120.561, 'train_steps_per_second': 1.884, 'total_flos': 720342861696000.0, 'train_loss': 0.5285234756469727, 'epoch': 2.0})

The results show that the **fine-tuning method** yields a significant improvement over the previous method (adopting the whole pre-trained body with an added classification head).

<h3>Inspect model architecture and parameters</h3>

In [None]:
print(model)


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Count every parameter element in the model, and separately those that will
# receive gradients (requires_grad is True).
total_params = sum(weight.numel() for weight in model.parameters())
trainable_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)

print(f"Total Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")


Total Parameters: 66,958,086
Trainable Parameters: 66,958,086


<h2>5. Error Analysis </h2>

<h3>5.1 Test with an instance </h3>

In [None]:
# Key points:
# 1. Move the data to the correct device
# 2. Perform the forward pass without gradient computation
# 3. Compute the prediction and the loss
# 4. Return the results as a dictionary

In [None]:
# Take the first example of the tokenized validation split for a single-instance check.
val_sample = emotion_encoded['validation'][0]
print(val_sample)

{'text': 'im feeling quite sad and sorry for myself but ill snap out of it soon', 'label': 0, 'input_ids': [101, 10047, 3110, 3243, 6517, 1998, 3374, 2005, 2870, 2021, 5665, 10245, 2041, 1997, 2009, 2574, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [None]:
print(emotion_encoded['train'].column_names)

['text', 'label', 'input_ids', 'attention_mask']


In [None]:
# Turn the validation sample's fields into tensors on the model's device.
# unsqueeze(0) adds the leading batch dimension.
val_sample_tensor = {
    field: torch.tensor(val_sample[field]).unsqueeze(0).to(device)
    for field in ("label", "input_ids", "attention_mask")
}
val_sample_tensor.keys()

dict_keys(['label', 'input_ids', 'attention_mask'])

In [None]:
from torch.nn.functional import cross_entropy

# Make sure the model is in evaluation mode so dropout is disabled and the
# prediction and loss below are deterministic.
model.eval()

with torch.no_grad():
  # Keep only the entries whose key is one of the tokenizer's model input names.
  inputs = {k: v for k, v in val_sample_tensor.items() if k in tokenizer.model_input_names}
  outputs = model(**inputs)
  # Predicted class = index of the largest logit.
  pred_label = torch.argmax(outputs.logits, dim = -1)
  # reduction = 'none' keeps one loss value per example instead of averaging over the batch.
  loss = cross_entropy(outputs.logits, val_sample_tensor['label'], reduction = 'none')

# Convert label numbers to class names
true_label_str = label_int2str(val_sample["label"])
pred_label_str = label_int2str(pred_label.item())  # Convert tensor to integer

# Display results
print(f"Text: {val_sample['text']}")
print(f"True Label: {true_label_str}")
print(f"Predicted Label: {pred_label_str}")
print(f"Loss: {loss.item()}")


Text: im feeling quite sad and sorry for myself but ill snap out of it soon
True Label: sadness
Predicted Label: sadness
Loss: 0.02453184686601162
