In [3]:
import warnings
warnings.filterwarnings("ignore")

# Load dataset
The **Emotion** dataset is a collection of English social media posts labeled with one of six basic emotions: **anger, fear, joy, love, sadness,** and **surprise**. It is commonly used for training and evaluating emotion classification models in natural language processing tasks.


In [5]:
from datasets import load_dataset, DatasetDict

dataset = load_dataset("emotion", trust_remote_code=True)

README.md:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [7]:
label_list = dataset['train'].features['label'].names
print(label_list)

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [8]:
model_name = "bert-base-uncased"
num_labels = len(label_list)

# Data preprocessing
Use BertTokenizer to tokenize the text

In [9]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
def preprocess(example):
  return tokenizer(example['text'], truncation=True, padding=True, return_tensors="pt")

tokenized_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

# Set up Training arguments and Trainer

We use a unified training arguments, compute metrics, and datacollator throughout this task for all following models.

In [11]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="./logs",
    label_names=['labels'],
    metric_for_best_model="eval_f1",
    report_to="tensorboard",
    load_best_model_at_end=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

2025-06-01 05:44:06.799493: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748756647.023157      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748756647.087592      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [12]:
from evaluate import load
import numpy as np

metric = load("accuracy")
f1_metric = load("f1")
precision_metric = load("precision")
recall_metric = load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": metric.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"],
        "precision": precision_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"],
        "recall": recall_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"],
    }


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

In [13]:
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Full fine-tuning

In [14]:
from peft import get_peft_model, TaskType
from transformers import BertForSequenceClassification

In [15]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
import numpy as np

def count_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in model.parameters())
    trainable_percentage = 100 * trainable_params / all_params

    print(f"trainable params: {trainable_params:,} || all params: {all_params:,} || trainable%: {trainable_percentage:.4f}")

count_trainable_parameters(model)

trainable params: 109,486,854 || all params: 109,486,854 || trainable%: 100.0000


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4368,0.186824,0.9265,0.927241,0.931555,0.9265
2,0.1388,0.168374,0.937,0.93559,0.938484,0.937
3,0.1173,0.153666,0.9395,0.939747,0.940408,0.9395
4,0.0964,0.14856,0.938,0.938323,0.940497,0.938
5,0.0765,0.215513,0.9385,0.938287,0.938839,0.9385
6,0.0526,0.244514,0.936,0.935299,0.936156,0.936


TrainOutput(global_step=3000, training_loss=0.15306032498677571, metrics={'train_runtime': 1099.8996, 'train_samples_per_second': 727.339, 'train_steps_per_second': 22.729, 'total_flos': 4233000456579456.0, 'train_loss': 0.15306032498677571, 'epoch': 6.0})

In [18]:
result = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
for key, value in result.items():
    print(f"{key}: {value:.4f}")
model.save_pretrained("./seq-full-model");

eval_loss: 0.1579
eval_accuracy: 0.9260
eval_f1: 0.9264
eval_precision: 0.9276
eval_recall: 0.9260
eval_runtime: 6.2680
eval_samples_per_second: 319.0810
eval_steps_per_second: 10.0510
epoch: 6.0000


# Partial fine-tuning
Freeze base model, only train the classifier head

In [19]:
frozen_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
for param in frozen_model.bert.parameters():
    param.requires_grad = False
count_trainable_parameters(frozen_model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,614 || all params: 109,486,854 || trainable%: 0.0042


In [20]:
frozen_trainer = Trainer(
    model=frozen_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

frozen_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.5878,1.56452,0.3615,0.223369,0.23232,0.3615
2,1.5569,1.552792,0.393,0.288381,0.246638,0.393
3,1.5485,1.545106,0.4085,0.29839,0.261715,0.4085
4,1.5376,1.535379,0.4305,0.330076,0.268431,0.4305
5,1.5291,1.523861,0.428,0.316144,0.276651,0.428
6,1.5249,1.519795,0.4445,0.342093,0.278109,0.4445
7,1.5165,1.509469,0.447,0.33754,0.281266,0.447
8,1.5115,1.504462,0.454,0.346199,0.282746,0.454
9,1.5062,1.499002,0.454,0.344405,0.284413,0.454
10,1.5004,1.497913,0.4555,0.351194,0.285762,0.4555


TrainOutput(global_step=15000, training_loss=1.4886580403645833, metrics={'train_runtime': 2679.1026, 'train_samples_per_second': 298.607, 'train_steps_per_second': 9.331, 'total_flos': 2.1164064916098816e+16, 'train_loss': 1.4886580403645833, 'epoch': 30.0})

In [21]:
result = frozen_trainer.evaluate(eval_dataset=tokenized_dataset["test"])
for key, value in result.items():
    print(f"{key}: {value:.4f}")
frozen_model.save_pretrained("./seq-frozen-model");

eval_loss: 1.4105
eval_accuracy: 0.4775
eval_f1: 0.3716
eval_precision: 0.3722
eval_recall: 0.4775
eval_runtime: 6.1727
eval_samples_per_second: 324.0050
eval_steps_per_second: 10.2060
epoch: 30.0000


# PEFT-LoRA

In [22]:
from peft import get_peft_model, LoraConfig, TaskType

base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

lora_config =  LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "key", "value"],
)

lora_model = get_peft_model(base_model, lora_config)
for param in lora_model.classifier.parameters():
    param.requires_grad = True

lora_model.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 451,596 || all params: 109,933,836 || trainable%: 0.4108


In [23]:
lora_trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

lora_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.3906,1.050932,0.6195,0.519372,0.461206,0.6195
2,0.9272,0.79254,0.6945,0.640537,0.621333,0.6945
3,0.7232,0.58504,0.773,0.735847,0.78841,0.773
4,0.5591,0.440495,0.8545,0.847541,0.856815,0.8545
5,0.4434,0.334152,0.8855,0.883569,0.884845,0.8855
6,0.3618,0.289214,0.91,0.909167,0.909327,0.91
7,0.3082,0.253922,0.9175,0.917784,0.91876,0.9175
8,0.2767,0.232615,0.9235,0.924251,0.926196,0.9235
9,0.2425,0.216598,0.9255,0.92553,0.926302,0.9255
10,0.2227,0.208558,0.9255,0.926519,0.929911,0.9255


TrainOutput(global_step=9500, training_loss=0.3662460166529605, metrics={'train_runtime': 2858.5324, 'train_samples_per_second': 279.864, 'train_steps_per_second': 8.746, 'total_flos': 1.347206587841664e+16, 'train_loss': 0.3662460166529605, 'epoch': 19.0})

In [24]:
result = lora_trainer.evaluate(eval_dataset=tokenized_dataset["test"])
for key, value in result.items():
    print(f"{key}: {value:.4f}")
lora_model.save_pretrained("./seq-lora-model");

eval_loss: 0.1794
eval_accuracy: 0.9220
eval_f1: 0.9232
eval_precision: 0.9271
eval_recall: 0.9220
eval_runtime: 7.1103
eval_samples_per_second: 281.2830
eval_steps_per_second: 8.8600
epoch: 19.0000


## Fine-tuning Strategies Comparison

| Strategy              | Trainable Params | Epochs | Train Runtime (s) | Eval Accuracy | Eval Precision | Eval Recall | Eval F1  | Eval Loss |
|-----------------------|------------------|--------|--------------------|---------------|----------------|-------------|----------|-----------|
| Full Fine-tuning      | 109,486,854      | 6      | 1099.90            | **0.9260**     | **0.9276**      | 0.9260      | **0.9264** | 0.1579    |
| Classifier Only       | 4,614            | 30     | 2679.10            | 0.4775        | 0.3722         | 0.4775      | 0.3716   | 1.4105    |
| LoRA                  | 451,596          | 19     | 2858.53            | 0.9220        | 0.9271         | **0.9220**   | 0.9232   | **0.1794** |

## Conclusion

- **Full fine-tuning** achieves the highest F1 score and overall accuracy, with all model parameters updated. It is the most performant option but also the most resource-intensive.
- **LoRA** achieves **nearly identical performance** to full fine-tuning while training only **0.41%** of the parameters. It offers the best trade-off between efficiency and effectiveness.
- **Classifier-only fine-tuning** fails to learn effectively, with poor accuracy and F1 despite a longer training time and more epochs. It is not suitable for this task.