In [1]:
%%capture --no-display

!pip install numpy torch datasets transformers scikit-learn peft

In [2]:
import os
import random

import numpy as np
import torch

seed=42
os.environ['PYTHONHASHSEED'] = str(seed)

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs

!git lfs install
!git clone https://huggingface.co/datasets/dair-ai/emotion emotion
!git clone https://huggingface.co/google-bert/bert-base-uncased bert-base-uncased

!cd emotion
!git lfs pull

Detected operating system as Ubuntu/jammy.
Checking for curl...
Detected curl...
Checking for gpg...
Detected gpg...
Detected apt version as 2.4.12
Running apt-get update... done.
Installing apt-transport-https... done.
Installing /etc/apt/sources.list.d/github_git-lfs.list...done.
Importing packagecloud gpg key... Packagecloud gpg key imported to /etc/apt/keyrings/github_git-lfs-archive-keyring.gpg
done.
Running apt-get update... done.

The repository is setup! You can now install packages.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.6.1).
The following package was automatically installed and is no longer required:
  libpciaccess0
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 203 not upgraded.
Git LFS initialized.
Cloning into 'emotion'...
remote: Enumerating objects: 79, done.[K
remote: Total 79 (delta 0), reused 0 (delta 0), pack-reused 79 (from 1)

In [4]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

dataset = load_dataset('parquet', data_files={
    'train': './emotion/split/train-00000-of-00001.parquet',
    'validation': './emotion/split/validation-00000-of-00001.parquet',
    'test': './emotion/split/test-00000-of-00001.parquet',
})

tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased')
base_model = BertForSequenceClassification.from_pretrained('./bert-base-uncased', num_labels=6)

for param in base_model.parameters():
    param.requires_grad = True

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 16000 examples [00:00, 769879.59 examples/s]
Generating validation split: 2000 examples [00:00, 412338.18 examples/s]
Generating test split: 2000 examples [00:00, 441831.24 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=502, return_tensors='pt')

tokenized_datasets = dataset.map(
    lambda examples: {
        **tokenize(examples),
        'labels': examples['label']
    }, 
    batched=True,
)

Map: 100%|██████████| 16000/16000 [00:15<00:00, 1057.92 examples/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1049.82 examples/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1042.64 examples/s]


In [6]:
import time

from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='weighted')
    }

def train_and_evaluate(trainer, model, tokenized_datasets, method_name):
    start_time = time.time()

    trainer.train()

    training_time = time.time() - start_time

    eval_results = trainer.evaluate(tokenized_datasets['validation'])

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    if torch.cuda.is_available():
        gpu_memory = torch.cuda.max_memory_allocated() / 1024**2
    else:
        gpu_memory = 0

    results = {
        'method': method_name,
        'accuracy': eval_results['eval_accuracy'],
        'f1_score': eval_results['eval_f1'],
        'training_time': training_time,
        'trainable_params': trainable_params,
        'gpu_memory': gpu_memory
    }

    return results

## Full fine-tuning

In [7]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    report_to='none',
)

full_finetuning_trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  full_finetuning_trainer = Trainer(
Detected kernel version 5.4.210, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Linear probing

In [8]:
from torch import nn

class LinearProbingHead(nn.Module):
    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.out_proj = nn.Linear(hidden_size, num_labels)

    def forward(self, x):
        x = self.dense(x)
        x = nn.functional.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x

linear_probing_model = base_model

linear_probing_model.classifier = LinearProbingHead(linear_probing_model.config.hidden_size, linear_probing_model.config.num_labels)

for param in linear_probing_model.bert.parameters():
    param.requires_grad = True

linear_probing_trainer = Trainer(
    model=linear_probing_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  linear_probing_trainer = Trainer(
Detected kernel version 5.4.210, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## PEFT with Prompt tuning

In [9]:
from transformers import BertModel
from peft import PromptTuningConfig, get_peft_model

config = PromptTuningConfig(
    task_type='SEQ_CLS',
    num_virtual_tokens=10,
    prompt_tuning_init='TEXT',
    prompt_tuning_init_text='Classify the emotion:',
    tokenizer_name_or_path='./bert-base-uncased',
)

prompt_tuning_model = get_peft_model(base_model, config)

prompt_tuning_trainer = Trainer(
    model=prompt_tuning_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  prompt_tuning_trainer = Trainer(
Detected kernel version 5.4.210, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## PEFT with Prefix tuning

In [10]:
from peft import PrefixTuningConfig, get_peft_model

config = PrefixTuningConfig(
    task_type='SEQ_CLS',
    num_virtual_tokens=10,
    encoder_hidden_size=768,
)

prefix_tuning_model = get_peft_model(base_model, config)

prefix_tuning_trainer = Trainer(
    model=prefix_tuning_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  prefix_tuning_trainer = Trainer(
Detected kernel version 5.4.210, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## PEFT with LoRA

In [11]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    task_type='SEQ_CLS',
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
)

lora_model = get_peft_model(base_model, config)

lora_trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  lora_trainer = Trainer(
Detected kernel version 5.4.210, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


**Про ранг**: 8 эмпирически оказалось оптимальным значением, если брать по степени двойки больше обучение занимает неприятно много времени (>5 часов T4), при 4 просаживалось качество, остальные параметры не трогал из за ограничений в квоте

In [13]:
all_results = []

# Full Fine-Tuning
results_full_finetuning = train_and_evaluate(full_finetuning_trainer, base_model, tokenized_datasets, method_name='Full fine-tuning')
all_results.append(results_full_finetuning)

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.2919,1.193927,0.5515,0.430617
2,1.1208,1.063789,0.599,0.50934
3,1.0915,1.015977,0.6285,0.561025


In [14]:
# Linear Probing
results_linear_probing = train_and_evaluate(linear_probing_trainer, linear_probing_model, tokenized_datasets, method_name='Linear probing')
all_results.append(results_linear_probing)

Epoch,Training Loss,Validation Loss


In [15]:
# PEFT с использованием Prompt Tuning
results_prompt_tuning = train_and_evaluate(prompt_tuning_trainer, prompt_tuning_model, tokenized_datasets, method_name='Prompt tuning')
all_results.append(results_prompt_tuning)

Epoch,Training Loss,Validation Loss


In [16]:
# PEFT с использованием Prompt Tuning
results_prompt_tuning = train_and_evaluate(prefix_tuning_trainer, prefix_tuning_model, tokenized_datasets, method_name='Prefix tuning')
all_results.append(results_prompt_tuning)

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6059,0.487572,0.815,0.809788
2,0.5178,0.452115,0.832,0.828124
3,0.5698,0.444304,0.838,0.834007


In [17]:
# PEFT с использованием LoRA
results_lora = train_and_evaluate(lora_trainer, lora_model, tokenized_datasets, method_name='LoRA')
all_results.append(results_lora)

Epoch,Training Loss,Validation Loss


In [28]:
import pandas as pd

pd.DataFrame(all_results)

Unnamed: 0,method,accuracy,f1_score,training_time,trainable_params,gpu_memory
0,Full fine-tuning,0.6285,0.561025,1211.641426,890118,4564.460938
1,Linear probing,0.732,0.706888,1224.886235,890118,4570.785156
2,Prompt tuning,0.8085,0.80118,1310.224491,897798,4570.785156
3,Prefix tuning,0.838,0.834007,1221.088016,1074438,4570.785156
4,LoRA,0.865,0.86332,1205.281711,890118,4593.141602


## Вывод
---

Пройдусь по каждому из параметров каждой модели:
1. Качество:
- Full fine-tuning - худшее качество на валидации (0.63 Accuracy / 0.56 F1), LoRA - лучшее (0.86 Accuracy / 0.86 F1), то есть видно, что LoRA работает лучше с дисбалансом классов в пуле
- У Prefix tuning'a все еще хорошее качество, но проблемы с кол-во параметров, всего скорее подойдет на все железки, придется шарить между несколькими

2. Время обучения:
- Каждый из подходов обучились в среднем за одинаковое время на Tesla T4, видно, что Prompt tuning немного дольще, но скорее выброс, Prefix tuning с самым большим кол-во параметров обучился внутри довер. интервала

3. Потребляемые ресурсы:
- У всех подходов кроме Prefix tuning'a (1074438 параметров) почти идентичное кол-во параметров, GPU памяти при это они занимают тоже одинаковое кол-во

Если смотреть сверху, то кажется, что лучше всего брать LoRA, объективно качество лучше без оверхеда по ресурсам