# Предобработка данных

## Небольшие примеры

Обучим всю модель (тело + голова)

In [25]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())

old_weights = {name: param.clone().detach() for name, param in model.named_parameters()}

model.train()
optimizer.zero_grad()
loss = model(**batch).loss
loss.backward()
optimizer.step()

new_weights = {name: param.clone().detach() for name, param in model.named_parameters()}

old_weights['classifier.weight'], new_weights[
    'classifier.weight']  # видны изменения в головах, но поменялись не только они, но и веса в теле

(tensor([[ 0.0138, -0.0222,  0.0421,  ...,  0.0305, -0.0367, -0.0273],
         [-0.0140, -0.0306,  0.0452,  ..., -0.0017, -0.0054, -0.0285]]),
 tensor([[ 0.0138, -0.0222,  0.0411,  ...,  0.0305, -0.0377, -0.0273],
         [-0.0140, -0.0306,  0.0462,  ..., -0.0017, -0.0044, -0.0285]]))

Обучим только голову, заморозив тело

In [26]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau

checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to("cuda")

for param in model.base_model.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt").to("cuda")
batch["labels"] = torch.tensor([1, 1]).to("cuda")

optimizer = AdamW(model.classifier.parameters(), lr=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.1, patience=3)

old_weights = {name: param.clone().detach() for name, param in model.named_parameters()}

model.train()
for _ in tqdm(range(100)):  # 100 эпох
    optimizer.zero_grad()

    loss = model(**batch).loss
    loss.backward()

    optimizer.step()
    scheduler.step(loss)

new_weights = {name: param.clone().detach() for name, param in model.named_parameters()}

old_weights['classifier.weight'], new_weights['classifier.weight']


  0%|          | 0/100 [00:00<?, ?it/s][A
 22%|██▏       | 22/100 [00:00<00:00, 212.88it/s][A
 54%|█████▍    | 54/100 [00:00<00:00, 273.45it/s][A
100%|██████████| 100/100 [00:00<00:00, 295.56it/s][A


(tensor([[ 0.0138, -0.0222,  0.0421,  ...,  0.0305, -0.0367, -0.0273],
         [-0.0140, -0.0306,  0.0452,  ..., -0.0017, -0.0054, -0.0285]],
        device='cuda:0'),
 tensor([[ 0.0138, -0.0222,  0.0420,  ...,  0.0304, -0.0368, -0.0273],
         [-0.0140, -0.0306,  0.0452,  ..., -0.0016, -0.0053, -0.0285]],
        device='cuda:0'))

## Загрузка датасета с Hub


In [27]:
from datasets import load_dataset

dataset = load_dataset("glue", "mrpc")
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [28]:
dataset.column_names

{'train': ['sentence1', 'sentence2', 'label', 'idx'],
 'validation': ['sentence1', 'sentence2', 'label', 'idx'],
 'test': ['sentence1', 'sentence2', 'label', 'idx']}

In [29]:
dataset["train"][:2]

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion ."],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 ."],
 'label': [1, 0],
 'idx': [0, 1]}

In [30]:
dataset["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

## Предобработка датасета

In [31]:
tokenized_dataset = tokenizer(dataset["train"]["sentence1"], dataset["train"]["sentence2"],
                              padding=True, truncation=True, )
tokenized_dataset.keys()

dict_keys(['input_ids', 'attention_mask'])

Это будет работать только если у нас достаточно оперативной памяти (RAM) для хранения целого датасета во время токенизации (в то время как датасеты из библиотеки Datasets являются Apache Arrow файлами, хранящимися на диске; они будут загружены только в тот момент, когда вы их будете запрашивать).

Чтобы хранить данные в формате датасета, мы будем использовать методы Dataset.map(). Это позволит нам сохранить высокую гибкость даже если нам нужно что-то большее, чем просто токенизация. Метод map() работает так: применяет некоторую функцию к каждому элементу датасетаДля данной проблемы есть решение - функция map. Она сохраняет исходные колонки и добавляет новые в зависимости от функции

In [32]:
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], padding="max_length", truncation=True, max_length=128)


tokenized_datasets = dataset.map(tokenize_function)  # по одной строке за раз
tokenized_datasets.column_names

{'train': ['sentence1',
  'sentence2',
  'label',
  'idx',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'validation': ['sentence1',
  'sentence2',
  'label',
  'idx',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'test': ['sentence1',
  'sentence2',
  'label',
  'idx',
  'input_ids',
  'token_type_ids',
  'attention_mask']}

**batched=True** ускоряет обработку за счёт групповой токенизации.

In [33]:
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], padding="max_length", truncation=True,
                     max_length=128)  # фиксированная длинна после токенизации - 128


tokenized_datasets = dataset.map(tokenize_function, batched=True)  # функция вызывается один раз на batch
tokenized_datasets.column_names

{'train': ['sentence1',
  'sentence2',
  'label',
  'idx',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'validation': ['sentence1',
  'sentence2',
  'label',
  'idx',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'test': ['sentence1',
  'sentence2',
  'label',
  'idx',
  'input_ids',
  'token_type_ids',
  'attention_mask']}

Удалим ненужные столбцы

In [34]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets = tokenized_datasets.with_format('torch')

tokenized_datasets.column_names

{'train': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 'validation': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 'test': ['labels', 'input_ids', 'token_type_ids', 'attention_mask']}

Можно создать датасет меньшего размера, используя **select**

In [35]:
tokenized_datasets['train'].select(range(100))

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

Если надо обрабатывать столбцы как разные

In [36]:
tokenizer('zaza', 'berg')

{'input_ids': [101, 195, 10961, 1161, 102, 1129, 10805, 102], 'token_type_ids': [0, 0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [37]:
tokenizer(['zaza', 'berg'])

{'input_ids': [[101, 195, 10961, 1161, 102], [101, 1129, 10805, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1]]}

Как можно заметить передача столбцов в тупую в токенайзер не работает

In [38]:
tokenizer([dataset["train"]["sentence1"], dataset["train"]["sentence2"]], padding=True, truncation=True, )

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

Если не использовать batch, то можно и так, ибо map работает построчно

In [39]:
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer([example["sentence1"], example["sentence2"]], padding="max_length", truncation=True,
                     max_length=128)


tokenized_datasets = dataset.map(tokenize_function)  # по одной строке за раз
tokenized_datasets['train']['token_type_ids'][0]

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

Других способов много, можно спокойно заколхозить

## Dynamic padding

Ранее мы использовали фиксированную длину для токенизатора. Чтобы в случае батчинга не заполнять массивы большим количеством пустых значений воспользуемся **DataCollatorWithPadding**

In [40]:
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)  # убираем padding


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")

In [41]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True,
                              collate_fn=data_collator)

for step, batch in enumerate(train_dataloader):
    print(batch["input_ids"].shape)
    if step > 5:
        break

torch.Size([16, 72])
torch.Size([16, 78])
torch.Size([16, 80])
torch.Size([16, 111])
torch.Size([16, 80])
torch.Size([16, 73])
torch.Size([16, 82])


In [42]:
next(iter(train_dataloader))

{'labels': tensor([1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1]), 'input_ids': tensor([[  101,  1960,  6449,  ...,     0,     0,     0],
        [  101,  1244,  8570,  ...,  1177,   119,   102],
        [  101,  1109,  2084,  ...,     0,     0,     0],
        ...,
        [  101, 13857,  1144,  ...,     0,     0,     0],
        [  101,  4180,   118,  ...,     0,     0,     0],
        [  101,  1252,  1103,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

# Trainer

## Базовая работа

In [43]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test-trainer",
                                  num_train_epochs=10,
                                  per_device_train_batch_size=30,
                                  per_device_eval_batch_size=30,
                                  # eval_strategy='steps',
                                  eval_strategy='epoch',
                                  learning_rate=1e-5,
                                  weight_decay=0.01,
                                  use_cpu=False)

После создания экземпляра предобученной модели будет распечатано предупреждение. Это происходит потому, что BERT не был предобучен для задачи классификации пар предложений, его последний слой не будет использован, вместо него будет добавлен слой, позволяющий работать с такой задачей. Предупреждения сообщают, что некоторые веса не будут использованы (как раз тех слоев, которые не будут использоваться) и для новых будут инициализированы случайные веса. В заключении предлагается обучить модель, что мы и сделаем прямо сейчас.

In [44]:
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Можно заметить, что к модели автоматически добавился линейный слой в конец

In [45]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [46]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.524259
2,No log,0.412525
3,No log,0.449286
4,No log,0.469331
5,0.371700,0.54033
6,0.371700,0.58787
7,0.371700,0.61064
8,0.371700,0.639322
9,0.080600,0.635914
10,0.080600,0.647751


TrainOutput(global_step=1230, training_loss=0.19131156487193535, metrics={'train_runtime': 254.8007, 'train_samples_per_second': 143.956, 'train_steps_per_second': 4.827, 'total_flos': 1562116030111320.0, 'train_loss': 0.19131156487193535, 'epoch': 10.0})

In [47]:
predictions = trainer.predict(tokenized_datasets["test"])
predictions.predictions.shape, predictions.label_ids.shape

((1725, 2), (1725,))

In [48]:
predictions.predictions[:5, :]

array([[-2.4772704,  2.603379 ],
       [-2.0506606,  1.9318771],
       [-2.4952757,  2.5817323],
       [-1.7328042,  1.3866795],
       [ 2.3559062, -2.6548834]], dtype=float32)

In [52]:
predictions.label_ids

array([1, 1, 1, ..., 0, 1, 1], shape=(1725,))

In [53]:
from torch import argmax, tensor

preds = argmax(tensor(predictions.predictions), dim=-1)
preds, preds.shape

(tensor([1, 1, 1,  ..., 1, 1, 1]), torch.Size([1725]))

Посмотрим на метрики

In [54]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8289855072463768, 'f1': 0.8735533647663952}

## Работа с учетом метрик

In [55]:
from torch import argmax, tensor
import evaluate


def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = argmax(tensor(logits), dim=-1)
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.583257,0.759804,0.850153
2,0.571400,0.456783,0.813725,0.87291
3,0.355300,0.748559,0.823529,0.876712


TrainOutput(global_step=1377, training_loss=0.3969130671881383, metrics={'train_runtime': 130.7957, 'train_samples_per_second': 84.131, 'train_steps_per_second': 10.528, 'total_flos': 419446300011600.0, 'train_loss': 0.3969130671881383, 'epoch': 3.0})

In [56]:
import evaluate
from torch import argmax, tensor

predictions = trainer.predict(tokenized_datasets["test"])
preds = argmax(tensor(predictions.predictions), dim=-1)

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8220289855072463, 'f1': 0.8723492723492724}

# Pytorch Training

In [57]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer)


def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)  # убираем padding


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")

In [58]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator, pin_memory=True
    # ускорит перекид с CPU на GPU
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=16, collate_fn=data_collator, pin_memory=True
    # ускорит перекид с CPU на GPU
)

for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 72]),
 'token_type_ids': torch.Size([16, 72]),
 'attention_mask': torch.Size([16, 72])}

In [59]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
outputs = model(**batch)
outputs.loss.item(), outputs.logits.shape

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(0.8421317338943481, torch.Size([16, 2]))

In [60]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [61]:
from torch.optim import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)  # количество эпох * количество батчей
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
num_training_steps

690

In [62]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/690 [00:00<?, ?it/s]

In [63]:
import evaluate

metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8529411764705882, 'f1': 0.8969072164948454}

In [64]:
import evaluate
from torch import argmax, tensor

predictions = trainer.predict(tokenized_datasets["test"])
preds = argmax(tensor(predictions.predictions), dim=-1)

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8220289855072463, 'f1': 0.8723492723492724}

## Ускорение работы с помощью accelerate

In [92]:
from accelerate import Accelerator, notebook_launcher
from transformers import AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

optimizer = AdamW(model.parameters(), lr=3e-5)

accelerator = Accelerator()
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 5
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


def train_with_accelerator(model):
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dl:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)


notebook_launcher(train_with_accelerator, args=(model,), num_processes=1)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Launching training on one GPU.


  0%|          | 0/1150 [00:00<?, ?it/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [100]:
import evaluate
from torch import inference_mode

metric = evaluate.load("glue", "mrpc")

@inference_mode()
def eval_with_accelerator(model,metric):
    model.eval()

    for batch in eval_dl:
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        metric.add_batch(
            predictions=accelerator.gather(predictions),
            references=accelerator.gather(batch["labels"]))



notebook_launcher(eval_with_accelerator, args=(model,metric,), num_processes=1)
metric.compute()

Launching training on one GPU.


{'accuracy': 0.8406862745098039, 'f1': 0.8877374784110535}