## Fine-tuning

In [1]:
import torch

# Report the installed PyTorch build.
print(torch.__version__)

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

2.0.1+cu117
cuda:0


In [2]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset

# Prepare the tokenizer and the BERT model.
# Both are loaded from the same 'bert-base-uncased' checkpoint; the model is a
# sequence classifier configured for two labels.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Download the IMDB dataset and preprocess it
dataset = load_dataset('imdb')

def tokenize(batch):
    """Tokenize the ``'text'`` entries of a batch of examples.

    Args:
        batch: A batch of examples as passed by ``Dataset.map(..., batched=True)``;
            only ``batch['text']`` is read.

    Returns:
        The tokenizer's output for the batch, with every sequence truncated
        and padded to the tokenizer's maximum length.
    """
    # padding='max_length' instead of padding=True: padding=True pads only to
    # the longest text in the *current* map batch, so rows produced by
    # different map batches can end up with different lengths. Padding to the
    # fixed maximum length keeps every row the same size, so any set of rows
    # can be stacked into one tensor when batches are built for training.
    return tokenizer(batch['text'], padding='max_length', truncation=True)

train_dataset = dataset['train'].map(tokenize, batched=True)
test_dataset = dataset['test'].map(tokenize, batched=True)

# Return only these columns, as torch tensors, when the datasets are indexed.
tensor_columns = ['input_ids', 'attention_mask', 'label']
train_dataset.set_format('torch', columns=tensor_columns)
test_dataset.set_format('torch', columns=tensor_columns)

In [4]:
# Display the test split's summary (feature names and number of rows).
test_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [8]:
# Training configuration
training_args = TrainingArguments(
    # Where outputs and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Schedule
    num_train_epochs=3,
    warmup_steps=500,
    # Batch sizes; gradients are accumulated over 2 batches of 4, i.e.
    # 8 examples per device for each optimizer step
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=16,
    # Regularization
    weight_decay=0.01,
)

# Initialize the trainer and start training
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Kept as the cell's last expression so the returned TrainOutput is displayed.
trainer.train()

  0%|          | 0/9375 [00:00<?, ?it/s]

{'loss': 0.4668, 'learning_rate': 5e-05, 'epoch': 0.16}
{'loss': 0.36, 'learning_rate': 4.71830985915493e-05, 'epoch': 0.32}
{'loss': 0.3425, 'learning_rate': 4.436619718309859e-05, 'epoch': 0.48}
{'loss': 0.321, 'learning_rate': 4.154929577464789e-05, 'epoch': 0.64}
{'loss': 0.2957, 'learning_rate': 3.8732394366197184e-05, 'epoch': 0.8}
{'loss': 0.2968, 'learning_rate': 3.5915492957746486e-05, 'epoch': 0.96}
{'loss': 0.2073, 'learning_rate': 3.3098591549295775e-05, 'epoch': 1.12}
{'loss': 0.2059, 'learning_rate': 3.028169014084507e-05, 'epoch': 1.28}
{'loss': 0.1906, 'learning_rate': 2.746478873239437e-05, 'epoch': 1.44}
{'loss': 0.1981, 'learning_rate': 2.4647887323943664e-05, 'epoch': 1.6}
{'loss': 0.1749, 'learning_rate': 2.1830985915492956e-05, 'epoch': 1.76}
{'loss': 0.1839, 'learning_rate': 1.9014084507042255e-05, 'epoch': 1.92}
{'loss': 0.1278, 'learning_rate': 1.619718309859155e-05, 'epoch': 2.08}
{'loss': 0.0868, 'learning_rate': 1.3380281690140845e-05, 'epoch': 2.24}
{'loss'

TrainOutput(global_step=9375, training_loss=0.2035408331298828, metrics={'train_runtime': 2544.3499, 'train_samples_per_second': 29.477, 'train_steps_per_second': 3.685, 'train_loss': 0.2035408331298828, 'epoch': 3.0})