In [None]:
import os
import glob
import pickle
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset

# Ask PyTorch to release its cached CUDA memory before anything else runs
# (presumably to reclaim GPU memory left over from an earlier run in the
# same kernel — TODO confirm this is still needed).
torch.cuda.empty_cache()

In [None]:
# ---------------------------------------------------------
# Load the data
# ---------------------------------------------------------
dataset = load_dataset("imdb")

# Carve an evaluation subset out of the "train" split of the dataset.
# The fixed seed makes the shuffled split identical after every
# "Restart Kernel -> Run All", so evaluation numbers stay comparable
# between runs instead of being measured on a different subset each time.
train_data = dataset["train"].train_test_split(shuffle=True, seed=42)

In [None]:
# ---------------------------------------------------------
# Tokenization
# ---------------------------------------------------------
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize(batch):
  """Tokenize one batch of examples.

  Args:
    batch: Mapping whose "text" entry holds the raw strings to encode.

  Returns:
    The tokenizer output for the batch, truncated to at most 512 tokens
    per example and left unpadded.
  """
  # Padding is intentionally not applied here. With a batched map,
  # padding=True would pad every example to the longest text in the whole
  # map batch (usually the 512-token cap), and the DataCollatorWithPadding
  # used for training would then have nothing left to trim. Leaving the
  # sequences unpadded lets the collator pad each mini-batch only to its
  # own longest sequence.
  return tokenizer(batch["text"], truncation=True, max_length=512)

# Tokenize both splits in batches, then switch each one to PyTorch output
# restricted to the columns listed below.
tensor_columns = ("input_ids", "attention_mask", "label")

train_dataset = train_data["train"].map(tokenize, batched=True)
eval_dataset = train_data["test"].map(tokenize, batched=True)

for split in (train_dataset, eval_dataset):
  split.set_format(type="torch", columns=list(tensor_columns))

In [None]:
# ---------------------------------------------------------
# Evaluation and training
# ---------------------------------------------------------
# Seed PyTorch before the model is built: the 2-label classification head
# is not part of the "bert-base-uncased" checkpoint and is therefore
# initialized randomly. Without a seed here, every kernel restart would
# start fine-tuning from different head weights.
torch.manual_seed(42)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
data_collator = DataCollatorWithPadding(tokenizer)

# Freeze the first 2 encoder layers so their parameters receive no gradient updates.
for param in model.bert.encoder.layer[:2].parameters():
  param.requires_grad = False

training_args = TrainingArguments(
  # --- output locations ---
  output_dir="./results",             # directory where results are stored
  overwrite_output_dir=True,          # overwrite existing files
  logging_dir="./logs",               # directory for intermediate logs
  save_safetensors=False,             # disable use of safetensors
  # --- optimisation schedule ---
  num_train_epochs=2,                 # number of epochs
  warmup_steps=500,                   # learning rate ramps up gradually over this many steps
  weight_decay=0.01,                  # weight decay rate
  # --- batching ---
  per_device_train_batch_size=4,      # batch size during training
  per_device_eval_batch_size=4,       # batch size during evaluation
  # --- evaluation ---
  eval_strategy="steps",              # evaluate at a fixed step interval during training
)

# Bundle the model, its training configuration, both datasets and the
# collator into a single Trainer.
trainer = Trainer(
  model=model,                        # model to train
  args=training_args,                 # TrainingArguments configuration
  data_collator=data_collator,        # collator used to assemble batches
  train_dataset=train_dataset,        # training data
  eval_dataset=eval_dataset,          # evaluation data
)

# Run training, then a final evaluation; the evaluate() call is kept as the
# last expression so the notebook displays its result.
trainer.train()
trainer.evaluate()

In [None]:
# ---------------------------------------------------------
# Save the model
# ---------------------------------------------------------
model_path = "./model"

# exist_ok=True creates the directory only when it is missing, in a single
# call, instead of a separate existence check followed by a create.
os.makedirs(model_path, exist_ok=True)

# Store the model and the tokenizer side by side in the same directory.
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)