In [1]:
import os
import torch
from torch.utils.data import DataLoader, random_split
from transformers import BertJapaneseTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time

# モデルとトークナイザの準備
model_name = "cl-tohoku/bert-base-japanese-v3"
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# モデルのパラメータ数を出力
model_parameters = sum(p.numel() for p in model.parameters())
print(f"Number of model parameters: {model_parameters}")


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

Number of model parameters: 111208706


In [2]:
# データの読み込みと前処理
def load_data(directory):
    texts, labels = [], []
    for label, category in enumerate(["dokujo-tsushin", "it-life-hack"]):
        category_dir = os.path.join(directory, category)
        for file in os.listdir(category_dir):
            if os.path.isfile(os.path.join(category_dir, file)):
                with open(os.path.join(category_dir, file), 'r', encoding='utf-8') as f:
                    lines = f.readlines()[2:]  # 最初の2行をスキップ
                    text = ''.join(lines).strip()
                    texts.append(text)
                    labels.append(label)
    return texts, labels

home_dir = os.path.expanduser("~")
data_dir = os.path.join(home_dir, "data/livedoor-text")
texts, labels = load_data(data_dir)

# データセットの分割
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.4, random_state=1)
val_texts, test_texts, val_labels, test_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=1)

# データセットの準備
class LivedoorDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

train_dataset = LivedoorDataset(train_encodings, train_labels)
val_dataset = LivedoorDataset(val_encodings, val_labels)
test_dataset = LivedoorDataset(test_encodings, test_labels)

print(f"{len(train_dataset)=}")
print(f"{train_texts[0]=}")
print(f"{train_labels[0]=}")


len(train_dataset)=1045
train_texts[0]='結婚しても働くのはなぜ？ 既婚女性のつぶやき\n\u3000「彼の収入が少ないから私も働かなければならないし、それを思うと結婚はもう少し先でもいいかな」と結婚を躊躇する独女がいる。彼女は彼の収入だけで暮らせるのなら、仕事は今すぐにでも辞めたいらしい。つまり専業主婦志望なのだが、彼の年収を聞いて首を傾げた。\n\n\u3000この金額で本当に生活ができないのだろうか？ \n\n\u3000かつて専業主婦が多かった時代、主婦の働き先はなく、今月もかつかつだとこぼしながらも、夫の稼ぎだけで暮らしていた家庭が多かった。しかし今は不況で夫の収入が減ったとはいえ、外食、ブランド品購入、安いツアーとはいえ海外旅行にも行っている。食べるだけで精一杯の昔に比べれば、ものすごく贅沢ではないだろうか？\n\u3000\n\u3000成人した二人の子供がいる専業主婦の紀世子さん（56 歳）は、「今は専業主婦がセレブのように言われますけど、私はブランド品も持ったことがなければ、家族で海外旅行にも行ったことがないんですよ。夫の収入だけで充分とはいいませんけど、贅沢さえしなければ毎月何とかなったものです」という。\n\u3000\n\u3000子供が小学校に入学すると、塾の費用を捻出するためにパートに行く主婦もいたが、紀世子さんの家庭はご主人の方針で塾には一切通わせず、兄は水泳、妹は習字と、週に一度の習い事に通わせただけだそうだ。\n\n\u3000「私立中学受験で塾に通わせているご家庭は大変そうでしたよ。塾の費用が一か月5万円と聞いてびっくりしました。そこまでして私立に行かせて、その後も莫大な教育費がかかるのに大変だとあと思いました」\n\n\u3000紀世子さんの長女は私立の女子大学に入学したが、中学・高校から持ち上がりできた友人には小学校の時の同級生もいる。「中高一貫教育の必要性はよく分かりませんが、結局同じ大学に通うなら何も高い教育費を払って中学から行く必要がないのでは？」これは私の考えですがと紀世子さん。\n\n\u3000仕事に生きがいを持ち自分のために働いている主婦もいるが、家族で海外旅行に行ったり外食をしたり、生活水準を上げるために働いている主婦もいる。自分の稼ぎでブランド品を買う主婦もいるが、やはり

In [3]:
# トレーニングの設定
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# トレーニングの実行
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# モデルの保存
model.save_pretrained("./finetuned_bert_japanese")

# 評価
def evaluate_model(model, dataset, batch_size=8):
    dataloader = DataLoader(dataset, batch_size=batch_size)
    model.eval()
    total, correct = 0, 0
    start_time = time.time()
    for batch in dataloader:
        inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(model.device)
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total += labels.size(0)
        correct += (predictions == labels).sum().item()
    elapsed_time = time.time() - start_time
    return correct / total, elapsed_time

train_accuracy, train_time = evaluate_model(model, train_dataset)
val_accuracy, val_time = evaluate_model(model, val_dataset)
test_accuracy, test_time = evaluate_model(model, test_dataset)

print(f"Train Accuracy: {train_accuracy}, Time: {train_time}")
print(f"Validation Accuracy: {val_accuracy}, Time: {val_time}")
print(f"Test Accuracy: {test_accuracy}, Time: {test_time}")



  0%|          | 0/2620 [00:00<?, ?it/s]

{'loss': 0.6939, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.08}
{'loss': 0.6485, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.15}
{'loss': 0.6016, 'learning_rate': 3e-06, 'epoch': 0.23}
{'loss': 0.4639, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.31}
{'loss': 0.4111, 'learning_rate': 5e-06, 'epoch': 0.38}
{'loss': 0.2809, 'learning_rate': 6e-06, 'epoch': 0.46}
{'loss': 0.2223, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.53}
{'loss': 0.1178, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.61}
{'loss': 0.0928, 'learning_rate': 9e-06, 'epoch': 0.69}
{'loss': 0.0277, 'learning_rate': 1e-05, 'epoch': 0.76}
{'loss': 0.0405, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.84}
{'loss': 0.0107, 'learning_rate': 1.2e-05, 'epoch': 0.92}
{'loss': 0.0666, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.99}


  0%|          | 0/44 [00:00<?, ?it/s]

{'eval_loss': 0.12046581506729126, 'eval_runtime': 97.0742, 'eval_samples_per_second': 3.585, 'eval_steps_per_second': 0.453, 'epoch': 1.0}
{'loss': 0.0047, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.07}
{'loss': 0.0096, 'learning_rate': 1.5e-05, 'epoch': 1.15}
{'loss': 0.055, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.22}
{'loss': 0.1524, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.3}
{'loss': 0.0024, 'learning_rate': 1.8e-05, 'epoch': 1.37}
{'loss': 0.0243, 'learning_rate': 1.9e-05, 'epoch': 1.45}
{'loss': 0.0058, 'learning_rate': 2e-05, 'epoch': 1.53}
{'loss': 0.0031, 'learning_rate': 2.1e-05, 'epoch': 1.6}
{'loss': 0.0008, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.68}
{'loss': 0.0606, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.76}
{'loss': 0.0009, 'learning_rate': 2.4e-05, 'epoch': 1.83}
{'loss': 0.0035, 'learning_rate': 2.5e-05, 'epoch': 1.91}
{'loss': 0.0117, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.98}


  0%|          | 0/44 [00:00<?, ?it/s]

{'eval_loss': 0.046354349702596664, 'eval_runtime': 96.3035, 'eval_samples_per_second': 3.614, 'eval_steps_per_second': 0.457, 'epoch': 2.0}
{'loss': 0.001, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.06}
{'loss': 0.0005, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.14}
{'loss': 0.0004, 'learning_rate': 2.9e-05, 'epoch': 2.21}
{'loss': 0.0003, 'learning_rate': 3e-05, 'epoch': 2.29}
{'loss': 0.0004, 'learning_rate': 3.1e-05, 'epoch': 2.37}
{'loss': 0.0003, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.44}
{'loss': 0.0003, 'learning_rate': 3.3e-05, 'epoch': 2.52}
{'loss': 0.0002, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.6}
{'loss': 0.0002, 'learning_rate': 3.5e-05, 'epoch': 2.67}
{'loss': 0.0002, 'learning_rate': 3.6e-05, 'epoch': 2.75}
{'loss': 0.0046, 'learning_rate': 3.7e-05, 'epoch': 2.82}
{'loss': 0.0003, 'learning_rate': 3.8e-05, 'epoch': 2.9}
{'loss': 0.0541, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.98}


  0%|          | 0/44 [00:00<?, ?it/s]

{'eval_loss': 0.04289337247610092, 'eval_runtime': 96.4297, 'eval_samples_per_second': 3.609, 'eval_steps_per_second': 0.456, 'epoch': 3.0}
{'loss': 0.0044, 'learning_rate': 4e-05, 'epoch': 3.05}
{'loss': 0.0004, 'learning_rate': 4.1e-05, 'epoch': 3.13}
{'loss': 0.0674, 'learning_rate': 4.2e-05, 'epoch': 3.21}
{'loss': 0.0084, 'learning_rate': 4.3e-05, 'epoch': 3.28}
{'loss': 0.0579, 'learning_rate': 4.4000000000000006e-05, 'epoch': 3.36}
{'loss': 0.004, 'learning_rate': 4.5e-05, 'epoch': 3.44}
{'loss': 0.0075, 'learning_rate': 4.600000000000001e-05, 'epoch': 3.51}
{'loss': 0.0003, 'learning_rate': 4.7e-05, 'epoch': 3.59}
{'loss': 0.0692, 'learning_rate': 4.8e-05, 'epoch': 3.66}
{'loss': 0.0539, 'learning_rate': 4.9e-05, 'epoch': 3.74}
{'loss': 0.0027, 'learning_rate': 5e-05, 'epoch': 3.82}
{'loss': 0.0001, 'learning_rate': 4.976415094339622e-05, 'epoch': 3.89}
{'loss': 0.0001, 'learning_rate': 4.952830188679246e-05, 'epoch': 3.97}


  0%|          | 0/44 [00:00<?, ?it/s]

{'eval_loss': 0.07540009170770645, 'eval_runtime': 94.5978, 'eval_samples_per_second': 3.679, 'eval_steps_per_second': 0.465, 'epoch': 4.0}
{'loss': 0.0003, 'learning_rate': 4.929245283018868e-05, 'epoch': 4.05}
{'loss': 0.0001, 'learning_rate': 4.9056603773584906e-05, 'epoch': 4.12}
{'loss': 0.0001, 'learning_rate': 4.8820754716981134e-05, 'epoch': 4.2}
{'loss': 0.0001, 'learning_rate': 4.858490566037736e-05, 'epoch': 4.27}
{'loss': 0.0001, 'learning_rate': 4.834905660377358e-05, 'epoch': 4.35}
{'loss': 0.0001, 'learning_rate': 4.811320754716982e-05, 'epoch': 4.43}
{'loss': 0.0001, 'learning_rate': 4.787735849056604e-05, 'epoch': 4.5}
{'loss': 0.0001, 'learning_rate': 4.7641509433962266e-05, 'epoch': 4.58}
{'loss': 0.0, 'learning_rate': 4.7405660377358494e-05, 'epoch': 4.66}
{'loss': 0.0001, 'learning_rate': 4.716981132075472e-05, 'epoch': 4.73}
{'loss': 0.0, 'learning_rate': 4.693396226415094e-05, 'epoch': 4.81}
{'loss': 0.0, 'learning_rate': 4.669811320754717e-05, 'epoch': 4.89}
{'l

  0%|          | 0/44 [00:00<?, ?it/s]

{'eval_loss': 0.08540798723697662, 'eval_runtime': 96.1918, 'eval_samples_per_second': 3.618, 'eval_steps_per_second': 0.457, 'epoch': 5.0}
{'loss': 0.0, 'learning_rate': 4.6226415094339625e-05, 'epoch': 5.04}
{'loss': 0.0, 'learning_rate': 4.5990566037735846e-05, 'epoch': 5.11}
{'loss': 0.0, 'learning_rate': 4.575471698113208e-05, 'epoch': 5.19}
{'loss': 0.0, 'learning_rate': 4.55188679245283e-05, 'epoch': 5.27}
{'loss': 0.0, 'learning_rate': 4.528301886792453e-05, 'epoch': 5.34}
{'loss': 0.0, 'learning_rate': 4.504716981132076e-05, 'epoch': 5.42}
{'loss': 0.0, 'learning_rate': 4.4811320754716985e-05, 'epoch': 5.5}
{'loss': 0.0, 'learning_rate': 4.4575471698113206e-05, 'epoch': 5.57}
{'loss': 0.0, 'learning_rate': 4.433962264150944e-05, 'epoch': 5.65}
{'loss': 0.0002, 'learning_rate': 4.410377358490566e-05, 'epoch': 5.73}
{'loss': 0.0, 'learning_rate': 4.386792452830189e-05, 'epoch': 5.8}
{'loss': 0.0, 'learning_rate': 4.363207547169812e-05, 'epoch': 5.88}
{'loss': 0.0, 'learning_rate

  0%|          | 0/44 [00:00<?, ?it/s]

{'eval_loss': 0.08984164893627167, 'eval_runtime': 96.447, 'eval_samples_per_second': 3.608, 'eval_steps_per_second': 0.456, 'epoch': 6.0}
{'train_runtime': 8791.3103, 'train_samples_per_second': 2.377, 'train_steps_per_second': 0.298, 'train_loss': 0.055398016048288125, 'epoch': 6.0}
Train Accuracy: 1.0, Time: 290.0567800998688
Validation Accuracy: 0.9885057471264368, Time: 96.65163064002991
Test Accuracy: 0.9885386819484241, Time: 97.23863410949707
