<a href="https://colab.research.google.com/github/nukano0522/pytorch/blob/master/livedoor_news_cls/bert_livedoor_with_hf_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://qiita.com/m__k/items/2c4e476d7ac81a3a44af

In [1]:
! pip install transformers fugashi ipadic accelerate==0.20.3

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fugashi
  Downloading fugashi-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (599 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m599.9/599.9 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ipadic
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate==0.20.3
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-p

In [2]:
import pandas as pd
import os

df = pd.read_csv("./drive/MyDrive/Colab_Notebooks/data/livedoor_text.csv")
# カテゴリーのID列を付与しておく
categories = df['category'].unique().tolist()
df.sample(3)


Unnamed: 0,text,category
6844,人気ホラー小説「リング」の最新作について構想が明らかになった。作家の鈴木光司氏が次回作につい...,2
2023,ロボット格闘技の世界を舞台に、人生に挫折した男の再起のドラマと、父と息子とのかけがえのない...,4
3720,野球解説者・野村克也氏と、中日ドラゴンズ・落合博満監督による注目の対談が、19日深夜にTBS...,7


In [3]:
from sklearn.model_selection import train_test_split

train_df, eval_df = train_test_split(df, train_size=0.7)
eval_df, test_df = train_test_split(eval_df, train_size=0.5)
print('train size', train_df.shape)
print('eval size', eval_df.shape)
print('test size', test_df.shape)
# train size (5163, 4)
# eval size (1106, 4)
# test size (1107, 4)


train size (5156, 2)
eval size (1105, 2)
test size (1106, 2)


In [8]:
# from torch.utils.data import Dataset
# from tqdm import tqdm

# class LivedoorDataset(Dataset):
#     def __init__(self, df):
#         self.features = [
#             {
#                 'title': row.text,
#                 'category_id': row.category
#             } for row in tqdm(df.itertuples(), total=df.shape[0])
#         ]

#     def __len__(self):
#         return len(self.features)

#     def __getitem__(self, idx):
#         return self.features[idx]

# train_dataset = LivedoorDataset(train_df)
# eval_dataset = LivedoorDataset(eval_df)
# test_dataset = LivedoorDataset(test_df)


In [15]:
from torch.utils.data import IterableDataset
from tqdm import tqdm

class LivedoorDataset(IterableDataset):
    def __init__(self, df):
        self.features = [
            {
                'title': row.text,
                'category_id': row.category
            } for row in tqdm(df.itertuples(), total=df.shape[0])
        ]

    def __len__(self):
        return len(self.features)

    def __iter__(self):
        return iter(self.features)

train_dataset = LivedoorDataset(train_df)
eval_dataset = LivedoorDataset(eval_df)
test_dataset = LivedoorDataset(test_df)


100%|██████████| 5156/5156 [00:00<00:00, 118417.25it/s]
100%|██████████| 1105/1105 [00:00<00:00, 370835.81it/s]
100%|██████████| 1106/1106 [00:00<00:00, 352794.91it/s]


In [16]:
import torch
from transformers import AutoTokenizer

class LivedoorCollator():
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        examples = {
            'title': list(map(lambda x: x['title'], examples)),
            'category_id': list(map(lambda x: x['category_id'], examples))
        }

        encodings = self.tokenizer(examples['title'],
                                   padding=True,
                                   truncation=True,
                                   max_length=self.max_length,
                                   return_tensors='pt')
        encodings['category_id'] = torch.tensor(examples['category_id'])
        return encodings

tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
livedoor_collator = LivedoorCollator(tokenizer)


In [17]:
import torch.nn as nn
from transformers import AutoModel
from transformers.modeling_outputs import ModelOutput

class LivedoorNet(nn.Module):
    def __init__(self, pretrained_model, num_categories, loss_function=None):
        super().__init__()
        self.bert = pretrained_model
        self.hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(self.hidden_size, num_categories)
        self.loss_function = loss_function

    def forward(self,
                input_ids,
                attention_mask=None,
                position_ids=None,
                token_type_ids=None,
                output_attentions=False,
                output_hidden_states=False,
                category_id=None):

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            position_ids=position_ids,
                            token_type_ids=token_type_ids,
                            output_attentions=output_attentions,
                            output_hidden_states=output_hidden_states)

        state = outputs.last_hidden_state[:, 0, :]
        state = self.linear(state)

        loss=None
        if category_id is not None and self.loss_function is not None:
            loss = self.loss_function(state, category_id)

        attentions=None
        if output_attentions:
            attentions=outputs.attentions

        hidden_states=None
        if output_hidden_states:
            hidden_states=outputs.hidden_states

        return ModelOutput(
            logits=state,
            loss=loss,
            last_hidden_state=outputs.last_hidden_state,
            attentions=attentions,
            hidden_states=hidden_states
        )

loss_fct = nn.CrossEntropyLoss()
pretrained_model = AutoModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
net = LivedoorNet(pretrained_model, len(categories), loss_fct)


In [18]:
from transformers import EvalPrediction
from typing import Dict
from sklearn.metrics import precision_score, recall_score, f1_score

def custom_compute_metrics(res: EvalPrediction) -> Dict:
    # res.predictions, res.label_idsはnumpyのarray
    pred = res.predictions.argmax(axis=1)
    target = res.label_ids
    precision = precision_score(target, pred, average='macro')
    recall = recall_score(target, pred, average='macro')
    f1 = f1_score(target, pred, average='macro')
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./output/model',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    label_names=['category_id'],
    lr_scheduler_type='constant',
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    remove_unused_columns=False,
    report_to='none'
)


In [None]:
from transformers import Trainer
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=net,
    tokenizer=tokenizer,
    data_collator=livedoor_collator,
    compute_metrics=custom_compute_metrics,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train(ignore_keys_for_eval=['last_hidden_state', 'hidden_states', 'attentions'])




Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4539,0.22221,0.935225,0.928014,0.929894


In [10]:
trainer.save_state()
trainer.save_model()

In [16]:
pred_result = trainer.predict(test_dataset, ignore_keys=['loss', 'last_hidden_state', 'hidden_states', 'attentions'])
test_df['predict'] = pred_result.predictions.argmax(axis=1).tolist()

from sklearn.metrics import classification_report
print(classification_report(test_df['category'], test_df['predict']))

              precision    recall  f1-score   support

           0       0.99      0.90      0.94       125
           1       0.86      0.97      0.91       136
           2       0.98      0.87      0.92       132
           3       0.98      0.77      0.87        84
           4       0.96      0.97      0.97       131
           5       0.85      0.98      0.91       119
           6       0.96      0.98      0.97       118
           7       0.96      1.00      0.98       135
           8       0.98      0.97      0.97       126

    accuracy                           0.94      1106
   macro avg       0.95      0.94      0.94      1106
weighted avg       0.95      0.94      0.94      1106

