<a href="https://colab.research.google.com/github/nukano0522/pytorch/blob/master/livedoor_news_cls/bert_livedoor_with_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

参考：https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing#scrollTo=Ybeyl20n3dYH

In [1]:
! pip install -q transformers trl bitsandbytes peft accelerate bitsandbytes fugashi ipadic unidic_lite

In [2]:
# MODEL_ID = "cl-tohoku/bert-base-japanese-whole-word-masking"
MODEL_ID = "cl-tohoku/bert-base-japanese-v2"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import os

df = pd.read_csv("./drive/MyDrive/Colab_Notebooks/data/livedoor_text.csv")
# カテゴリーのID列を付与しておく
categories = df['category'].unique().tolist()
df.sample(3)


Unnamed: 0,text,category
4115,4日（月）、NHKの情報番組「あさイチ」の放送内容が不謹慎であると、ネット掲示板上で話題にな...,8
1584,9日、W杯アジア最終予選の組み合わせ抽選発表が行なわれ、グループBに入った日本はオーストラリ...,7
4848,Xperia GX SO-04D特集！ \n\nNTTドコモから今月9日（木）に発売開始され...,6


In [5]:
from sklearn.model_selection import train_test_split

train_df, eval_df = train_test_split(df, train_size=0.7)
eval_df, test_df = train_test_split(eval_df, train_size=0.5)
print('train size', train_df.shape)
print('eval size', eval_df.shape)
print('test size', test_df.shape)


train size (5156, 2)
eval size (1105, 2)
test size (1106, 2)


In [6]:
from torch.utils.data import IterableDataset
from tqdm import tqdm

class LivedoorDataset(IterableDataset):
    def __init__(self, df):
        self.features = [
            {
                'title': row.text,
                'labels': row.category
            } for row in tqdm(df.itertuples(), total=df.shape[0])
        ]

    def __len__(self):
        return len(self.features)

    def __iter__(self):
        return iter(self.features)

train_dataset = LivedoorDataset(train_df)
eval_dataset = LivedoorDataset(eval_df)
test_dataset = LivedoorDataset(test_df)


100%|██████████| 5156/5156 [00:00<00:00, 660109.01it/s]
100%|██████████| 1105/1105 [00:00<00:00, 729300.70it/s]
100%|██████████| 1106/1106 [00:00<00:00, 674945.47it/s]


In [7]:
import torch
from transformers import AutoTokenizer

class LivedoorCollator():
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        examples = {
            'title': list(map(lambda x: x['title'], examples)),
            'labels': list(map(lambda x: x['labels'], examples))
        }

        encodings = self.tokenizer(examples['title'],
                                   padding=True,
                                   truncation=True,
                                   max_length=self.max_length,
                                   return_tensors='pt')
        encodings['labels'] = torch.tensor(examples['labels'])
        return encodings

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
livedoor_collator = LivedoorCollator(tokenizer)


In [8]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

pretrained_model = AutoModel.from_pretrained(MODEL_ID)

In [9]:
# パラメタ数
sum(p.numel() for p in pretrained_model.parameters() if p.requires_grad)
# sum(p.numel() for p in pretrained_model.parameters())

111207168

In [21]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [22]:
# from peft import LoraConfig, get_peft_model, TaskType

# config = LoraConfig(
#     task_type=TaskType.SEQ_CLS,
#     inference_mode=False,
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.1
# )

# model = get_peft_model(pretrained_model, config)
print_trainable_parameters(pretrained_model)

trainable params: 111207168 || all params: 111207168 || trainable%: 100.0


In [13]:
import torch.nn as nn
from transformers.modeling_outputs import ModelOutput

class LivedoorNet(nn.Module):
    def __init__(self, pretrained_model, num_categories, loss_function=None):
        super().__init__()
        self.bert = pretrained_model
        self.hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(self.hidden_size, num_categories)
        self.loss_function = loss_function

    def forward(self,
                input_ids,
                attention_mask=None,
                position_ids=None,
                token_type_ids=None,
                output_attentions=False,
                output_hidden_states=False,
                labels=None
                ):

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            position_ids=position_ids,
                            token_type_ids=token_type_ids,
                            output_attentions=output_attentions,
                            output_hidden_states=output_hidden_states)

        state = outputs.last_hidden_state[:, 0, :]
        state = self.linear(state)

        loss=None
        if labels is not None and self.loss_function is not None:
            loss = self.loss_function(state, labels)

        attentions=None
        if output_attentions:
            attentions=outputs.attentions

        hidden_states=None
        if output_hidden_states:
            hidden_states=outputs.hidden_states

        return ModelOutput(
            logits=state,
            loss=loss,
            last_hidden_state=outputs.last_hidden_state,
            attentions=attentions,
            hidden_states=hidden_states
        )

loss_fct = nn.CrossEntropyLoss()
net = LivedoorNet(pretrained_model, len(categories), loss_fct)

In [14]:
net

LivedoorNet(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32768, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [15]:
from torch.utils.data import DataLoader
loader = DataLoader(train_dataset, collate_fn=livedoor_collator, batch_size=4)
batch = next(iter(loader))
batch

{'input_ids': tensor([[    2, 14966,    38,  ..., 12675,   893,     3],
        [    2, 24855, 30768,  ...,     0,     0,     0],
        [    2, 11807,   828,  ...,     0,     0,     0],
        [    2, 11436,  2719,  ..., 29434,   873,     3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([1, 6, 4, 7])}

In [16]:
# check
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
batch.to(device)
res = net(**batch)
# res


In [17]:
from transformers import EvalPrediction
from typing import Dict
from sklearn.metrics import precision_score, recall_score, f1_score

def custom_compute_metrics(res: EvalPrediction) -> Dict:
    # res.predictions, res.label_idsはnumpyのarray
    pred = res.predictions.argmax(axis=1)
    target = res.label_ids
    precision = precision_score(target, pred, average='macro')
    recall = recall_score(target, pred, average='macro')
    f1 = f1_score(target, pred, average='macro')
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./output/model',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    label_names=['labels'],
    lr_scheduler_type='constant',
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    remove_unused_columns=False,
    report_to='none'
)


In [19]:
from transformers import Trainer
from trl import SFTTrainer
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=net,
    # model=pretrained_model,
    tokenizer=tokenizer,
    data_collator=livedoor_collator,
    compute_metrics=custom_compute_metrics,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train(ignore_keys_for_eval=['last_hidden_state', 'hidden_states', 'attentions'])


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4969,0.240392,0.931535,0.922372,0.925606
2,0.1415,0.233901,0.932797,0.929482,0.930819
3,0.0643,0.334237,0.934147,0.927853,0.930226
4,0.0339,0.316993,0.934872,0.932826,0.933573
5,0.0169,0.345029,0.942964,0.937647,0.939433


TrainOutput(global_step=1615, training_loss=0.15069796706869876, metrics={'train_runtime': 761.6121, 'train_samples_per_second': 33.849, 'train_steps_per_second': 2.121, 'total_flos': 0.0, 'train_loss': 0.15069796706869876, 'epoch': 5.0})

In [20]:
pred_result = trainer.predict(test_dataset, ignore_keys=['loss', 'last_hidden_state', 'hidden_states', 'attentions'])
test_df['predict'] = pred_result.predictions.argmax(axis=1).tolist()

from sklearn.metrics import classification_report
print(classification_report(test_df['category'], test_df['predict']))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       130
           1       0.97      0.94      0.96       123
           2       0.88      0.96      0.92       123
           3       0.93      0.81      0.86        79
           4       0.95      0.96      0.96       124
           5       0.96      0.94      0.95       128
           6       0.99      0.99      0.99       134
           7       0.97      1.00      0.98       157
           8       0.95      0.93      0.94       108

    accuracy                           0.95      1106
   macro avg       0.95      0.94      0.94      1106
weighted avg       0.95      0.95      0.95      1106

