<a href="https://colab.research.google.com/github/nukano0522/pytorch/blob/master/livedoor_news_cls/bert_livedoor_with_peft_cls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q transformers trl peft accelerate bitsandbytes fugashi ipadic

In [2]:
# Toggle: when True, a later `if use_peft:` cell wraps the pretrained encoder with LoRA adapters via PEFT.
use_peft = True

In [3]:
import pandas as pd
import os

# Load the article table from Google Drive (the recorded output shows `text` and `category` columns).
df = pd.read_csv("./drive/MyDrive/Colab_Notebooks/data/livedoor_text.csv")
# Distinct category values; their count is used later as the number of output classes.
categories = pd.unique(df['category']).tolist()
df.sample(3)


Unnamed: 0,text,category
1766,10日、神宮球場で行われた、プロ野球＝ヤクルトスワローズ×DeNAベイスターズの一戦では、D...,7
3356,ドラマ「家政婦のミタ」（日本テレビ系）がヒットしている。このヒットで「やはり松嶋菜々子は視聴...,2
7002,国外のゲーム専用機市場は、時期や人気機種の動向などによっても異なるが、おおよそ日本の5倍前後...,1


In [4]:
from sklearn.model_selection import train_test_split

# 70 % train / 15 % eval / 15 % test.
# A fixed seed makes the split (and every metric derived from it) reproducible across
# kernel restarts; stratifying keeps each category's share the same in all three parts.
train_df, holdout_df = train_test_split(
    df, train_size=0.7, random_state=42, stratify=df['category'])
eval_df, test_df = train_test_split(
    holdout_df, train_size=0.5, random_state=42, stratify=holdout_df['category'])
print('train size', train_df.shape)
print('eval size', eval_df.shape)
print('test size', test_df.shape)
# Recorded run on the current CSV:
# train size (5156, 2)
# eval size (1105, 2)
# test size (1106, 2)


train size (5156, 2)
eval size (1105, 2)
test size (1106, 2)


In [5]:
from torch.utils.data import Dataset, IterableDataset
from tqdm import tqdm

class LivedoorDataset(Dataset):
    """In-memory, map-style dataset of news examples.

    All rows are materialised into a list up front, so the dataset supports
    random access. Being map-style (rather than an ``IterableDataset``) lets
    ``DataLoader``/``Trainer`` attach a sampler, i.e. shuffle every epoch and
    split work across workers without duplicating examples.

    Args:
        df: DataFrame with a ``text`` column and a ``category`` column.

    Each item is a dict ``{'title': <text>, 'labels': <category>}``.
    """

    def __init__(self, df):
        self.features = [
            {
                'title': row.text,
                'labels': row.category
            } for row in tqdm(df.itertuples(), total=df.shape[0])
        ]

    def __len__(self):
        """Number of examples held in memory."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the example dict stored at ``index``."""
        return self.features[index]

    def __iter__(self):
        """Iterate over examples in stored order (kept for direct iteration by callers)."""
        return iter(self.features)

# One dataset object per split; each constructor call shows its own progress bar.
train_dataset = LivedoorDataset(df=train_df)
eval_dataset = LivedoorDataset(df=eval_df)
test_dataset = LivedoorDataset(df=test_df)


100%|██████████| 5156/5156 [00:00<00:00, 495732.43it/s]
100%|██████████| 1105/1105 [00:00<00:00, 366119.43it/s]
100%|██████████| 1106/1106 [00:00<00:00, 752335.42it/s]


In [6]:
import torch
from transformers import AutoTokenizer

class LivedoorCollator():
    """Collate raw examples into one tokenized batch with a label tensor.

    Args:
        tokenizer: Callable invoked with the list of texts plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keywords.
        max_length: Maximum sequence length forwarded to the tokenizer.
    """

    def __init__(self, tokenizer, max_length=512):
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __call__(self, examples):
        """Convert a list of ``{'title', 'labels'}`` dicts into the tokenizer output plus ``labels``."""
        texts = [example['title'] for example in examples]
        targets = [example['labels'] for example in examples]

        batch = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Attach the targets under the key the model's forward() expects.
        batch['labels'] = torch.tensor(targets)
        return batch

# Tokenizer for the Japanese BERT checkpoint, handed to the batch collator defined above.
tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
livedoor_collator = LivedoorCollator(tokenizer=tokenizer)


In [7]:
from transformers import AutoModel
# Load the encoder via AutoModel (no task-specific head) from the same checkpoint name as the tokenizer.
pretrained_model = AutoModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

In [8]:
if use_peft:
  from peft import LoraConfig, TaskType, get_peft_model
  # The base model is a bare encoder loaded with AutoModel; the classification head
  # lives in LivedoorNet. With TaskType.SEQ_CLS the PEFT wrapper forwards `labels`
  # to the base model, whose forward() has no such parameter, which raises
  # "TypeError: forward() got an unexpected keyword argument 'labels'".
  # FEATURE_EXTRACTION wraps the encoder without passing `labels` through.
  peft_config = LoraConfig(
      task_type=TaskType.FEATURE_EXTRACTION,
      inference_mode=False,  # adapters are created in training mode
      r=8,
      lora_alpha=32,
      lora_dropout=0.1)

  # Rebinds `pretrained_model` to the LoRA-wrapped model.
  # NOTE: re-running this cell without reloading the base model wraps it again.
  pretrained_model = get_peft_model(pretrained_model, peft_config)
  pretrained_model.print_trainable_parameters()

In [9]:
import torch.nn as nn
from transformers.modeling_outputs import ModelOutput

class LivedoorNet(nn.Module):
    """Encoder followed by one linear classification layer on the first token.

    Args:
        pretrained_model: Encoder module exposing ``config.hidden_size`` and
            returning an object with ``last_hidden_state`` (plus ``attentions``
            / ``hidden_states`` when requested).
        num_categories: Number of output classes.
        loss_function: Optional callable ``(logits, labels) -> loss``. When it
            is ``None`` no loss is computed.
    """

    def __init__(self, pretrained_model, num_categories, loss_function=None):
        super().__init__()
        self.bert = pretrained_model
        self.hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(self.hidden_size, num_categories)
        self.loss_function = loss_function

    def forward(self,
                input_ids,
                attention_mask=None,
                position_ids=None,
                token_type_ids=None,
                output_attentions=False,
                output_hidden_states=False,
                labels=None
                ):
        """Run the encoder, classify the first-token vector and optionally compute the loss.

        Returns:
            ModelOutput with keys ``logits``, ``loss``, ``last_hidden_state``,
            ``attentions`` and ``hidden_states``. ``loss`` is ``None`` unless both
            ``labels`` and a loss function are available; the last two are ``None``
            unless the corresponding ``output_*`` flag is set.
        """
        encoder_kwargs = dict(
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        encoded = self.bert(input_ids, **encoder_kwargs)

        sequence_output = encoded.last_hidden_state
        # Position 0 along the sequence axis feeds the classifier.
        logits = self.linear(sequence_output[:, 0, :])

        if labels is None or self.loss_function is None:
            loss = None
        else:
            loss = self.loss_function(logits, labels)

        return ModelOutput(
            logits=logits,
            loss=loss,
            last_hidden_state=sequence_output,
            attentions=encoded.attentions if output_attentions else None,
            hidden_states=encoded.hidden_states if output_hidden_states else None
        )

# Cross-entropy over one output unit per distinct category value.
loss_fct = nn.CrossEntropyLoss()
net = LivedoorNet(
    pretrained_model=pretrained_model,
    num_categories=len(categories),
    loss_function=loss_fct,
)


In [10]:
net

LivedoorNet(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [11]:
from torch.utils.data import DataLoader
# Sanity check: pull one collated mini-batch of four examples from the training set.
loader = DataLoader(dataset=train_dataset, batch_size=4, collate_fn=livedoor_collator)
batch = next(iter(loader))
batch

{'input_ids': tensor([[    2, 11880,     9,  ..., 28482, 12799,     3],
        [    2,  8653,     9,  ...,  6177,  2638,     3],
        [    2,   182, 18023,  ..., 28599, 28472,     3],
        [    2,  2198,    28,  ...,   451,  6172,     3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([3, 1, 4, 0])}

In [12]:
# Smoke test: move the model and the sample batch to the available device, then run one forward pass.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
net.to(device)
batch.to(device)  # return value is discarded; presumably the batch is moved in place — TODO confirm
res = net(**batch)
res

# The following error was observed here:
# TypeError: forward() got an unexpected keyword argument 'labels'
# Inside peft_model.py, "labels" is passed as an argument to the base model (BertModel),
# but BertModel's forward() does not take a labels parameter, which appears to cause the error.
# As a stop-gap it was avoided by commenting out the labels argument of that `return base_model(...)` call.
# NOTE(review): patching installed library code is fragile; a PEFT task type that does not
# forward `labels` to the base model may make the patch unnecessary — verify.


ModelOutput([('logits',
              tensor([[ 0.3814, -0.0654,  0.0848,  0.1377,  0.5016,  0.4774, -0.1533, -0.5049,
                        0.0648],
                      [ 0.0545,  0.1687,  0.2816,  0.1260,  0.5361,  0.2741, -0.0519, -0.2991,
                       -0.0389],
                      [ 0.2149,  0.0415,  0.3023,  0.3556,  0.5184,  0.1725,  0.0642, -0.6150,
                       -0.0563],
                      [ 0.3263,  0.2143,  0.2687,  0.1166,  0.2404,  0.2502, -0.1370, -0.6291,
                        0.0715]], device='cuda:0', grad_fn=<AddmmBackward0>)),
             ('loss',
              tensor(2.0486, device='cuda:0', grad_fn=<NllLossBackward0>)),
             ('last_hidden_state',
              tensor([[[-1.2678e-01,  3.3030e-02, -3.3959e-01,  ..., -6.3331e-02,
                        -9.9559e-02,  2.7650e-01],
                       [-2.3755e-02,  5.9945e-01, -8.2145e-01,  ..., -1.7633e-02,
                        -2.1293e-01,  2.6402e-01],
                   

In [13]:
from transformers import EvalPrediction
from typing import Dict
from sklearn.metrics import precision_score, recall_score, f1_score

def custom_compute_metrics(res: EvalPrediction) -> Dict:
    """Compute macro-averaged precision, recall and F1 for a classification run.

    Args:
        res: Evaluation result; ``res.predictions`` holds the logits as a
            numpy array (or a tuple whose first element is the logits) and
            ``res.label_ids`` holds the integer targets.

    Returns:
        Dict with the keys ``precision``, ``recall`` and ``f1``.
    """
    logits = res.predictions
    # If extra model outputs were not filtered out with ignore_keys, predictions
    # arrive as a tuple; the model emits the logits first.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred = logits.argmax(axis=1)
    target = res.label_ids
    # zero_division=0 yields the same score as the default for a class that is never
    # predicted or never present, but without emitting a warning per evaluation.
    precision = precision_score(target, pred, average='macro', zero_division=0)
    recall = recall_score(target, pred, average='macro', zero_division=0)
    f1 = f1_score(target, pred, average='macro', zero_division=0)
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [14]:
from transformers import TrainingArguments

# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy` —
# confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints go and how many are kept.
    output_dir='./output/model',
    save_strategy='epoch',
    save_total_limit=1,
    # Evaluate and log once per epoch.
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    report_to='none',
    # Model selection is driven by the `f1` value returned from compute_metrics.
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    # Optimisation schedule.
    num_train_epochs=10,
    lr_scheduler_type='constant',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Batch handling: keep every field the collator emits; targets are under `labels`.
    label_names=['labels'],
    remove_unused_columns=False,
)


In [15]:
from transformers import Trainer
from trl import SFTTrainer
from transformers import EarlyStoppingCallback

# Wire model, data and metrics together; early stopping uses a patience of 3 evaluations.
trainer = Trainer(
    model=net,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=livedoor_collator,
    tokenizer=tokenizer,
    compute_metrics=custom_compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Drop the non-logit model outputs during evaluation so compute_metrics only sees logits.
trainer.train(ignore_keys_for_eval=['last_hidden_state', 'hidden_states', 'attentions'])




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
from peft import LoraConfig, TaskType

# NOTE(review): this rebinds `peft_config` with a sequence-to-sequence task type, whereas the
# earlier `if use_peft:` cell configures LoRA for the BERT encoder. It looks like a leftover
# from a PEFT quickstart — confirm whether this cell is still needed.
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

In [None]:
from peft import get_peft_model

# `model` was never defined in this notebook, so the original
# `get_peft_model(model, peft_config)` call raised NameError on a fresh run.
# The encoder is already wrapped in the `if use_peft:` cell, so reuse it
# instead of wrapping a second time.
model = pretrained_model
if use_peft:
    # print_trainable_parameters() is only called on the PEFT-wrapped model.
    model.print_trainable_parameters()
# The sample figures previously quoted here ("trainable params: 2359296 || all params:
# 1231940608 || trainable%: 0.19151053100118282") do not match this 12-layer,
# 768-hidden encoder; they appear to come from a different model.

In [None]:
# Save the encoder prepared earlier in this notebook (`pretrained_model`); the name
# `model` used here originally is not defined anywhere in the notebook.
# NOTE(review): the linear head lives on `net`, not on `pretrained_model`, so it is not
# part of this save — persist `net.linear` separately if it is needed for inference.
pretrained_model.save_pretrained("output_dir")

# if pushing to Hub
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Push the encoder prepared earlier (`pretrained_model`); `model` is not defined in this notebook.
pretrained_model.push_to_hub("my_awesome_peft_model")