<a href="https://colab.research.google.com/github/nukano0522/pytorch/blob/master/livedoor_news_cls/bert_livedoor_with_peft_cls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q transformers trl peft accelerate bitsandbytes fugashi ipadic

In [2]:
import pandas as pd
import os

# Load the livedoor news corpus from a CSV file.
# NOTE(review): the relative path presumably requires Google Drive to be mounted
# under ./drive; no mount step is visible in this notebook — confirm.
df = pd.read_csv("./drive/MyDrive/Colab_Notebooks/data/livedoor_text.csv")
# Collect the distinct category labels; the length of this list is used further
# down to size the classifier head. (No ID column is added to `df` here.)
categories = df['category'].unique().tolist()
# Preview three random rows.
df.sample(3)


Unnamed: 0,text,category
2671,全7色から選べるPANTONEケータイ \n\nフトバンクモバイルおよびウィルコムは29日、...,6
6381,10月16日に放送された日本テレビ「行列のできる法律相談所SP」で、ゆうこりんがぶっちゃけた...,8
4743,マーク・ザッカーバーグがFacebookに訴えられる……という驚くべきニュースが入ってきた。...,2


In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the split (and every metric computed from it) is identical
# on each Restart & Run All.
SPLIT_SEED = 42

# 70% of the rows go to training; the remaining 30% is halved into eval and test.
train_df, holdout_df = train_test_split(df, train_size=0.7, random_state=SPLIT_SEED)
eval_df, test_df = train_test_split(holdout_df, train_size=0.5, random_state=SPLIT_SEED)
print('train size', train_df.shape)
print('eval size', eval_df.shape)
print('test size', test_df.shape)


train size (5156, 2)
eval size (1105, 2)
test size (1106, 2)


In [4]:
from torch.utils.data import IterableDataset
from tqdm import tqdm

from torch.utils.data import Dataset


class LivedoorDataset(Dataset):
    """In-memory, map-style dataset built from a DataFrame of news articles.

    Every row of ``df`` becomes one feature dict of the form
    ``{'title': <row.text>, 'labels': <row.category>}``.

    All features are materialised up front and the length is known, so this is
    a map-style dataset (``__len__`` + ``__getitem__``). Declaring it as an
    ``IterableDataset`` prevented ``DataLoader`` from shuffling it
    (``shuffle=True`` is rejected for iterable-style datasets). Plain iteration
    is still supported through ``__iter__``.

    Args:
        df: DataFrame exposing ``text`` and ``category`` columns.
    """

    def __init__(self, df):
        self.features = [
            {
                'title': row.text,
                'labels': row.category
            } for row in tqdm(df.itertuples(), total=df.shape[0])
        ]

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the feature dict stored at ``index``."""
        return self.features[index]

    def __iter__(self):
        """Iterate over the feature dicts in their original order."""
        return iter(self.features)

# Build one LivedoorDataset per split, in train / eval / test order.
train_dataset, eval_dataset, test_dataset = [
    LivedoorDataset(split_frame) for split_frame in (train_df, eval_df, test_df)
]


100%|██████████| 5156/5156 [00:00<00:00, 629609.63it/s]
100%|██████████| 1105/1105 [00:00<00:00, 676105.90it/s]
100%|██████████| 1106/1106 [00:00<00:00, 751847.69it/s]


In [5]:
import torch
from transformers import AutoTokenizer

class LivedoorCollator():
    """Collate function that turns a list of feature dicts into one model batch.

    Each incoming example must provide a ``'title'`` (text) entry and a
    ``'labels'`` entry. The texts are handed to the tokenizer with
    ``padding=True``, ``truncation=True``, ``max_length=self.max_length`` and
    ``return_tensors='pt'``; the labels are added to the tokenizer's result as
    a tensor under the ``'labels'`` key.

    Args:
        tokenizer: Callable tokenizer accepting the keyword arguments above.
        max_length: Value forwarded as the tokenizer's ``max_length``.
    """

    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        """Tokenize the batch's texts and attach the label tensor."""
        texts = [example['title'] for example in examples]
        label_ids = [example['labels'] for example in examples]

        batch = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        batch['labels'] = torch.tensor(label_ids)
        return batch

# Tokenizer matching the pretrained Japanese BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
# Collator used to build batches; max_length is left at its default (512).
livedoor_collator = LivedoorCollator(tokenizer)


In [6]:
from transformers import AutoModel
# Load the checkpoint through AutoModel. LivedoorNet (defined below) reads
# `last_hidden_state` from its output and adds its own linear classification head.
pretrained_model = AutoModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

In [7]:
from peft import LoraConfig, TaskType
# `pretrained_model` is a bare encoder (loaded with AutoModel) whose forward()
# has no `labels` parameter, so use the feature-extraction task type. The
# SEQ_CLS wrapper forwards `labels` to the base model and fails with
# "forward() got an unexpected keyword argument 'labels'".
peft_config = LoraConfig(task_type=TaskType.FEATURE_EXTRACTION, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

In [8]:
# Toggle: when True, the encoder is wrapped with LoRA adapters (via PEFT)
# before it is handed to LivedoorNet.
peft_use = True

In [16]:
import torch.nn as nn
from transformers.modeling_outputs import ModelOutput

class LivedoorNet(nn.Module):
    """Encoder plus a single linear layer for news-category classification.

    The hidden state at sequence position 0 of the encoder's
    ``last_hidden_state`` (presumably the [CLS] token) is projected to
    ``num_categories`` logits.

    Args:
        pretrained_model: Encoder exposing ``config.hidden_size`` and returning
            an object with ``last_hidden_state`` (and, on request,
            ``attentions`` / ``hidden_states``).
        num_categories: Number of output classes.
        loss_function: Optional callable ``(logits, labels) -> loss``. When it
            is ``None`` no loss is computed.
    """

    def __init__(self, pretrained_model, num_categories, loss_function=None):
        super().__init__()
        self.bert = pretrained_model
        self.hidden_size = pretrained_model.config.hidden_size
        self.linear = nn.Linear(self.hidden_size, num_categories)
        self.loss_function = loss_function

    def forward(self,
                input_ids,
                attention_mask=None,
                position_ids=None,
                token_type_ids=None,
                output_attentions=False,
                output_hidden_states=False,
                labels=None
                ):
        """Run the encoder and the classification head.

        Returns:
            A ``ModelOutput`` with the keys ``logits``, ``loss``,
            ``last_hidden_state``, ``attentions`` and ``hidden_states`` (in that
            order). ``loss`` is ``None`` unless both ``labels`` and a loss
            function are available; ``attentions`` / ``hidden_states`` are
            ``None`` unless the corresponding ``output_*`` flag is truthy.
        """
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        # Classify from the first position of the final hidden layer.
        logits = self.linear(encoded.last_hidden_state[:, 0, :])

        can_score = labels is not None and self.loss_function is not None
        loss = self.loss_function(logits, labels) if can_score else None

        return ModelOutput(
            logits=logits,
            loss=loss,
            last_hidden_state=encoded.last_hidden_state,
            attentions=encoded.attentions if output_attentions else None,
            hidden_states=encoded.hidden_states if output_hidden_states else None,
        )

loss_fct = nn.CrossEntropyLoss()

if peft_use:
    from peft import LoraConfig, TaskType, get_peft_model

    # `pretrained_model` is a bare encoder whose forward() takes no `labels`.
    # The SEQ_CLS task type wraps it in a class that forwards `labels` to the
    # base model, which raises
    #   TypeError: forward() got an unexpected keyword argument 'labels'
    # FEATURE_EXTRACTION is the task type for headless encoders; the
    # classification head and the loss live in LivedoorNet instead.
    peft_config = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
    )
    peft_pretrained_model = get_peft_model(pretrained_model, peft_config)
    net = LivedoorNet(peft_pretrained_model, len(categories), loss_fct)
    print("peft use")
else:
    # Plain fine-tuning: hand the encoder to the classifier as-is.
    net = LivedoorNet(pretrained_model, len(categories), loss_fct)

peft use


In [18]:
# net

In [19]:
from torch.utils.data import DataLoader
# Smoke-test the input pipeline: pull one batch of 4 examples through the
# collator and display it.
loader = DataLoader(train_dataset, collate_fn=livedoor_collator, batch_size=4)
batch = next(iter(loader))
batch

{'input_ids': tensor([[    2,  2221,  4158,  ...,   965,    32,     3],
        [    2,   132,  3273,  ...,  3869,   630,     3],
        [    2, 16755,  4598,  ..., 28986,   506,     3],
        [    2, 17287,     5,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 2, 2])}

In [20]:
# check
# Sanity check: run a single forward pass of `net` on the selected device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
# NOTE(review): the return value is discarded, so this presumably relies on the
# batch object moving its tensors in place — confirm.
batch.to(device)
res = net(**batch)
res

# The following error occurs:
# TypeError: forward() got an unexpected keyword argument 'labels'
# Inside peft_model.py, "labels" is passed as an argument in `return base_model(...)` (a BertModel here),
# but BertModel's forward does not take labels as a parameter, which appears to be what raises the error.
# For now it could be avoided by commenting out the labels argument at that `return base_model` call.
# NOTE(review): this workaround edits the installed peft package, so it will not
# survive a fresh install; presumably a PEFT task type intended for headless
# encoders avoids the problem without patching the library — confirm.


ModelOutput([('logits',
              tensor([[-0.0232,  0.5219, -0.3863, -0.2844,  0.2402, -0.1508, -0.1727,  0.4182,
                        0.4158],
                      [ 0.1180,  0.4082, -0.5544,  0.0378,  0.3910, -0.1059, -0.3833,  0.4587,
                        0.3099],
                      [ 0.2268,  0.3771, -0.5862,  0.0441,  0.0641, -0.0443, -0.3512,  0.2940,
                        0.4454],
                      [ 0.2311,  0.2753, -0.6715, -0.0642,  0.3520,  0.1151, -0.3851,  0.4026,
                        0.3344]], device='cuda:0', grad_fn=<AddmmBackward0>)),
             ('loss',
              tensor(2.4678, device='cuda:0', grad_fn=<NllLossBackward0>)),
             ('last_hidden_state',
              tensor([[[-0.1982,  0.1041, -0.1912,  ..., -0.5656,  0.0543,  0.0929],
                       [ 0.4826, -0.6330,  1.2177,  ..., -0.7639, -0.5783,  0.1741],
                       [ 0.4272, -0.8818,  1.3709,  ...,  0.3682, -0.5861, -0.4678],
                       ...,
  

In [21]:
from transformers import EvalPrediction
from typing import Dict
from sklearn.metrics import precision_score, recall_score, f1_score

def custom_compute_metrics(res: EvalPrediction) -> Dict:
    """Compute macro-averaged precision, recall and F1 for an evaluation run.

    Args:
        res: Evaluation result; ``res.predictions`` and ``res.label_ids`` are
            numpy arrays. The predicted class is the argmax over axis 1 of
            ``res.predictions``.

    Returns:
        Dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    predicted_ids = res.predictions.argmax(axis=1)
    gold_ids = res.label_ids
    scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {
        metric_name: scorer(gold_ids, predicted_ids, average='macro')
        for metric_name, scorer in scorers.items()
    }


In [22]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir='./output/model',
    save_strategy='epoch',
    save_total_limit=1,
    # --- evaluation and best-model selection ---
    evaluation_strategy='epoch',
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    label_names=['labels'],
    # --- optimisation ---
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    lr_scheduler_type='constant',
    # --- logging / misc ---
    logging_strategy='epoch',
    remove_unused_columns=False,
    report_to='none',
)


In [None]:
from transformers import Trainer
from trl import SFTTrainer
from transformers import EarlyStoppingCallback

# custom_compute_metrics calls .argmax on res.predictions, so the extra fields
# of the model output are passed as keys to ignore during evaluation.
non_prediction_keys = ['last_hidden_state', 'hidden_states', 'attentions']
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=net,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=livedoor_collator,
    compute_metrics=custom_compute_metrics,
    callbacks=[early_stopping],
)

trainer.train(ignore_keys_for_eval=non_prediction_keys)




Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.499,0.752172,0.769155,0.758244,0.741961
2,0.6729,0.493074,0.840514,0.837669,0.835321


In [6]:
from peft import LoraConfig, TaskType

# LoRA configuration for the seq2seq example below (r=8, lora_alpha=32, lora_dropout=0.1).
# NOTE(review): this rebinds the `peft_config` name used earlier for the BERT classifier.
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

In [4]:
from transformers import AutoModelForSeq2SeqLM

# Checkpoint used for the seq2seq PEFT example.
model_name_or_path = "bigscience/mt0-large"
# NOTE(review): not referenced anywhere in the visible notebook — presumably
# intended for a later tokenizer load; confirm.
tokenizer_name_or_path = "bigscience/mt0-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/800 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

In [7]:
from peft import get_peft_model

# Wrap the base model with the adapters described by `peft_config`, then report
# how many parameters remain trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Example output: trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282

trainable params: 2,359,296 || all params: 1,231,940,608 || trainable%: 0.19151053100118282


'output: trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282'

In [9]:
# Save the PEFT-wrapped model to the local directory "output_dir".
model.save_pretrained("output_dir")

# if pushing to Hub
# (interactive login, needed before uploading)
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

RepositoryNotFoundError: ignored

In [10]:
# Upload the model to the Hugging Face Hub under the repository name "my_awesome_peft_model".
model.push_to_hub("my_awesome_peft_model")

adapter_model.bin:   0%|          | 0.00/9.54M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mayokori/my_awesome_peft_model/commit/8469d3be803bac803d7057a5a30ab810a640ef9c', commit_message='Upload model', commit_description='', oid='8469d3be803bac803d7057a5a30ab810a640ef9c', pr_url=None, pr_revision=None, pr_num=None)