<a href="https://colab.research.google.com/github/nukano0522/pytorch/blob/master/livedoor_news_cls/bert_livedoor_with_peft_qlora_cls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

参考：https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing#scrollTo=Ybeyl20n3dYH

In [1]:
! pip install -q transformers trl bitsandbytes peft accelerate bitsandbytes fugashi ipadic unidic_lite

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for unidic_lite (setup.py) ... [?25l[?25hdone


In [1]:
# MODEL_ID = "cl-tohoku/bert-base-japanese-whole-word-masking"
MODEL_ID = "cl-tohoku/bert-base-japanese-v2"

In [2]:
import pandas as pd
import os

df = pd.read_csv("./drive/MyDrive/Colab_Notebooks/data/livedoor_text.csv")
# カテゴリーのID列を付与しておく
categories = df['category'].unique().tolist()
df.sample(3)


Unnamed: 0,text,category
5503,20日のACL（アジア・チャンピオンズリーグ）1次リーグにて、アデレードに0-2で完敗した...,7
6102,昨年5月「第64回カンヌ国際映画祭」でパルムドール（最高賞）に輝いた映画『ツリー・オブ・ラ...,4
3924,ソニーのウォークマンのテレビCMに人気歌手の西野カナさんが出演している。冬の空気にぴったりな...,2


In [3]:
from sklearn.model_selection import train_test_split

train_df, eval_df = train_test_split(df, train_size=0.7)
eval_df, test_df = train_test_split(eval_df, train_size=0.5)
print('train size', train_df.shape)
print('eval size', eval_df.shape)
print('test size', test_df.shape)


train size (5156, 2)
eval size (1105, 2)
test size (1106, 2)


In [4]:
from torch.utils.data import IterableDataset
from tqdm import tqdm

class LivedoorDataset(IterableDataset):
    def __init__(self, df):
        self.features = [
            {
                'title': row.text,
                'labels': row.category
            } for row in tqdm(df.itertuples(), total=df.shape[0])
        ]

    def __len__(self):
        return len(self.features)

    def __iter__(self):
        return iter(self.features)

train_dataset = LivedoorDataset(train_df)
eval_dataset = LivedoorDataset(eval_df)
test_dataset = LivedoorDataset(test_df)


100%|██████████| 5156/5156 [00:00<00:00, 607603.71it/s]
100%|██████████| 1105/1105 [00:00<00:00, 764172.45it/s]
100%|██████████| 1106/1106 [00:00<00:00, 688877.37it/s]


In [5]:
import torch
from transformers import AutoTokenizer

class LivedoorCollator():
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        examples = {
            'title': list(map(lambda x: x['title'], examples)),
            'labels': list(map(lambda x: x['labels'], examples))
        }

        encodings = self.tokenizer(examples['title'],
                                   padding=True,
                                   truncation=True,
                                   max_length=self.max_length,
                                   return_tensors='pt')
        encodings['labels'] = torch.tensor(examples['labels'])
        return encodings

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
livedoor_collator = LivedoorCollator(tokenizer)


In [6]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

pretrained_model = AutoModel.from_pretrained(MODEL_ID, quantization_config=bnb_config, device_map={"":0})

In [7]:
# パラメタ数
sum(p.numel() for p in pretrained_model.parameters() if p.requires_grad)

26189568

In [8]:
from peft import prepare_model_for_kbit_training

pretrained_model.gradient_checkpointing_enable()
pretrained_model = prepare_model_for_kbit_training(pretrained_model)

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
from peft import LoraConfig, get_peft_model, TaskType

config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1
)

model = get_peft_model(pretrained_model, config)
print_trainable_parameters(model)

trainable params: 589824 || all params: 69329664 || trainable%: 0.8507527167591639


In [11]:
import torch.nn as nn
from transformers.modeling_outputs import ModelOutput

class LivedoorNet(nn.Module):
    def __init__(self, pretrained_model, num_categories, loss_function=None):
        super().__init__()
        self.bert = pretrained_model
        self.hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(self.hidden_size, num_categories)
        self.loss_function = loss_function

    def forward(self,
                input_ids,
                attention_mask=None,
                position_ids=None,
                token_type_ids=None,
                output_attentions=False,
                output_hidden_states=False,
                labels=None
                ):

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            position_ids=position_ids,
                            token_type_ids=token_type_ids,
                            output_attentions=output_attentions,
                            output_hidden_states=output_hidden_states)

        state = outputs.last_hidden_state[:, 0, :]
        state = self.linear(state)

        loss=None
        if labels is not None and self.loss_function is not None:
            loss = self.loss_function(state, labels)

        attentions=None
        if output_attentions:
            attentions=outputs.attentions

        hidden_states=None
        if output_hidden_states:
            hidden_states=outputs.hidden_states

        return ModelOutput(
            logits=state,
            loss=loss,
            last_hidden_state=outputs.last_hidden_state,
            attentions=attentions,
            hidden_states=hidden_states
        )

loss_fct = nn.CrossEntropyLoss()
net = LivedoorNet(model, len(categories), loss_fct)

# if peft_use:
#   from peft import get_peft_model
#   from peft import LoraConfig, TaskType
#   peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.1)
#   peft_pretrained_model = get_peft_model(pretrained_model, peft_config)
#   peft_pretrained_model.print_trainable_parameters()

#   net = LivedoorNet(peft_pretrained_model, len(categories), loss_fct)
#   # peft_net = get_peft_model(net, peft_config)
#   print("peft use")

In [12]:
net

LivedoorNet(
  (bert): PeftModelForSequenceClassification(
    (base_model): LoraModel(
      (model): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(32768, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): Linear4bit(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=16, b

In [13]:
from torch.utils.data import DataLoader
loader = DataLoader(train_dataset, collate_fn=livedoor_collator, batch_size=4)
batch = next(iter(loader))
batch

{'input_ids': tensor([[    2, 12114,  4156,  ...,     0,     0,     0],
        [    2, 12362, 11141,  ...,   828,  1343,     3],
        [    2,   838, 22160,  ...,   828, 16889,     3],
        [    2,   854, 27280,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([8, 4, 0, 2])}

In [15]:
# check
## peft_model.pyの l732の「labels=labels」の行をコメントアウトしたら動いた（23/10/09時点）
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
batch.to(device)
res = net(**batch)
# res


In [16]:
from transformers import EvalPrediction
from typing import Dict
from sklearn.metrics import precision_score, recall_score, f1_score

def custom_compute_metrics(res: EvalPrediction) -> Dict:
    # res.predictions, res.label_idsはnumpyのarray
    pred = res.predictions.argmax(axis=1)
    target = res.label_ids
    precision = precision_score(target, pred, average='macro')
    recall = recall_score(target, pred, average='macro')
    f1 = f1_score(target, pred, average='macro')
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./output/model',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    label_names=['labels'],
    lr_scheduler_type='constant',
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    remove_unused_columns=False,
    report_to='none'
)


In [19]:
from transformers import Trainer
from trl import SFTTrainer
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=net,
    # model=pretrained_model,
    tokenizer=tokenizer,
    data_collator=livedoor_collator,
    compute_metrics=custom_compute_metrics,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train(ignore_keys_for_eval=['last_hidden_state', 'hidden_states', 'attentions'])


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.3925,0.687823,0.789424,0.768568,0.757081
2,0.5651,0.468955,0.844927,0.830893,0.82948
3,0.4155,0.394244,0.878628,0.869847,0.870374
4,0.3359,0.352277,0.890801,0.885131,0.885204
5,0.2824,0.325661,0.894373,0.891995,0.891575


TrainOutput(global_step=1615, training_loss=0.5982979789237858, metrics={'train_runtime': 844.6253, 'train_samples_per_second': 30.522, 'train_steps_per_second': 1.912, 'total_flos': 0.0, 'train_loss': 0.5982979789237858, 'epoch': 5.0})

In [20]:
pred_result = trainer.predict(test_dataset, ignore_keys=['loss', 'last_hidden_state', 'hidden_states', 'attentions'])
test_df['predict'] = pred_result.predictions.argmax(axis=1).tolist()

from sklearn.metrics import classification_report
print(classification_report(test_df['category'], test_df['predict']))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90       127
           1       0.89      0.88      0.89       129
           2       0.94      0.81      0.87       133
           3       0.78      0.73      0.75        81
           4       0.88      0.98      0.93       125
           5       0.87      0.92      0.90       140
           6       0.94      0.97      0.95       122
           7       0.98      0.98      0.98       146
           8       0.95      0.96      0.96       103

    accuracy                           0.91      1106
   macro avg       0.90      0.90      0.90      1106
weighted avg       0.91      0.91      0.91      1106

