In [1]:
from pprint import pprint
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForMultipleChoice
import torch
from torch.utils.data import IterableDataset, Dataset, DataLoader
from tqdm import tqdm

In [2]:
# Load the JCommonsenseQA data from the llm-book/JGLUE repository on the
# Hugging Face Hub: the "train" split first, then the "validation" split.
train_dataset, valid_dataset = (
    load_dataset("llm-book/JGLUE", name="JCommonsenseQA", split=split_name)
    for split_name in ("train", "validation")
)


In [4]:
idx = 0
# Pretty-print the training example at position idx, then the feature
# schema of the training split.
pprint(train_dataset[idx])
pprint(train_dataset.features)

{'choice0': '世界',
 'choice1': '写真集',
 'choice2': '絵本',
 'choice3': '論文',
 'choice4': '図鑑',
 'label': 2,
 'q_id': 0,
 'question': '主に子ども向けのもので、イラストのついた物語が書かれているものはどれ？'}
{'choice0': Value(dtype='string', id=None),
 'choice1': Value(dtype='string', id=None),
 'choice2': Value(dtype='string', id=None),
 'choice3': Value(dtype='string', id=None),
 'choice4': Value(dtype='string', id=None),
 'label': ClassLabel(names=['choice0',
                            'choice1',
                            'choice2',
                            'choice3',
                            'choice4'],
                     id=None),
 'q_id': Value(dtype='int64', id=None),
 'question': Value(dtype='string', id=None)}


In [5]:
# Count how many answer options each question has.
all_num_choices = []
for q in train_dataset:
    # An option is any field whose key begins with "choice".
    num_choices = len([key for key in q if key.startswith("choice")])
    all_num_choices.append(num_choices)

# Show the distinct option counts that occur.
print(set(all_num_choices))

{5}


In [6]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained below.
transformer_model_name = "cl-tohoku/bert-base-japanese-v3"
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)

In [13]:
class Dataset1(Dataset):
    """Map-style dataset that keeps selected fields of each source row as a dict.

    Every row of the source is copied eagerly at construction time; only the
    question, the five choices and the label are retained.
    """

    # Fields copied from each source row, in the order they are stored.
    _KEPT_FIELDS = (
        "question",
        "choice0",
        "choice1",
        "choice2",
        "choice3",
        "choice4",
        "label",
    )

    def __init__(self, ds):
        """Iterate over ``ds`` once (with a tqdm progress bar) and store its rows."""
        rows = []
        for row in tqdm(ds):
            rows.append({name: row[name] for name in self._KEPT_FIELDS})
        self.features = rows

    def __len__(self):
        """Number of stored rows."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the stored row(s) at ``index`` (delegates to list indexing)."""
        return self.features[index]

# Wrap both splits: the training split first, then the validation split.
train_dataset1, valid_dataset1 = (
    Dataset1(train_dataset),
    Dataset1(valid_dataset),
)

100%|██████████| 8939/8939 [00:00<00:00, 11378.66it/s]
100%|██████████| 1119/1119 [00:00<00:00, 11792.97it/s]


In [14]:
# class Dataset1(IterableDataset):
#     def __init__(self, ds):
#         self.features = [
#             {
#                 'question': row['question'],
#                 'choice0': row['choice0'],
#                 'choice1': row['choice1'],
#                 'choice2': row['choice2'],
#                 'choice3': row['choice3'],
#                 'choice4': row['choice4'],
#                 'label': row['label']
#             } for row in tqdm(ds)
#         ]

#     def __len__(self):
#         return len(self.features)

#     def __iter__(self):
#         return iter(self.features)

# train_dataset1 = Dataset1(train_dataset)
# valid_dataset1 = Dataset1(valid_dataset)

In [16]:
# Peek at the first two wrapped examples.
train_dataset1[:2]
# train_dataset[0:2]

[{'question': '主に子ども向けのもので、イラストのついた物語が書かれているものはどれ？',
  'choice0': '世界',
  'choice1': '写真集',
  'choice2': '絵本',
  'choice3': '論文',
  'choice4': '図鑑',
  'label': 2},
 {'question': '未成年者を監護・教育し，彼らを監督し，彼らの財産上の利益を守る法律上の義務をもつ人は？',
  'choice0': '浮浪者',
  'choice1': '保護者',
  'choice2': 'お坊さん',
  'choice3': '宗教者',
  'choice4': '預言者',
  'label': 1}]

In [17]:
# Check the size of the dataset
print(len(train_dataset1))
# Check the contents of the dataset by pulling its first item
tmp = next(iter(train_dataset1))
print(tmp)

8939
{'question': '主に子ども向けのもので、イラストのついた物語が書かれているものはどれ？', 'choice0': '世界', 'choice1': '写真集', 'choice2': '絵本', 'choice3': '論文', 'choice4': '図鑑', 'label': 2}


In [57]:
class DataCollator1():
    """Collate multiple-choice examples into (batch, choice, sequence) tensors.

    Each example is a mapping with a ``question`` string, the answer options
    under ``choice0`` .. ``choice{num_choices - 1}`` and, optionally, an
    integer ``label``.  Every question is paired with each of its choices
    and the pairs are tokenized together as sequence pairs.
    """

    def __init__(self, tokenizer, max_length=64, num_choices=5):
        """Store the tokenizer and the collation settings.

        Args:
            tokenizer: Callable that accepts two parallel lists of strings
                plus ``padding``, ``truncation``, ``max_length`` and
                ``return_tensors`` and returns a mapping of 2-D tensors.
            max_length: Maximum sequence length passed to the tokenizer.
            num_choices: Number of ``choiceN`` fields read from each example.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_choices = num_choices

    def __call__(self, examples):
        """Build one batch from a list of examples.

        Args:
            examples: List of example mappings (see the class docstring).

        Returns:
            Dict holding every tokenizer output reshaped to
            ``(batch_size, num_choices, seq_len)``, plus ``labels`` of shape
            ``(batch_size,)`` when the examples carry a ``label``.

        Raises:
            KeyError: If an example lacks ``question`` or one of the
                expected ``choiceN`` fields, or if only some of the examples
                carry a ``label``.
        """
        choice_keys = [f"choice{i}" for i in range(self.num_choices)]
        batch_size = len(examples)

        # Flatten to batch_size * num_choices (question, choice) pairs,
        # repeating each question once per choice.
        repeated_question_list_flat = [
            example["question"] for example in examples for _ in choice_keys
        ]
        choice_list_flat = [
            example[key] for example in examples for key in choice_keys
        ]

        # Encode as (batch_size * num_choices, seq_len).
        encodings = self.tokenizer(repeated_question_list_flat,
                                   choice_list_flat,
                                   padding=True,
                                   truncation=True,
                                   max_length=self.max_length,
                                   return_tensors='pt')

        # Reshape to (batch_size, num_choices, seq_len).
        batch = {
            k: v.view(batch_size, self.num_choices, -1)
            for k, v in encodings.items()
        }

        # Labels are optional so the collator can also serve prediction on
        # unlabeled data.  The decision is taken from the first example, so
        # a partially labeled batch still fails loudly with KeyError.
        if examples and "label" in examples[0]:
            batch["labels"] = torch.tensor(
                [example["label"] for example in examples]
            )

        return batch


# Load the tokenizer for transformer_model_name and build the collator from
# it, leaving the collator's other constructor arguments at their defaults.
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
data_collator = DataCollator1(tokenizer)

In [58]:
# Sanity-check the collator by pulling one batch through a DataLoader.
loader = DataLoader(train_dataset1, batch_size=4, collate_fn=data_collator)
batch = next(iter(loader))

# Report the size of every tensor in the batch.
for k, v in batch.items():
    print(k, v.shape)

# Inspect the token ids of the first example and the batch labels.
pprint(batch["input_ids"][0])
print(batch["labels"])

input_ids torch.Size([4, 5, 40])
token_type_ids torch.Size([4, 5, 40])
attention_mask torch.Size([4, 5, 40])
labels torch.Size([4])
tensor([[    2, 13182, 16044, 12994,   464, 12518,   457,   384, 14930,   464,
         12584,   449, 13360,   430, 14220,   494,   456, 12483, 12518,   465,
         19382,    46,     3, 12575,     3,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    2, 13182, 16044, 12994,   464, 12518,   457,   384, 14930,   464,
         12584,   449, 13360,   430, 14220,   494,   456, 12483, 12518,   465,
         19382,    46,     3, 13409,  6460,     3,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    2, 13182, 16044, 12994,   464, 12518,   457,   384, 14930,   464,
         12584,   449, 13360,   430, 14220,   494,   456, 12483, 12518,   465,
         19382,    46,     3, 20647,     3,     0,     0,     0,     0,     

In [59]:
# Decode the token ids of example 0, choice 2 back into text.
print(tokenizer.decode(batch["input_ids"][0, 2]))

# Show the labels of the batch.
print(batch["labels"])

[CLS] 主に 子ども 向け の もの で 、 イラスト の つい た 物語 が 書か れ て いる もの は どれ? [SEP] 絵本 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
tensor([2, 1, 3, 4])


In [23]:
# Show the feature definition of the "label" column.
train_dataset.features["label"]

ClassLabel(names=['choice0', 'choice1', 'choice2', 'choice3', 'choice4'], id=None)

In [28]:
# NOTE(review): same checkpoint string as transformer_model_name defined
# in an earlier cell — consider keeping a single name.
transformers_model_name = "cl-tohoku/bert-base-japanese-v3"

# Build the multiple-choice model from the pretrained checkpoint; num_labels
# is taken from the number of classes of the dataset's "label" feature.
model = AutoModelForMultipleChoice.from_pretrained(
    transformers_model_name,
    num_labels=train_dataset.features["label"].num_classes,
)

# Print the concrete class that the Auto* factory instantiated.
print(model.__class__.__name__)

# Check the model output on the batch built in the DataLoader cell.
outputs = model(**batch)
outputs

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForMultipleChoice


MultipleChoiceModelOutput(loss=tensor(1.8538, grad_fn=<NllLossBackward0>), logits=tensor([[-0.5226, -0.4835, -0.5693, -0.5206, -0.5575],
        [-0.7181, -0.9051,  0.1348, -0.6280, -0.1062],
        [ 0.4246,  0.3564,  0.4028, -0.0059,  0.2815],
        [ 0.4608,  0.4725,  0.4362,  0.0568,  0.2600]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [29]:
training_args = TrainingArguments(
    # --- output ---
    output_dir="output_jcommonsenseqa",  # folder where results are saved
    # --- batching ---
    per_device_train_batch_size=8,  # batch size during training
    per_device_eval_batch_size=8,  # batch size during evaluation
    # --- optimization schedule ---
    num_train_epochs=3,  # number of epochs
    learning_rate=2e-5,  # learning rate
    lr_scheduler_type="linear",  # kind of learning-rate scheduler
    warmup_ratio=0.1,  # length of the learning-rate warm-up
    fp16=True,  # enable automatic mixed precision
    # --- per-epoch bookkeeping ---
    logging_strategy="epoch",  # when to log
    evaluation_strategy="epoch",  # when to evaluate on the validation set
    save_strategy="epoch",  # when to save checkpoints
    # --- model selection ---
    load_best_model_at_end=True,  # after training, load the model that was best on the validation set
    metric_for_best_model="accuracy",  # metric that decides the best model
    # --- dataset columns ---
    # Whether to drop columns that are not model inputs
    # (https://qiita.com/m__k/items/2c4e476d7ac81a3a44af).
    remove_unused_columns=False,
)

In [30]:
import numpy as np

def compute_accuracy(
    eval_pred: tuple[np.ndarray, np.ndarray]
) -> dict[str, float]:
    """Compute the accuracy from per-label scores and gold labels.

    Args:
        eval_pred: Pair ``(scores, labels)``; each row of ``scores`` holds
            one score per label.

    Returns:
        ``{"accuracy": value}`` where value is the fraction of rows whose
        highest-scoring index equals the gold label.
    """
    scores, labels = eval_pred
    # The predicted label is the index of the largest score in each row.
    predicted = np.argmax(scores, axis=1)
    is_correct = predicted == labels
    return {"accuracy": is_correct.mean()}

In [31]:
# Assemble the Trainer and start fine-tuning.
# NOTE(review): the raw train_dataset / valid_dataset splits are passed
# here, not the Dataset1 wrappers built earlier — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_accuracy,
)
trainer.train()



  0%|          | 0/3354 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'loss': 0.7662, 'learning_rate': 1.4824387011265739e-05, 'epoch': 1.0}


  0%|          | 0/140 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'eval_loss': 0.4517534673213959, 'eval_accuracy': 0.8319928507596068, 'eval_runtime': 3.9359, 'eval_samples_per_second': 284.307, 'eval_steps_per_second': 35.57, 'epoch': 1.0}


KeyboardInterrupt: 

In [19]:
# Evaluate the model on the validation set
eval_metrics = trainer.evaluate(valid_dataset)
pprint(eval_metrics)

  0%|          | 0/140 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'epoch': 3.0,
 'eval_accuracy': 0.837354781054513,
 'eval_loss': 0.5849103331565857,
 'eval_runtime': 4.0165,
 'eval_samples_per_second': 278.599,
 'eval_steps_per_second': 34.856}
