In [1]:
from pprint import pprint
from datasets import load_dataset
from transformers import BatchEncoding, AutoTokenizer
from tqdm import tqdm

In [24]:
train_dataset = load_dataset('llm-book/JGLUE', name='JSTS', split='train')
valid_dataset = load_dataset('llm-book/JGLUE', name='JSTS', split='validation')

In [25]:
idx = 100
print(train_dataset[idx])
print(valid_dataset[idx])

{'sentence_pair_id': '100', 'yjcaptions_id': '103678-107788-107792', 'sentence1': '線路の上を黄色い電車が走っています。', 'sentence2': '線路の上を黄色い電車が走っています。', 'label': 4.800000190734863}
{'sentence_pair_id': '100', 'yjcaptions_id': '133467-2721-2724', 'sentence1': 'ベンチに座っている人の前にハトが数羽います。', 'sentence2': 'ベンチに座った人の前に２羽のハトがいます。', 'label': 3.799999952316284}


In [26]:
transformer_model_name = "cl-tohoku/bert-base-japanese-v3"
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)

In [27]:
from torch.utils.data import IterableDataset, Dataset
from tqdm import tqdm

class Dataset1(IterableDataset):
    def __init__(self, ds):
        self.features = [
            {
                'sentence_pair_id': row['sentence_pair_id'],
                'yjcaptions_id': row['yjcaptions_id'],
                'sentence1': row['sentence1'],
                'sentence2': row['sentence2'],
                'label': row['label']
            } for row in tqdm(ds)
        ]

    def __len__(self):
        return len(self.features)

    def __iter__(self):
        return iter(self.features)

train_dataset1 = Dataset1(train_dataset)
valid_dataset1 = Dataset1(valid_dataset)

100%|██████████| 12451/12451 [00:00<00:00, 15779.94it/s]
100%|██████████| 1457/1457 [00:00<00:00, 13794.53it/s]


In [28]:
# datasetのサイズを確認
print(len(train_dataset1))
# datasetの中身を確認
tmp = next(iter(train_dataset1))
print(tmp)

12451
{'sentence_pair_id': '0', 'yjcaptions_id': '10005_480798-10996-92616', 'sentence1': '川べりでサーフボードを持った人たちがいます。', 'sentence2': 'トイレの壁に黒いタオルがかけられています。', 'label': 0.0}


In [29]:
import torch
from transformers import AutoTokenizer

class DataCollator1():
    def __init__(self, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        # sentence1とsentence2を連結し、encodingsを返す
        examples = {
            'sentence1': list(map(lambda x: x['sentence1'], examples)),
            'sentence2': list(map(lambda x: x['sentence2'], examples)),
            'label': list(map(lambda x: x['label'], examples)),
        }
        encodings = self.tokenizer(
                                   examples['sentence1'],
                                   examples['sentence2'],
                                   padding=True, 
                                   truncation=True,
                                   max_length=self.max_length,
                                   return_tensors='pt')

        encodings['labels'] = torch.tensor(examples['label'])
        return encodings


tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
data_collator = DataCollator1(tokenizer)

In [30]:
from torch.utils.data import DataLoader
loader = DataLoader(train_dataset1, collate_fn=data_collator, batch_size=4)
batch = next(iter(loader))
# batch = next(iter(loader))

# batchの各keyのsizeを確認
for k, v in batch.items():
    print(k, v.size())

# batchの中身を確認
pprint(batch["input_ids"][3])
print(batch["labels"])

input_ids torch.Size([4, 32])
token_type_ids torch.Size([4, 32])
attention_mask torch.Size([4, 32])
labels torch.Size([4])
tensor([    2, 13341,   430, 13275,   500, 18967, 12867,   456, 16996,   456,
          422, 12995,   385,     3, 19898,   500,  1428,  7213, 16629,   464,
         1286,  9729,   722,   430, 12598,   494,   456,   422, 12995,   385,
            3,     0])
tensor([0.0000, 3.8000, 4.0000, 0.2000])


In [31]:
train_dataset.features["label"]

Value(dtype='float32', id=None)

In [32]:
from transformers import AutoModelForSequenceClassification

transformers_model_name = "cl-tohoku/bert-base-japanese-v3"

model = AutoModelForSequenceClassification.from_pretrained(
    transformers_model_name,
    num_labels=1,
    problem_type="regression",
)

print(type(model).__name__)

# モデルの出力を確認
outputs = model(**batch)
outputs

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification


SequenceClassifierOutput(loss=tensor(7.6491, grad_fn=<MseLossBackward0>), logits=tensor([[ 0.0274],
        [-0.0238],
        [ 0.0057],
        [ 0.0597]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [33]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output_jsts",  # 結果の保存フォルダ
    per_device_train_batch_size=32,  # 訓練時のバッチサイズ
    per_device_eval_batch_size=32,  # 評価時のバッチサイズ
    learning_rate=2e-5,  # 学習率
    lr_scheduler_type="linear",  # 学習率スケジューラの種類
    warmup_ratio=0.1,  # 学習率のウォームアップの長さを指定
    num_train_epochs=3,  # エポック数
    save_strategy="epoch",  # チェックポイントの保存タイミング
    logging_strategy="epoch",  # ロギングのタイミング
    evaluation_strategy="epoch",  # 検証セットによる評価のタイミング
    load_best_model_at_end=True,  # 訓練後に開発セットで最良のモデルをロード
    metric_for_best_model="spearmanr",  # 最良のモデルを決定する評価指標
    fp16=True,  # 自動混合精度演算の有効化
    remove_unused_columns=False, # 入力データに含まれない列を削除するかどうか(https://qiita.com/m__k/items/2c4e476d7ac81a3a44af)
)

In [34]:
import numpy as np
from scipy.stats import pearsonr, spearmanr

def compute_correlation_metrics(
    eval_pred: tuple[np.ndarray, np.ndarray]
) -> dict[str, float]:
    """予測スコアと正解スコアから各種相関係数を計算"""
    predictions, labels = eval_pred
    predictions = predictions.squeeze(1)
    return {
        "pearsonr": pearsonr(predictions, labels).statistic,
        "spearmanr": spearmanr(predictions, labels).statistic,
    }

In [35]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    train_dataset=train_dataset1,
    eval_dataset=valid_dataset1,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_correlation_metrics,
)

trainer.train()



  0%|          | 0/1170 [00:00<?, ?it/s]

{'loss': 1.107, 'learning_rate': 1.4871794871794874e-05, 'epoch': 1.0}


  0%|          | 0/46 [00:00<?, ?it/s]

{'eval_loss': 0.46136826276779175, 'eval_pearsonr': 0.8956831678327875, 'eval_spearmanr': 0.8512333491469819, 'eval_runtime': 1.9739, 'eval_samples_per_second': 738.137, 'eval_steps_per_second': 23.304, 'epoch': 1.0}
{'loss': 0.3723, 'learning_rate': 7.4833808167141505e-06, 'epoch': 2.0}


  0%|          | 0/46 [00:00<?, ?it/s]

{'eval_loss': 0.3786185085773468, 'eval_pearsonr': 0.9085512049549194, 'eval_spearmanr': 0.8669636297172041, 'eval_runtime': 1.9249, 'eval_samples_per_second': 756.924, 'eval_steps_per_second': 23.897, 'epoch': 2.0}
{'loss': 0.2762, 'learning_rate': 7.597340930674265e-08, 'epoch': 3.0}


  0%|          | 0/46 [00:00<?, ?it/s]

{'eval_loss': 0.3605651557445526, 'eval_pearsonr': 0.9111519230333195, 'eval_spearmanr': 0.8713183009329197, 'eval_runtime': 1.9451, 'eval_samples_per_second': 749.075, 'eval_steps_per_second': 23.65, 'epoch': 3.0}
{'train_runtime': 271.8541, 'train_samples_per_second': 137.401, 'train_steps_per_second': 4.304, 'train_loss': 0.5851514441335303, 'epoch': 3.0}


TrainOutput(global_step=1170, training_loss=0.5851514441335303, metrics={'train_runtime': 271.8541, 'train_samples_per_second': 137.401, 'train_steps_per_second': 4.304, 'train_loss': 0.5851514441335303, 'epoch': 3.0})

In [36]:
# 検証セットでモデルを評価
eval_metrics = trainer.evaluate(valid_dataset)
pprint(eval_metrics)

  0%|          | 0/46 [00:00<?, ?it/s]

{'epoch': 3.0,
 'eval_loss': 0.3605651557445526,
 'eval_pearsonr': 0.9111519230333195,
 'eval_runtime': 2.029,
 'eval_samples_per_second': 718.091,
 'eval_spearmanr': 0.8713183009329197,
 'eval_steps_per_second': 22.671}
