In [None]:
from datasets import load_dataset

# Map each split name to the CSV file that backs it:
# 'train' -> 'train_dataset.csv', 'validation' -> 'validation_dataset.csv'.
data_files = dict(
    train="train_dataset.csv",
    validation="validation_dataset.csv",
)

# Load both CSV files through the generic "csv" loader.
dataset_dict = load_dataset("csv", data_files=data_files)

# Show the loaded splits.
print(dataset_dict)

# Expose each split under its own name for the following cells.
train_dataset, validation_dataset = (
    dataset_dict[split_name] for split_name in ("train", "validation")
)

DatasetDict({
    train: Dataset({
        features: ['kanji_sentence', 'hiragana_sentence'],
        num_rows: 142418
    })
    validation: Dataset({
        features: ['kanji_sentence', 'hiragana_sentence'],
        num_rows: 13217
    })
})


In [None]:
from typing import Any
from transformers import BatchEncoding, PreTrainedTokenizer
from transformers import AutoTokenizer, PreTrainedTokenizer

# Checkpoint identifier; reused later when the model itself is loaded.
model_name = "retrieva-jp/t5-small-short"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

def preprocess_data(
    data: dict[str, Any],
    tokenizer: PreTrainedTokenizer,
    *,
    prefix: str = "ふりがな: ",
    max_input_length: int = 512,
    max_target_length: int = 512,
) -> BatchEncoding:
    """Tokenize kanji -> hiragana sentence pairs for seq2seq training.

    Works on a single example (the ``Dataset.map`` default, where each column
    value is a ``str``) and on a batch (``batched=True``, where each column
    value is a list of ``str``).

    Args:
        data: Mapping with the keys ``"kanji_sentence"`` (model input) and
            ``"hiragana_sentence"`` (target reading).
        tokenizer: Callable tokenizer; it is invoked with the text plus
            ``max_length`` and ``truncation=True`` and its result must expose
            ``"input_ids"`` and support item assignment.
        prefix: Task prefix prepended to every input sentence. The same
            prefix must be used again at inference time.
        max_input_length: Truncation length for the prefixed input.
        max_target_length: Truncation length for the target.

    Returns:
        The tokenizer output for the prefixed input, with the target token
        ids stored under ``"labels"``.
    """
    source = data["kanji_sentence"]
    target = data["hiragana_sentence"]

    # A list/tuple means we were called with batched=True: prefix each sentence.
    if isinstance(source, (list, tuple)):
        input_text = [prefix + sentence for sentence in source]
    else:
        input_text = prefix + source

    inputs = tokenizer(input_text, max_length=max_input_length, truncation=True)

    # The targets are tokenized on their own, without the task prefix.
    inputs["labels"] = tokenizer(
        target, max_length=max_target_length, truncation=True
    )["input_ids"]

    return inputs

# Tokenize the training split. Every original column is removed, so only the
# fields returned by preprocess_data remain.
train_data = train_dataset.map(
    function=preprocess_data,
    remove_columns=train_dataset.column_names,
    fn_kwargs=dict(tokenizer=tokenizer),
)

# Same preprocessing for the validation split.
validation_data = validation_dataset.map(
    function=preprocess_data,
    remove_columns=validation_dataset.column_names,
    fn_kwargs=dict(tokenizer=tokenizer),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/142418 [00:00<?, ? examples/s]

Map:   0%|          | 0/13217 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Load the seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

# Make the embedding matrix size equal to the tokenizer's vocabulary size.
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer.pad_token を eos_token に設定しました。")

# The collator is deliberately built without `model=`.
# NOTE(review): per the DataCollatorForSeq2Seq documentation, the collator only
# prepares `decoder_input_ids` when a model IS passed; without it the batch is
# just padded and the model is expected to derive its decoder inputs from
# `labels` itself — confirm against the transformers docs.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,  # pad explicitly
)

config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# --- Debug cell: sanity-check the data collator ---
print("--- DataCollatorの動作確認 ---")

# Take the first two preprocessed training examples as a hand-made batch.
debug_batch = [train_data[position] for position in (0, 1)]

# Feed the batch to the collator by hand and show what comes back.
try:
    collated_batch = data_collator(debug_batch)

    # Fields to display, each paired with the header printed above it.
    fields_to_show = (
        ("labels", "\nDataCollator が生成した 'labels':"),
        ("input_ids", "\nDataCollator が生成した 'input_ids':"),
    )
    for field, header in fields_to_show:
        print(header)
        print(collated_batch[field])

except Exception as err:
    # Report the failure as a message instead of letting it propagate.
    print(f"DataCollator の実行中にエラーが発生しました: {err}")

print("\n--- 確認ここまで ---")

--- DataCollatorの動作確認 ---

DataCollator が生成した 'labels':


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

tensor([[    8,  5494,   749, 16441, 10011,   910,   220,  6516,     4,  6823,
           288, 10960,   231,     3,   910,  1269,    15,  4505,   463,     4,
          1487, 10514,    14,  3927,    14,  4119,   288, 21853, 21172,  3348,
         24954,    18,  2252,    74, 29005,  6319,  1579,   173,     4, 10842,
           189,   231,   275,    59,   275,    16,  3927,    14,    91,  7142,
          1016,  6912,   121,  3556,    14,  2776,    77,     4,  4119,  1074,
            91,    16,     3,   870,   456,   220,  5494,   749, 16441, 10011,
             4,   910,   220,  6516,    16,     7, 19458, 11485,   345,    33,
          6823,   288, 10960,   231,    14,    49, 24688,  4712, 21172,  3348,
            33,     5,     1],
        [    8,  4317,  7908,     7,     3, 29005,  6319,  1579,   173, 16328,
          3029,  1074,  2914,  1489,    77,  7000,   565,  4935,     4, 10842,
           189,   231,   275,    59,   275,  6912,   121,  3556,    14,  2776,
            77,  7000

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers.trainer_utils import set_seed

# Seed the random number generators so that runs are repeatable.
set_seed(42)

# Training configuration: evaluate, checkpoint and log once per epoch, and
# reload the best checkpoint when training finishes.
train_args = Seq2SeqTrainingArguments(
    output_dir="output_t5_hurigana",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    fp16=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    # `tokenizer=` is deprecated for Trainer classes and emits a warning on
    # construction; `processing_class=` is the documented replacement.
    processing_class=tokenizer,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,3.0937,2.200756
2,1.4854,1.870006
3,1.2138,1.791516


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=53409, training_loss=1.9309669104224008, metrics={'train_runtime': 8220.9862, 'train_samples_per_second': 51.971, 'train_steps_per_second': 6.497, 'total_flos': 1.2206861131272192e+16, 'train_loss': 1.9309669104224008, 'epoch': 3.0})

In [None]:
from google.colab import drive

drive.mount("drive")

!cp -r output_t5_hurigana drive/MyDrive/llm-book

ValueError: mount failed

In [None]:
from transformers import pipeline

# Build the inference pipeline.
# - The task is "text2text-generation" (the task type used for T5).
# - `trainer.model` is the model held by the trainer after training; the
#   training arguments request that the best checkpoint be loaded at the end.
# - `tokenizer` is the same tokenizer that was used for preprocessing.
# - The pipeline runs on whatever device the model already lives on, so the
#   cell works on both GPU and CPU-only runtimes without editing an index.
# NOTE: `trainer` and `tokenizer` come from earlier cells; this cell fails
# with NameError if it is run in a kernel where those cells were not executed.
furigana_pipe = pipeline(
    "text2text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
    device=trainer.model.device,
)

# Must be the same task prefix that was prepended during training.
PREFIX="ふりがな: "

kanji_text_2 = "国の重要文化財を訪ねる。"
result_2 = furigana_pipe(PREFIX + kanji_text_2)  # prepend the prefix

print(f"入力: {kanji_text_2}")
print(f"ふりがな: {result_2[0]['generated_text']}")

NameError: name 'trainer' is not defined