In [1]:
import os
import re
import glob
import pickle
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset

torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------------------------------------------------
# データの読み込み
# ---------------------------------------------------------
def load_corpus(corpus_dir):

  dir_list = []
  news_data = []
  dir_count = 0
  file_count = 0
  read_count = 0

  for f in os.listdir(path=corpus_dir):
    if os.path.isdir(os.path.join(corpus_dir, f)):
      dir_list.append(f)

  for i in range(len(dir_list)):

    if os.path.isdir(os.path.join(corpus_dir, dir_list[i])):
      read_count = 0
      for file_path in glob.glob(os.path.join(corpus_dir, dir_list[i], "*.txt")):
        if os.path.basename(file_path) != "LICENSE.txt":
          with open(file_path, "r", encoding='shift-jis') as f:
            content = f.read()
            text = "".join(content)
            text = sanitize(text)
            news_data.append(text)
            if (read_count >= 10000):       # ディレクトリー当たりの読み込む最大ファイル数
              break
            read_count = read_count + 1 
          file_count = file_count + 1
      dir_count = dir_count + 1

  print(dir_list)
  print(f"dir_count = {dir_count}")
  print(f"file_count = {file_count}")

  with open("./label.pkl", "wb") as f:
    pickle.dump(dir_list, f)

  return news_data

# ---------------------------------------------------------
# コンテンツの不要な部分を削除
# ---------------------------------------------------------
def sanitize(text):
  operations = [
    lambda text: re.split(r'\-{5,}', text)[2],
    lambda text: re.split(r'底本：', text)[0],
    lambda text: re.sub(r'《.+?》', '', text),
    lambda text: re.sub(r'［＃.+?］', '', text),
    lambda text: text.strip()
  ]

  for operation in operations:
    try:
      text = operation(text)
    except Exception as e:
      pass

  return text

# ---------------------------------------------------------
# メイン
# ---------------------------------------------------------
news_data = load_corpus('./text')

train_data, eval_data = train_test_split(news_data, shuffle=True)
train_data = pd.DataFrame(train_data, columns=["text"])
eval_data = pd.DataFrame(eval_data, columns=["text"])


['txt_ドイル アーサー・コナン', 'txt_江戸川 乱歩']
dir_count = 2
file_count = 141


In [3]:
# ---------------------------------------------------------
# データの保存
# ---------------------------------------------------------
csv_path = "./csv"

if not os.path.exists(csv_path):
  os.makedirs(csv_path)

train_data.to_csv(os.path.join(csv_path, "train_data.csv"), index=False, encoding='utf-8')
eval_data.to_csv(os.path.join(csv_path, "eval_data.csv"), index=False, encoding='utf-8')

In [4]:
# ---------------------------------------------------------
# トークナイズ
# ---------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("rinna/japanese-gpt2-xsmall")

def tokenize(batch):
  return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)

train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 105/105 [00:02<00:00, 37.43 examples/s]
Map: 100%|██████████| 36/36 [00:01<00:00, 31.84 examples/s]


In [5]:
# ---------------------------------------------------------
# 学習と評価
# ---------------------------------------------------------
model = GPT2LMHeadModel.from_pretrained("rinna/japanese-gpt2-xsmall")
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# for param in model.transformer.h[0:2].parameters():    # 先頭 2 層を凍結
#   param.requires_grad = False

# 最初の 12 層をフリーズし、最後の層は学習可能にする
# for param in model.transformer.h[:12].parameters():
#  param.requires_grad = False

for param in model.transformer.parameters():    # すべての層をフリーズ
  param.requires_grad = False

for param in model.lm_head.parameters():        # エラーにならないよう、出力層（lm_head）のフリーズを解除
  param.requires_grad = True

training_args = TrainingArguments(
  output_dir = "./results",                 # 結果を格納するディレクトリー
  logging_dir = "./logs",                   # 途中経過のログを格納するディレクトリー
  overwrite_output_dir = True,              # ファイルを上書きする
  num_train_epochs = 2,                     # エポック数
  per_device_train_batch_size = 4,          # 訓練時のバッチサイズ
  per_device_eval_batch_size = 4,           # 評価時のバッチサイズ
  warmup_steps = 500,                       # 学習系数がこのステップ数で徐々に増加
  weight_decay = 0.01,                      # 重みの減衰率
  eval_strategy = "steps",                  # 訓練中、一定のステップごとに評価
  save_safetensors = False                  # safetensors の使用を無効化
)

trainer = Trainer(
  model = model,                            # 使用するモデルを指定
  args = training_args,                     # TrainingArguments の設定
  train_dataset = train_dataset,            # 訓練用のデータ
  eval_dataset = eval_dataset,              # 評価用のデータ
  data_collator = data_collator             # データコレーターを設定
)

trainer.train()
trainer.evaluate()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 54/54 [00:23<00:00,  2.33it/s]


{'train_runtime': 23.176, 'train_samples_per_second': 9.061, 'train_steps_per_second': 2.33, 'train_loss': 4.549195466218172, 'epoch': 2.0}


100%|██████████| 9/9 [00:01<00:00,  5.82it/s]


{'eval_loss': 4.616745471954346,
 'eval_runtime': 1.7518,
 'eval_samples_per_second': 20.55,
 'eval_steps_per_second': 5.138,
 'epoch': 2.0}

In [6]:
# ---------------------------------------------------------
# モデルの保存
# ---------------------------------------------------------
model_path = "./model"

if not os.path.exists(model_path):
  os.makedirs(model_path)

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\spiece.model',
 './model\\added_tokens.json',
 './model\\tokenizer.json')