In [1]:
import os
import glob
import pickle
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset

torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------------------------------------------------
# データの読み込み
# ---------------------------------------------------------
def load_corpus(corpus_dir):

  dir_list = []
  news_data = []
  dir_count = 0
  file_count = 0
  read_count = 0

  for f in os.listdir(path=corpus_dir):
    if os.path.isdir(os.path.join(corpus_dir, f)):
      dir_list.append(f)

  for i in range(len(dir_list)):

    if os.path.isdir(os.path.join(corpus_dir, dir_list[i])):
      read_count = 0
      for file_path in glob.glob(os.path.join(corpus_dir, dir_list[i], "*.txt")):
        if os.path.basename(file_path) != "LICENSE.txt":
          with open(file_path, "r", encoding="utf-8") as f:
            content = f.read().splitlines()[2:]       # 最初の 2 行はメタデータなのでスキップ
            text = "".join(content)
            text = text.translate(str.maketrans({"\n":"", "\r":"", "\t":"", "\u3000":""}))
            news_data.append(text)
            if (read_count >= 10000):       # ディレクトリー当たりの読み込む最大ファイル数
              break
            read_count = read_count + 1 
          file_count = file_count + 1
      dir_count = dir_count + 1

  print(dir_list)
  print(f"dir_count = {dir_count}")
  print(f"file_count = {file_count}")

  with open("./label.pkl", "wb") as f:
    pickle.dump(dir_list, f)

  return news_data

news_data = load_corpus('./text')

train_data, eval_data = train_test_split(news_data, shuffle=True)
train_data = pd.DataFrame(train_data, columns=["text"])
eval_data = pd.DataFrame(eval_data, columns=["text"])

['movie-enter']
dir_count = 1
file_count = 870


In [3]:
# ---------------------------------------------------------
# データの保存
# ---------------------------------------------------------
csv_path = "./csv"

if not os.path.exists(csv_path):
  os.makedirs(csv_path)

train_data.to_csv(os.path.join(csv_path, "train_data.csv"), index=False, encoding='utf-8')
eval_data.to_csv(os.path.join(csv_path, "eval_data.csv"), index=False, encoding='utf-8')

In [4]:
# ---------------------------------------------------------
# トークナイズ
# ---------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("rinna/japanese-gpt2-small")

def tokenize(batch):
  return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)

train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 652/652 [00:00<00:00, 1056.64 examples/s]
Map: 100%|██████████| 218/218 [00:00<00:00, 936.61 examples/s]


In [5]:
# ---------------------------------------------------------
# 評価と学習
# ---------------------------------------------------------
model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-small")
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# for param in model.transformer.h[:2].parameters():    # 先頭 2 層を凍結
#   param.requires_grad = False

training_args = TrainingArguments(
  output_dir = "./results",                 # 結果を格納するディレクトリー
  logging_dir = "./logs",                   # 途中経過のログを格納するディレクトリー
  overwrite_output_dir = True,              # ファイルを上書きする
  num_train_epochs = 2,                     # エポック数
  per_device_train_batch_size = 4,          # 訓練時のバッチサイズ
  per_device_eval_batch_size = 4,           # 評価時のバッチサイズ
  warmup_steps = 500,                       # 学習系数がこのステップ数で徐々に増加
  weight_decay = 0.01,                      # 重みの減衰率
  eval_strategy = "steps",                  # 訓練中、一定のステップごとに評価
  save_safetensors = False                  # safetensors の使用を無効化
)

trainer = Trainer(
  model = model,                            # 使用するモデルを指定
  args = training_args,                     # TrainingArguments の設定
  train_dataset = train_dataset,            # 訓練用のデータ
  eval_dataset = eval_dataset,              # 評価用のデータ
  data_collator = data_collator             # データコレーターを設定
)

trainer.train()
trainer.evaluate()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 326/326 [02:39<00:00,  2.05it/s]


{'train_runtime': 159.2254, 'train_samples_per_second': 8.19, 'train_steps_per_second': 2.047, 'train_loss': 3.3816337351418713, 'epoch': 2.0}


100%|██████████| 55/55 [00:06<00:00,  8.53it/s]


{'eval_loss': 3.122377634048462,
 'eval_runtime': 6.7426,
 'eval_samples_per_second': 32.332,
 'eval_steps_per_second': 8.157,
 'epoch': 2.0}

In [6]:
# ---------------------------------------------------------
# モデルの保存
# ---------------------------------------------------------
model_path = "./model"

if not os.path.exists(model_path):
  os.makedirs(model_path)

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\spiece.model',
 './model\\added_tokens.json',
 './model\\tokenizer.json')