In [None]:
!pip install datasets

In [None]:
import os
import glob
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import GPT2LMHeadModel, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset

In [None]:
# Mount Google Drive at /content/drive; the corpus, CSV, checkpoint and model
# paths used by this notebook are all located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ---------------------------------------------------------
# Load the data
# ---------------------------------------------------------
def load_livedoor_corpus(
  corpus_dir,
  max_articles_per_genre=100,
  label_path="/content/drive/MyDrive/03.授業実施/2024-1st/AI_Web 実習/43.Fine Tuning - Japanese Generator (for Colab)/label.pkl",
):
  """Read article texts from a corpus laid out as one sub-directory per genre.

  Every ``*.txt`` file (except ``LICENSE.txt``) inside each immediate
  sub-directory of ``corpus_dir`` is read as UTF-8. The first two lines of a
  file are treated as metadata and dropped; the remaining lines are
  concatenated into a single string with line breaks and tabs removed.

  Directories and files are visited in sorted order so that the selected
  subset and the saved label list are the same on every run.

  Args:
    corpus_dir: Root directory that contains one sub-directory per genre.
    max_articles_per_genre: Maximum number of articles to read from each
      genre directory. ``None`` reads every article.
    label_path: File the list of genre directory names is pickled to.
      ``None`` skips writing the label file.

  Returns:
    A list of article texts, grouped by genre in sorted directory order.

  Raises:
    ValueError: If ``max_articles_per_genre`` is negative.
  """
  if max_articles_per_genre is not None and max_articles_per_genre < 0:
    raise ValueError("max_articles_per_genre must be None or a non-negative integer")

  genre_dirs = sorted(
    name for name in os.listdir(path=corpus_dir)
    if os.path.isdir(os.path.join(corpus_dir, name))
  )

  news_data = []
  for genre in genre_dirs:
    article_paths = sorted(
      path for path in glob.glob(os.path.join(corpus_dir, genre, "*.txt"))
      if os.path.basename(path) != "LICENSE.txt"
    )
    # Apply the cap before reading so exactly `max_articles_per_genre` files
    # are opened per genre (slicing with None keeps everything).
    for article_path in article_paths[:max_articles_per_genre]:
      with open(article_path, "r", encoding="utf-8") as article_file:
        body_lines = article_file.read().splitlines()[2:]   # the first 2 lines are metadata
      # splitlines() already dropped the line breaks; only tabs remain to strip.
      news_data.append("".join(body_lines).replace("\t", ""))

  print(genre_dirs)
  print(f"dir_count = {len(genre_dirs)}")
  print(f"file_count = {len(news_data)}")

  if label_path is not None:
    label_dir = os.path.dirname(label_path)
    if label_dir:
      os.makedirs(label_dir, exist_ok=True)
    with open(label_path, "wb") as label_file:
      pickle.dump(genre_dirs, label_file)

  return news_data

news_data = load_livedoor_corpus('/content/drive/Shareddrives/_00g-【AIシステム科／AI04】学生公開ドライブ１（参照用）/20240901 - AI・Web実習_データフォルダー/livedoor news copus')

# Fix the shuffle seed so the train/eval split (and therefore the saved CSVs and
# the training run) is the same on every Restart & Run All.
train_texts, eval_texts = train_test_split(news_data, shuffle=True, random_state=42)
train_data = pd.DataFrame(train_texts, columns=["text"])
eval_data = pd.DataFrame(eval_texts, columns=["text"])

In [None]:
# ---------------------------------------------------------
# Save the data
# ---------------------------------------------------------
csv_path = "/content/drive/MyDrive/03.授業実施/2024-1st/AI_Web 実習/43.Fine Tuning - Japanese Generator (for Colab)/csv/"

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(csv_path, exist_ok=True)

# Persist both splits so the exact training / evaluation texts can be inspected later.
train_data.to_csv(os.path.join(csv_path, "train_data.csv"), index=False, encoding='utf-8')
eval_data.to_csv(os.path.join(csv_path, "eval_data.csv"), index=False, encoding='utf-8')

In [None]:
# ---------------------------------------------------------
# Tokenization
# ---------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("rinna/japanese-gpt2-medium")
# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
  """Tokenize a batch of rows, truncating each text to at most 512 tokens.

  No padding is applied here: padding inside a batched ``map`` would pad every
  row to the longest text of the whole map batch and store that padding in the
  dataset. The language-modeling data collator pads each training batch to its
  own longest sequence instead.

  Args:
    batch: Mapping with a ``"text"`` entry holding a list of strings.

  Returns:
    The tokenizer output for the batch (``input_ids`` and ``attention_mask``
    are the columns kept below).
  """
  return tokenizer(batch["text"], truncation=True, max_length=512)

train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)

train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

# Expose only the model inputs, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [None]:
# ---------------------------------------------------------
# Training and evaluation
# ---------------------------------------------------------
model = GPT2LMHeadModel.from_pretrained("rinna/japanese-gpt2-medium")
# mlm=False selects causal (next-token) language modeling; the collator pads
# each batch and builds the labels from the input ids.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

def preprocess_logits_for_metrics(logits, labels):
  """Reduce logits to predicted token ids before they are accumulated.

  Keeping only the arg-max ids avoids holding the full vocabulary-sized logits
  of the whole evaluation set in memory.

  Args:
    logits: Model logits, or a tuple whose first element is the logits.
    labels: Labels of the batch (unused).

  Returns:
    Tensor of predicted token ids with the vocabulary dimension removed.
  """
  if isinstance(logits, tuple):
    logits = logits[0]
  return logits.argmax(dim=-1)

def compute_metrics(result):
  """Compute next-token accuracy over the positions that carry a label.

  Args:
    result: Evaluation output with ``predictions`` (token ids, or logits with
      a trailing vocabulary dimension) and ``label_ids`` arrays.

  Returns:
    ``{"accuracy": float}``; 0.0 when no position carries a label.
  """
  labels = result.label_ids
  preds = result.predictions
  if preds.ndim == labels.ndim + 1:
    # Raw logits were passed in: collapse the vocabulary dimension.
    preds = preds.argmax(-1)

  # The prediction at position t is the model's guess for the token at t + 1.
  shifted_preds = preds[..., :-1]
  shifted_labels = labels[..., 1:]

  # -100 marks positions excluded from the loss (padding); skip them here too.
  scored = shifted_labels != -100
  scored_count = int(scored.sum())
  if scored_count == 0:
    acc = 0.0
  else:
    acc = float((shifted_preds[scored] == shifted_labels[scored]).sum()) / scored_count
  return {
    "accuracy": acc
  }

training_args = TrainingArguments(
  output_dir = "/content/drive/MyDrive/03.授業実施/2024-1st/AI_Web 実習/43.Fine Tuning - Japanese Generator (for Colab)/results/",
  logging_dir = "/content/drive/MyDrive/03.授業実施/2024-1st/AI_Web 実習/43.Fine Tuning - Japanese Generator (for Colab)/logs/",
  num_train_epochs = 3,
  per_device_train_batch_size = 2,
  per_device_eval_batch_size = 2,
  warmup_steps = 500,
  weight_decay = 0.01,
  eval_strategy = "steps",
  save_safetensors = False
)

trainer = Trainer(
  model = model,
  args = training_args,
  compute_metrics = compute_metrics,
  preprocess_logits_for_metrics = preprocess_logits_for_metrics,
  train_dataset = train_dataset,
  # eval_strategy="steps" requires an evaluation set.
  eval_dataset = eval_dataset,
  data_collator = data_collator
)

trainer.train()
# Final evaluation on the held-out split; the metrics dict is the cell output.
trainer.evaluate()

In [None]:
# ---------------------------------------------------------
# Save the model
# ---------------------------------------------------------
model_path = "/content/drive/MyDrive/03.授業実施/2024-1st/AI_Web 実習/43.Fine Tuning - Japanese Generator (for Colab)/model/"

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(model_path, exist_ok=True)

# Store the weights and the tokenizer together so the directory can be reloaded
# with from_pretrained(model_path).
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# ---------------------------------------------------------
# Generate text with the model
# ---------------------------------------------------------
def generate_text(prompt, model, tokenizer, max_length=100):
  """Sample one continuation of ``prompt`` from ``model``.

  Args:
    prompt: Text the generation starts from.
    model: Causal language model exposing ``generate``.
    tokenizer: Tokenizer matching ``model``.
    max_length: Upper bound on the length of the generated sequence, passed
      through to ``generate`` (the prompt tokens count toward it).

  Returns:
    The decoded sequence (prompt plus continuation) with special tokens removed.
  """
  encoded = tokenizer(prompt, return_tensors="pt")
  input_ids = encoded["input_ids"]
  attention_mask = encoded["attention_mask"]

  # Put the inputs on the same device as the model so this also works when the
  # model has been moved to a GPU.
  device = getattr(model, "device", None)
  if device is not None:
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

  # Pass the padding id explicitly, falling back to EOS when the tokenizer
  # defines no pad token.
  pad_token_id = tokenizer.pad_token_id
  if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

  output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=max_length,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    top_k=50,
    top_p=0.95,
    temperature=1.0,
    do_sample=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=pad_token_id,
  )
  return tokenizer.decode(output[0], skip_special_tokens=True)

# Reload the saved tokenizer and model from model_path, then generate text from
# a sample prompt.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

prompt = "今日はとても"
generated_text = generate_text(prompt=prompt, model=model, tokenizer=tokenizer)
print("生成された文章:", generated_text)