<a href="https://colab.research.google.com/github/miho777/FineTuning-open-calm/blob/main/Piper_FineTuning_0401b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install accelerate
!pip install transformers[torch]

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
# from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import transformers
import torch

In [None]:
# Load the additional training data from a CSV file.
# NOTE: this path lives in a volatile (non-persistent) storage area.
csv_path = "/content/redfish_llm.csv"
df = pd.read_csv(csv_path)

In [None]:
# df.shape

In [None]:
# Display the additional training data
df

In [None]:
### Select the base LLM; candidate checkpoints:
# cyberagent/open-calm-small
# cyberagent/open-calm-medium
# cyberagent/open-calm-large
base_model = "cyberagent/open-calm-medium"

# Load the tokenizer that belongs to the chosen checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained(base_model)

# Load the base causal-LM weights that will be fine-tuned
model = transformers.AutoModelForCausalLM.from_pretrained(base_model)

# Preprocessing helper for the additional training data
def preprocess_function(examples):
    """Tokenize the ``"input"`` entry of ``examples`` into PyTorch tensors.

    Args:
        examples: Object indexable by the key ``"input"``; the value is passed
            to the notebook-level ``tokenizer`` unchanged.

    Returns:
        The result of calling ``tokenizer`` with ``return_tensors="pt"``.
    """
    # Variants tried before and currently disabled:
    #   padding="max_length", max_length=8,  truncation=True
    #   padding=True,         max_length=24, truncation=True
    encoded = tokenizer(examples["input"], return_tensors="pt")
    return encoded


In [None]:
# Turn the DataFrame into a DatasetDict with a single "train" split
train_dataset = Dataset.from_pandas(df)
data = DatasetDict({"train": train_dataset})


def _tokenize_output_column(samples):
    """Tokenize the "output" column of the rows handed over by ``map``."""
    return tokenizer(samples["output"])


# Mapping `preprocess_function` over `train_dataset` is left disabled;
# the "output" column is tokenized instead, one batch of rows per call.
data = data.map(_tokenize_output_column, batched=True)

In [None]:
# Sanity check: show the record at index 1 of the tokenized "train" split
data["train"][1]

In [None]:
# Fine-tuning configuration
# *** If pip-related errors do not go away, use Runtime > Restart session ***
training_args = transformers.TrainingArguments(
    output_dir="./output",
    num_train_epochs=5,
    warmup_steps=10,
    weight_decay=0.1,
    logging_steps=1,  # emit a log entry at every update step
    save_steps=10,  # write a checkpoint every 10 update steps
    # A checkpoint every 10 steps would otherwise accumulate in ./output for
    # the whole run; keep only the most recent ones to bound disk usage.
    save_total_limit=2,
    # Settings tried earlier and currently disabled:
    #   per_device_train_batch_size=4, gradient_accumulation_steps=4,
    #   warmup_steps=50 (also 5), max_steps=10 (also 200 / 500),
    #   learning_rate=2e-4, fp16=True
)

# mlm=False selects causal-LM collation instead of masked-LM masking.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=training_args,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning
trainer.train()

In [None]:
### Inference ###

input_text = "日本の有名な山は？"
# Alternative prompts:
# input_text = "日本の有名な観光地は？"
# input_text = "ネットワークコレクションのURIは何？"

# Training may leave the model in training mode; switch to evaluation mode
# before generating.
model.eval()

# Tokenize the prompt and move the tensors to the device the model lives on.
# After training on a GPU the model is on that GPU, while freshly tokenized
# tensors are created on the CPU; generate() fails if the two differ.
# (Named `inputs` so the builtin `input` is not shadowed.)
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Run generation without tracking gradients
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=24,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode the generated token ids back into text
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the result
print(output_text)

In [None]:
# Save the fine-tuned model
save_dir = "./output/fine_tuned_model"
trainer.save_model(save_dir)

# Depending on the transformers version, save_model() may not write the
# tokenizer when none was passed to the Trainer. Saving it explicitly keeps
# the directory self-contained, so both the model and the tokenizer can be
# reloaded from `save_dir` with from_pretrained().
tokenizer.save_pretrained(save_dir)