# 必要ライブラリのインストール

In [None]:
# Install the Azure ML client library and the Azure identity library.
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install azure-ai-ml
%pip install azure-identity

In [None]:
# Install azureml-core (the canonical spelling of the `azureml.core` requirement)
# into the environment of the running kernel.
%pip install azureml-core

# 実行元ジョブ用ディレクトリの作成＋ジョブ名の定義

In [None]:
import os
import shutil

# Create the working directory for this job by copying the template.
# The copy is skipped when the destination already exists, so re-running the
# cell does not nest a second copy inside it (which is what `cp -r` does when
# the target directory is already there).
if not os.path.isdir("dummy"):
    shutil.copytree("template_job", "dummy")

# ジョブ名の定義

In [None]:
# The job name is the name of the directory created by the copy step above
# (e.g. `cp -r template_job test_job` -> "test_job").
# BASE_DIR is built from job_name, so it has to match that directory exactly.
pattern_id = "dummy"
job_name = "dummy"

# 定数の設定

In [None]:
import os

# Capture the kernel's current working directory; BASE_DIR is derived from it.
current_directory = os.getcwd()
print(current_directory)

In [None]:
# Root of this job's working copy: <current working directory>/<job name>.
BASE_DIR = "/".join((current_directory, job_name))
print(BASE_DIR)

# azアカウントログイン->ワークスペースの定義、接続

In [None]:
# Workspace client and credential helper.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Identify the Azure Machine Learning workspace (fill these in before running).
subscription_id = ""
resource_group = ""
workspace = ""

# Build a client bound to that workspace.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

# CC（コンピューティングクラスター）の接続

In [None]:
from azure.ai.ml.entities import AmlCompute

# Name of the AML compute target to run on (the 4-GPU one); fill in before running.
gpu_compute_target = ""

# Look the target up in the workspace; the result is shown as the cell output.
ml_client.compute.get(name=gpu_compute_target)

# 実行ノードの定義(Dockerfile読込)

## 環境共通（APT,SFT,SFT_QLoRA）

In [None]:
from azure.ai.ml.entities import Environment, BuildContext

# Environment definition built from the build context under
# <BASE_DIR>/config/OSS; the training cells can reference it as `custom_env`.
custom_env = Environment(
    name="LLM-train-Env",
    description="Env to train SFT,QLoRA for OSS LLM",
    build=BuildContext(path=f"{BASE_DIR}/config/OSS"),
)

# 学習ジョブの設定

# SFT

## Full

In [None]:
train_type = "SFT"

# Location of the datastore that receives the trained model (fill in before running).
sub_id = ""
rg = ""
ws = ""
blob = ""

model_path = "elyza/ELYZA-japanese-Llama-2-7b-instruct"
output_path = f"azureml://subscriptions/{sub_id}/resourcegroups/{rg}/workspaces/{ws}/datastores/{blob}/paths/models/pattern_{pattern_id}/"

# Training / validation data (presumably relative to the job's code directory
# BASE_DIR -- confirm against the training script).
train_data_path = "data/train_data.csv"
valid_data_path = "data/valid_data.csv"

# On the first launch of the job, switch env to the commented-out assignment.
#env = custom_env
env = "LLM-train-Env:1"

# Alternative settings kept for reference:
#   --save_steps 500
#   --eval_steps 150
#   --output_dir './outputs'
#
# The output binding must reach Azure ML as ${{outputs.model_output}}.
# Inside an f-string every literal brace has to be doubled, hence the four
# braces on the --output_dir line; two braces would render as
# ${outputs.model_output}, which is not a valid binding expression.
cmd = f"accelerate launch train_script/train_SFT.py \
--model_name_or_path '{model_path}' \
--train_data_path '{train_data_path}' \
--valid_data_path '{valid_data_path}' \
--fp16 False \
--bf16 True \
--tf32 False \
--output_dir ${{{{outputs.model_output}}}} \
--num_train_epochs 3 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 2 \
--evaluation_strategy 'steps' \
--save_strategy 'steps' \
--save_steps 2500 \
--eval_steps 500 \
--save_total_limit 100 \
--learning_rate 1e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type 'cosine' \
--logging_steps 50 \
--fsdp 'shard_grad_op auto_wrap' \
--fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
--report_to 'mlflow' \
--ddp_timeout 7200"

In [None]:
from azure.ai.ml import command, Input, Output

# Legacy launch line kept for reference only (not executed):
# python3.10 -m torch.distributed.run --nproc_per_node=4 --master_port=12345 train.py --model_name_or_path ${{inputs.base_model}} --data_path ${{inputs.train_data}} --fp16 True --output_dir ./outputs --num_train_epochs 5 --model_max_length 512 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 3 --evaluation_strategy 'no' --save_strategy 'steps' --save_steps 50000 --save_total_limit 1 --learning_rate 1e-5 --weight_decay 0. --warmup_ratio 0.02 --lr_scheduler_type 'cosine' --logging_steps 1 --fsdp 'shard_grad_op auto_wrap' --tf32 True --report_to 'mlflow'

# Assemble the training job: BASE_DIR is passed as the job's code directory,
# `cmd` is the launch line, and the single output `model_output` points at
# output_path with a read-write mount.
command_job = command(
    command=cmd,  # change point: the launch line to run
    code=BASE_DIR,
    environment=env,
    compute=gpu_compute_target,
    timeout=180000,  # presumably seconds (= 50 hours) -- confirm in the SDK docs
    outputs=dict(
        model_output=Output(type="uri_folder", path=output_path, mode="rw_mount"),
    ),
)

## QLoRA

In [None]:
train_type = "SFT_QLoRA"

# Location of the datastore that receives the trained model (fill in before running).
sub_id = ""
rg = ""
ws = ""
blob = ""

model_path = "elyza/ELYZA-japanese-Llama-2-7b-instruct"
output_path = f"azureml://subscriptions/{sub_id}/resourcegroups/{rg}/workspaces/{ws}/datastores/{blob}/paths/models/pattern_{pattern_id}/"

# Training / validation data. Datastore URIs are an alternative form:
# train_data_path = "azureml://subscriptions/XXX/resourcegroups/YYY/workspaces/ZZZ/datastores/workspaceblobstore/paths/xxx"
# valid_data_path = "azureml://subscriptions/XXX/resourcegroups/YYY/workspaces/ZZZ/datastores/workspaceblobstore/paths/xxx"
train_data_path = "data/train_data.csv"
valid_data_path = "data/valid_data.csv"

# On the first launch of the job, switch env to the commented-out assignment.
#env = custom_env
env = "LLM-train-Env:1"


# Alternative settings kept for reference:
#   --eval_steps 150
#   --save_strategy 'steps'
#   --save_steps 500
#
# The output binding must reach Azure ML as ${{outputs.model_output}}.
# Inside an f-string every literal brace has to be doubled, hence the four
# braces on the --output_dir line; two braces would render as
# ${outputs.model_output}, which is not a valid binding expression.
# --save_strategy is passed once only (it used to be repeated with the same value).
cmd = f"accelerate launch train_script/train_SFT_QLoRA.py \
--model_name {model_path} \
--fp16 False \
--bf16 True \
--tf32 False \
--train_data_path {train_data_path} \
--valid_data_path {valid_data_path} \
--output_dir ${{{{outputs.model_output}}}} \
--num_train_epochs 3 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 2 \
--evaluation_strategy 'steps' \
--eval_steps 500 \
--save_strategy 'steps' \
--save_steps 2500 \
--save_total_limit 10 \
--learning_rate 1e-5 \
--group_by_length True \
--logging_strategy steps \
--logging_steps 50 \
--weight_decay 0.0 \
--warmup_ratio 0.03 \
--max_grad_norm 0.3 \
--lr_scheduler_type 'cosine' \
--gradient_accumulation_steps 1 \
--report_to 'mlflow'"

In [None]:
from azure.ai.ml import command, Input, Output

# Legacy launch line kept for reference only (not executed):
# python3.10 -m torch.distributed.run --nproc_per_node=4 --master_port=12345 train.py --model_name_or_path ${{inputs.base_model}} --data_path ${{inputs.train_data}} --fp16 True --output_dir ./outputs --num_train_epochs 5 --model_max_length 512 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 3 --evaluation_strategy 'no' --save_strategy 'steps' --save_steps 50000 --save_total_limit 1 --learning_rate 1e-5 --weight_decay 0. --warmup_ratio 0.02 --lr_scheduler_type 'cosine' --logging_steps 1 --fsdp 'shard_grad_op auto_wrap' --tf32 True --report_to 'mlflow'

# Where the trained model goes: output_path, attached with a read-write mount.
job_outputs = {
    "model_output": Output(type="uri_folder", path=output_path, mode="rw_mount"),
}

# Assemble the training job from the settings defined in the previous cell;
# BASE_DIR is passed as the job's code directory.
command_job = command(
    command=cmd,  # change point: the launch line to run
    code=BASE_DIR,
    environment=env,
    compute=gpu_compute_target,
    timeout=180000,  # presumably seconds (= 50 hours) -- confirm in the SDK docs
    outputs=job_outputs,
)

# ジョブ投入

In [None]:
# display_name is not among the keyword arguments documented for
# jobs.create_or_update, so it is set on the job object itself before
# submitting rather than passed to the call.
command_job.display_name = job_name + "_" + train_type

# Submit the job under the experiment named after the job.
returned_job = ml_client.jobs.create_or_update(
    command_job,
    experiment_name=job_name,
)
# URL for following the job's status (shown as the cell output).
returned_job.studio_url