# Setup

In [None]:
# Install the YAML magic
!pip install yamlmagic
%load_ext yamlmagic

# Training Configuration

Edit the following training parameters:

In [None]:
%%yaml parameters

# Model
model_name_or_path: Meta-Llama/Meta-Llama-3.1-8B-Instruct  # id or path of the model to fine-tune
model_revision: main                      # model revision to load
torch_dtype: bfloat16                     # dtype passed to the model when it is loaded
attn_implementation: flash_attention_2    # one of eager (default), sdpa or flash_attention_2
use_liger: false                          # use Liger kernels

# PEFT / LoRA
use_peft: true                            # use PEFT (LoRA) for training
lora_r: 16                                # LoRA rank
lora_alpha: 8                             # LoRA alpha
lora_dropout: 0.05                        # LoRA dropout
lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # LoRA target modules
lora_modules_to_save: []                  # e.g. "lm_head" and "embed_tokens" when using the Llama 3 instruct chat template

# QLoRA (BitsAndBytes)
load_in_4bit: false                       # use 4 bit precision for the base model (only with LoRA)
load_in_8bit: false                       # use 8 bit precision for the base model (only with LoRA)

# Dataset
dataset_name: gsm8k                       # id or path to the dataset
dataset_config: main                      # name of the dataset configuration
dataset_train_split: train                # dataset split to use for training
dataset_test_split: test                  # dataset split to use for evaluation
dataset_text_field: text                  # name of the text field of the dataset
dataset_kwargs:
  add_special_tokens: false               # template with special tokens
  append_concat_token: false              # add additional separator token

# SFT
max_seq_length: 1024                      # max sequence length for model and packing of the dataset
dataset_batch_size: 1000                  # samples to tokenize per batch
packing: false                            # whether to pack the dataset samples (see max_seq_length)

# Training
num_train_epochs: 10                      # number of training epochs

per_device_train_batch_size: 32           # batch size per device during training
per_device_eval_batch_size: 32            # batch size for evaluation
auto_find_batch_size: false               # find a batch size that fits into memory automatically
eval_strategy: epoch                      # evaluate every epoch

bf16: true                                # use bf16 16-bit (mixed) precision
tf32: false                               # use tf32 precision

learning_rate: 2.0e-4                     # initial learning rate
warmup_steps: 10                          # steps for a linear warmup from 0 to `learning_rate`
lr_scheduler_type: inverse_sqrt           # learning rate scheduler (see transformers.SchedulerType)
# Alternative scheduler, un-comment to use it instead of the one above:
# lr_scheduler_type: reduce_lr_on_plateau
# lr_scheduler_kwargs:
  # patience: 1
  # factor: 0.2
# metric_for_best_model: eval_loss

optim: adamw_torch_fused                  # optimizer (see transformers.OptimizerNames)
max_grad_norm: 1.0                        # max gradient norm
seed: 42                                  # random seed, applied with set_seed() in the training function

gradient_accumulation_steps: 1            # number of steps before performing a backward/update pass
gradient_checkpointing: false             # use gradient checkpointing to save memory
gradient_checkpointing_kwargs:
  use_reentrant: false

# FSDP
fsdp: "full_shard auto_wrap"              # add offload if not enough GPU memory
fsdp_config:
  activation_checkpointing: true          # when true, the training function also disables use_cache
  cpu_ram_efficient_loading: false
  sync_module_states: true
  use_orig_params: true
  limit_all_gathers: false

# Checkpointing
save_strategy: epoch                      # save checkpoint every epoch
save_total_limit: 1                       # limit the total amount of checkpoints
resume_from_checkpoint: false             # load the last checkpoint in output_dir and resume from it

# Logging
log_level: warning                        # logging level (see transformers.logging)
logging_strategy: steps                   # log every `logging_steps` steps
logging_steps: 1                          # log every N steps
report_to:
- tensorboard                             # report metrics to tensorboard

# Directory the checkpoints and the final model are written to
output_dir: /mnt/shared/Meta-Llama-3.1-8B-Instruct

# Training Loop

Review the training function. You can adjust the chat template if needed depending on the model you want to fine-tune:

In [None]:
def main(parameters):
    """Fine-tune a causal language model with TRL's ``SFTTrainer``.

    The function parses ``parameters``, loads the tokenizer and the datasets,
    renders each sample with the tokenizer chat template into a ``text`` field,
    trains the model and saves it to ``training_args.output_dir``.

    All imports are local to the function. It is passed as ``train_func`` to the
    training job, so it presumably has to be self-contained; keep imports inside.

    Args:
        parameters: dictionary of training parameters, parsed by ``TrlParser``
            into ``ScriptArguments``, ``SFTConfig`` and ``ModelConfig``.
    """
    import random

    from datasets import load_dataset
    from transformers import (
        AutoTokenizer,
        set_seed,
    )

    from trl import (
        ModelConfig,
        ScriptArguments,
        SFTConfig,
        SFTTrainer,
        TrlParser,
        get_peft_config,
        get_quantization_config,
        get_kbit_device_map,
    )

    parser = TrlParser((ScriptArguments, SFTConfig, ModelConfig))
    script_args, training_args, model_args = parser.parse_dict(parameters)

    # Set seed for reproducibility
    set_seed(training_args.seed)

    # Model and tokenizer
    quantization_config = get_quantization_config(model_args)
    # The KV cache is disabled whenever gradient or activation checkpointing is enabled
    checkpointing_enabled = (
        training_args.gradient_checkpointing
        or training_args.fsdp_config.get("activation_checkpointing", False)
    )
    model_kwargs = dict(
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
        attn_implementation=model_args.attn_implementation,
        torch_dtype=model_args.torch_dtype,
        use_cache=not checkpointing_enabled,
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
    )
    # The model is instantiated by the trainer from its name / path with these arguments
    training_args.model_init_kwargs = model_kwargs
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, use_fast=True
    )
    if tokenizer.pad_token is None:
        # Models like Llama 3 use a dedicated padding token.
        # Check the vocabulary directly: `convert_tokens_to_ids` returns the unknown token id
        # (not None) for a missing token when the tokenizer defines an unknown token, which
        # would wrongly select a padding token the model does not have.
        right_pad_token = '<|finetune_right_pad_id|>'
        if right_pad_token in tokenizer.get_vocab():
            tokenizer.pad_token = right_pad_token
        else:
            tokenizer.pad_token = tokenizer.eos_token

    # Chat template
    # You may need to provide your own chat template if the model does not have a default one
    # or if you want to customize it
    # Llama 3 instruct template, make sure to add "lm_head" and "embed_tokens" layers to lora_modules_to_save
    # LLAMA_3_CHAT_TEMPLATE="{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
    # Anthropic/Vicuna like template without the need for special tokens
    # LLAMA_3_CHAT_TEMPLATE = (
    #     "{% for message in messages %}"
    #     "{% if message['role'] == 'system' %}"
    #     "{{ message['content'] }}"
    #     "{% elif message['role'] == 'user' %}"
    #     "{{ '\n\nHuman: ' + message['content'] +  eos_token }}"
    #     "{% elif message['role'] == 'assistant' %}"
    #     "{{ '\n\nAssistant: '  + message['content'] +  eos_token  }}"
    #     "{% endif %}"
    #     "{% endfor %}"
    #     "{% if add_generation_prompt %}"
    #     "{{ '\n\nAssistant: ' }}"
    #     "{% endif %}"
    # )
    # tokenizer.chat_template = LLAMA_3_CHAT_TEMPLATE

    # Datasets
    train_dataset = load_dataset(
        path=script_args.dataset_name,
        name=script_args.dataset_config,
        split=script_args.dataset_train_split,
    )
    # The evaluation split is only loaded when evaluation is enabled
    test_dataset = None
    if training_args.eval_strategy != "no":
        test_dataset = load_dataset(
            path=script_args.dataset_name,
            name=script_args.dataset_config,
            split=script_args.dataset_test_split,
        )

    # Templatize datasets
    # You may need to adjust the mapping between columns and the chat template
    def template_dataset(sample):
        """Render a question / answer sample as a single chat-formatted ``text`` field."""
        # For datasets that already provide chat messages:
        # return {"text": tokenizer.apply_chat_template(sample["messages"], tokenize=False)}
        messages = [
            {"role": "user", "content": sample['question']},
            {"role": "assistant", "content": sample['answer']},
        ]
        return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}

    train_dataset = train_dataset.map(template_dataset, remove_columns=["question", "answer"])
    if training_args.eval_strategy != "no":
        # test_dataset = test_dataset.map(template_dataset, remove_columns=["messages"])
        test_dataset = test_dataset.map(template_dataset, remove_columns=["question", "answer"])

    # Check random samples
    with training_args.main_process_first(
        desc="Log few samples from the training set"
    ):
        for index in random.sample(range(len(train_dataset)), 2):
            print(train_dataset[index]["text"])

    # Training
    trainer = SFTTrainer(
        model=model_args.model_name_or_path,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        peft_config=get_peft_config(model_args),
        processing_class=tokenizer,
    )

    # Only models exposing `print_trainable_parameters` (e.g. PEFT models) report their parameters
    if trainer.accelerator.is_main_process and hasattr(trainer.model, "print_trainable_parameters"):
        trainer.model.print_trainable_parameters()

    # `resume_from_checkpoint` is forwarded as configured (None when unset)
    trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)

    trainer.save_model(training_args.output_dir)

    with training_args.main_process_first(desc="Training completed"):
        print(f"Training completed, model checkpoint written to {training_args.output_dir}")

# Training Client

Configure the SDK client by providing the API server URL and the authentication token:

In [None]:
# The Kubernetes client module is imported under an alias so that `client`
# only ever refers to the training client created at the end of this cell.
from kubernetes import client as k8s_client
from kubeflow.training import TrainingClient
from kubeflow.training.models import V1Volume, V1VolumeMount, V1PersistentVolumeClaimVolumeSource

# Replace the placeholders with the API server URL and your authentication token
api_server = "<API_SERVER>"
token = "<TOKEN>"

configuration = k8s_client.Configuration()
configuration.host = api_server
configuration.api_key = {"authorization": f"Bearer {token}"}
# Un-comment if your cluster API server uses a self-signed certificate or an un-trusted CA
#configuration.verify_ssl = False
api_client = k8s_client.ApiClient(configuration)

# Training client used by the following cells to create, follow and delete the job
client = TrainingClient(client_configuration=api_client.configuration)

# Training Job

You're now almost ready to create the training job:
* Fill the `HF_TOKEN` environment variable with your Hugging Face token if you fine-tune a gated model
* Check the number of worker nodes
* Amend the resources per worker according to the job requirements
* If you use AMD accelerators:
  * Change `nvidia.com/gpu` to `amd.com/gpu` in `resources_per_worker`
  * Change `base_image` to `quay.io/modh/training:py311-rocm62-torch251`
* Update the PVC name to the one you've attached to the workbench if needed

In [None]:
# Shared persistent volume, mounted in every worker at /mnt/shared
shared_volume = V1Volume(
    name="shared",
    persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(claim_name="shared"),
)
shared_volume_mount = V1VolumeMount(name="shared", mount_path="/mnt/shared")

# Resources requested for each worker
worker_resources = {
    "nvidia.com/gpu": 1,
    "memory": "64Gi",
    "cpu": 4,
}

# Environment variables set in the worker containers
worker_env_vars = {
    # HuggingFace
    "HF_HOME": "/mnt/shared/.cache",
    "HF_TOKEN": "",
    # CUDA / ROCm (HIP)
    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    "PYTORCH_HIP_ALLOC_CONF": "expandable_segments:True",
    # NCCL / RCCL
    "NCCL_DEBUG": "INFO",
}

# Create the PyTorchJob that runs the `main` training function with the YAML parameters
client.create_job(
    job_kind="PyTorchJob",
    name="sft",
    train_func=main,
    parameters=parameters,
    num_workers=8,
    num_procs_per_worker="1",
    resources_per_worker=worker_resources,
    base_image="quay.io/modh/training:py311-cuda124-torch251",
    env_vars=worker_env_vars,
    volumes=[shared_volume],
    volume_mounts=[shared_volume_mount],
)

Once the training job is created, you can follow its progress:

In [None]:
# Follow the logs of the "sft" PyTorchJob created above
client.get_job_logs(name="sft", job_kind="PyTorchJob", follow=True)

# TensorBoard

You can track your job runs and visualize the training metrics with TensorBoard:

In [None]:
import os

# Build the TensorBoard proxy URL from the notebook prefix and port 6006
# (presumably the port TensorBoard is served on; confirm for your workbench)
os.environ["TENSORBOARD_PROXY_URL"] = f'{os.environ["NB_PREFIX"]}/proxy/6006/'

In [None]:
# Load the TensorBoard notebook extension that provides the `%tensorboard` magic
%load_ext tensorboard

In [None]:
# Start TensorBoard on the shared directory
# (presumably the workbench mount of the volume the job mounts at /mnt/shared; adjust if needed)
%tensorboard --logdir /opt/app-root/src/shared

# Testing

## Testing the Pre-Trained Model

If you've configured the workbench with an NVIDIA GPU or AMD accelerator, you can run inference to validate the output generated by the fine-tuned model and compare it to the output of the pre-trained model.

In [None]:
# Install / upgrade dependencies
!pip install --upgrade transformers peft

In [None]:
import torch
import json
import transformers

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import LoraConfig, PeftModel
from IPython.display import display, Markdown

Check / update the paths to the pre-trained and fine-tuned model checkpoints prior to executing the cells below. 

In [None]:
# Load the pre-trained model
# Local snapshot of the model in the Hugging Face cache, update it to match your environment
pretrained_path = "/opt/app-root/src/shared/.cache/hub/models--Meta-Llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/"

# Load the weights in bfloat16 from local files only
base_model = AutoModelForCausalLM.from_pretrained(pretrained_path, local_files_only=True, torch_dtype=torch.bfloat16)
# Move the model to the GPU / accelerator
base_model = base_model.to("cuda")

In [None]:
# Configure the tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_path)

# Fall back to the end-of-sequence token for padding when no padding token is defined,
# first for the tokenizer, then for the model configuration
for target in (tokenizer, base_model.config):
    if target.pad_token_id is None:
        target.pad_token_id = target.eos_token_id

In [None]:
# Test the pre-trained model
# Sample question sent to the model as a single user message
messages = [
    {
        "role": "user",
        "content": "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
    }
]

# Text generation pipeline backed by the pre-trained model
pipeline = transformers.pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)
outputs = pipeline(messages, max_new_tokens=256, temperature=0.01)

# Render each message of the generated conversation(s) as a Markdown section titled by its role
output1 = "".join(
    f"# {item['role']}\n\n{item['content']}\n\n"
    for turn in outputs
    for item in turn["generated_text"]
)

display(Markdown(output1))

## Merging the LoRA adapters

If you've configured the training to use LoRA, then you can merge the fine-tuned LoRA adapters / layers into the pre-trained model.

Check / update the path to the fine-tuned model checkpoint prior to executing the cell below. 

In [None]:
# Merge the fine-tuned adapters into the base model
# Checkpoint directory written by the training job, update it to match your run
finetuned_path = "/opt/app-root/src/shared/Meta-Llama-3.1-8B-Instruct/checkpoint-300/"

# Load the adapters on top of the base model, then merge them and drop the PEFT wrapper
model = PeftModel.from_pretrained(base_model, finetuned_path).merge_and_unload()

## Testing the Fine-Tuned Model

In [None]:
# Test the fine-tuned model
# Use the merged `model` explicitly rather than `base_model`, so this cell fails
# instead of silently re-testing the pre-trained model when the merge cell was not run.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Same question as for the pre-trained model, so that both outputs can be compared
messages = [
    {
        "role": "user",
        "content": "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
    }
]

outputs = pipeline(messages, max_new_tokens=256, temperature = 0.01)

# Render each message of the generated conversation(s) as a Markdown section titled by its role
output2 = ""
for turn in outputs:
    for item in turn["generated_text"]:
        output2 += f"# {item['role']}\n\n{item['content']}\n\n"

display(Markdown(output2))

# Cleaning Up

## Training Job

Once you're done or want to re-create the training job, you can delete the existing one:

In [None]:
# Delete the training job named "sft" created earlier in this notebook
client.delete_job(name="sft")

## GPU Memory

If you want to start over and test the pre-trained model again, you can free the GPU / accelerator memory with:

In [None]:
# Unload the model from GPU memory
import gc

# The text-generation pipeline keeps its own reference to the model, so it must be
# dropped as well, otherwise the weights stay reachable and the memory is not freed.
del base_model, model, pipeline

# Collect the now unreachable objects, then release the cached GPU memory
gc.collect()
torch.cuda.empty_cache()