In [4]:
# Install the yamlmagic IPython extension with uv, then load it so that the
# `%%yaml` cell magic used further down is available in this kernel.
# NOTE(review): `uv pip install` installs into whichever environment uv
# discovers — presumably the one backing this kernel; confirm.
!uv pip install yamlmagic
%load_ext yamlmagic

[2mAudited [1m1 package[0m [2min 3ms[0m[0m


In [1]:
from kubernetes import client, config, utils

# Build a Kubernetes API client from the local kubeconfig. The settings are
# loaded into an explicit Configuration object (passed as
# `client_configuration`), and that same object is then handed to ApiClient.
configuration = client.Configuration()
config.load_kube_config(client_configuration=configuration)
k8s_client = client.ApiClient(configuration)

In [8]:
%%yaml output_pvc

# PersistentVolumeClaim manifest named "output". The parsed document is made
# available as the `output_pvc` variable named on the magic line above.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    # NOTE(review): presumably makes the claim visible in the Open Data Hub
    # dashboard - confirm.
    opendatahub.io/dashboard: "true"
  name: output
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      # Requested capacity for the claim
      storage: 100Gi
  # Storage class that must provision the volume
  storageClassName: nfs-csi

<IPython.core.display.Javascript object>

In [None]:
# Submit the PVC manifest defined above to the cluster, in the "llama-stack"
# namespace, with apply=True.
utils.create_from_dict(
    k8s_client,
    data=output_pvc,
    namespace="llama-stack",
    apply=True,
)

In [None]:
from llama_stack_client.types.post_training_supervised_fine_tune_params import (
    TrainingConfig,
    TrainingConfigDataConfig,
    TrainingConfigEfficiencyConfig,
    TrainingConfigOptimizerConfig,
)
from llama_stack_client.types.algorithm_config_param import LoraFinetuningConfig
from rich.pretty import pprint

In [None]:
# Linear layers in each self-attention block that LoRA is applied to.
# Valid entries are "q_proj", "k_proj", "v_proj" and "output_proj".
lora_target_modules = ["q_proj", "v_proj", "output_proj"]

algorithm_config = LoraFinetuningConfig(
    type="LoRA",
    lora_attn_modules=lora_target_modules,
    apply_lora_to_mlp=True,      # also apply LoRA to the MLP of each transformer layer (default: False)
    apply_lora_to_output=False,  # do not apply LoRA to the model's final output projection (default: False)
    rank=8,                      # rank of each low-rank approximation
    alpha=16,                    # scaling factor for the low-rank approximation
)

In [None]:
# Identifier of the registered dataset. This example uses the same dataset for
# fine-tuning and for validation. Call lls_client.datasets.list() (the Llama
# Stack client created further down) to see every available dataset.
post_training_dataset_id = "post_training_dataset"

data_config = TrainingConfigDataConfig(
    # Dataset the model is fine-tuned on
    dataset_id=post_training_dataset_id,
    # Dataset used to evaluate the fine-tuned model on validation_loss and
    # perplexity; leave it out if you do not want to run validation
    validation_dataset_id=post_training_dataset_id,
    # Batch size of the training data
    batch_size=8,
    # Whether the dataset is shuffled
    shuffle=False,
    # Dataset format, one of "instruct" or "dialog"; switch to "dialog" when
    # the dataset uses the dialog format
    data_format="instruct",
)

In [None]:
# Optimizer settings for the fine-tuning run.
optimizer_config = TrainingConfigOptimizerConfig(
    # Only "adamw" is currently supported
    optimizer_type="adamw",
    # Learning rate
    lr=3e-4,
    # AdamW weight-decay coefficient
    weight_decay=0.1,
    # Number of warm-up steps for the learning-rate scheduler
    num_warmup_steps=10,
)

In [None]:
# Memory-efficiency settings for the fine-tuning run.
efficiency_config = TrainingConfigEfficiencyConfig(
    # Reduce memory use by recalculating some intermediate activations during
    # the backward pass
    enable_activation_checkpointing=True,
    # A second memory-efficiency flag, enable_activation_offloading, moves
    # certain activations from GPU memory to CPU memory. It further reduces
    # GPU memory usage at the cost of additional data-transfer overhead and
    # possible slowdowns. It is left unset here; uncomment to set it:
    # enable_activation_offloading=False,
)

In [None]:
# Top-level training settings; bundles the data, efficiency and optimizer
# configs built in the preceding cells.
training_config = TrainingConfig(
    # Number of training epochs
    n_epochs=1,
    data_config=data_config,
    efficiency_config=efficiency_config,
    optimizer_config=optimizer_config,
    # Maximum number of training steps per epoch
    max_steps_per_epoch=10000,
    # Maximum number of validation steps
    max_validation_steps=10,
    # Number of steps over which gradients are accumulated before the model
    # parameters are updated; simulates large-batch training when memory is
    # limited
    gradient_accumulation_steps=4,
)

In [None]:
import os

from llama_stack_client import LlamaStackClient

# Base URL of the Llama Stack server this notebook talks to.
# 0.0.0.0 is the "unspecified"/bind-to-all-interfaces address, not a portable
# destination: connecting to it fails on some platforms (e.g. Windows). The
# loopback address reaches the same locally exposed server everywhere.
# Set the LLAMA_STACK_BASE_URL environment variable to point the notebook at a
# server running somewhere else.
LLAMA_STACK_BASE_URL = os.environ.get("LLAMA_STACK_BASE_URL", "http://127.0.0.1:8321")

# Client used for all Llama Stack API calls below; no provider data is sent.
lls_client = LlamaStackClient(base_url=LLAMA_STACK_BASE_URL, provider_data={})

In [None]:
# Submit a supervised fine-tuning job through the post-training API and
# pretty-print the object the call returns.
job_id = "fine-tune-llm"
base_model = "meta-llama/Llama-3.2-3B-Instruct"  # base Llama model to fine-tune

training_job = lls_client.post_training.supervised_fine_tune(
    job_uuid=job_id,
    model=base_model,
    algorithm_config=algorithm_config,
    training_config=training_config,
    # Directory of the base model checkpoint. By default the implementation
    # looks at ~/.llama/checkpoints/<model>.
    # NOTE(review): the literal string "null" (not None) is passed here —
    # presumably a sentinel that selects that default; confirm with the
    # post-training provider.
    checkpoint_dir="null",
    # logger_config and hyperparam_search_config are not supported yet
    logger_config={},
    hyperparam_search_config={},
)

pprint(training_job)