In [2]:
# Remove once kubeflow-training SDK is upgraded to 1.9.0

In [3]:
pip install --upgrade kubeflow-training


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Dependencies for local inference

In [None]:
pip install --upgrade datasets s3fs transformers

In [26]:
def train_func():
    """Fine-tune Llama-3.2-1B-Instruct on GSM8K with FSDP and upload the result to S3.

    Intended to run once per training process of a distributed job: the global
    rank is read from the ``RANK`` environment variable and the upload
    destination from ``AWS_S3_BUCKET``. All imports are local to the function.

    Steps:
      1. Load the base model/tokenizer and render GSM8K question/answer pairs
         into chat-template prompts.
      2. On rank 0, print the untrained model's answer to the first test question.
      3. Train with ``SFTTrainer``.
      4. Save the full model to ``./saved-model`` (every rank takes part in the
         save call), then rank 0 uploads it to ``$AWS_S3_BUCKET/saved-model``.
      5. On rank 0, reload the saved model and print its answer to the same question.

    Raises:
        KeyError: if ``RANK`` (or, on rank 0, ``AWS_S3_BUCKET``) is not set.
    """
    import os

    import s3fs
    from datasets import load_dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
    from trl import SFTConfig, SFTTrainer

    rank = int(os.environ["RANK"])
    save_model_path = "./saved-model"

    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_name,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=model_name,
    )
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    def format_dataset(example):
        """Render one question/answer pair as a single chat-template string."""
        messages = [
            {"role": "user", "content": example['question']},
            {"role": "assistant", "content": example['answer']}
        ]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        return {"prompt": prompt}

    def infer_answer(model, question):
        """Return the model's generated reply to ``question`` via a text-generation pipeline."""
        messages = [
            {"role": "user", "content": question},
        ]
        # The chat messages are passed to the pipeline as-is; no prompt string is rendered by hand.
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0, temperature = 0.01)
        outputs = pipe(messages, max_new_tokens=256)
        return outputs[0]['generated_text'][-1]['content']

    def print_sample_answer(candidate_model):
        """Print the first test question, its reference answer and the model's answer."""
        question = dataset['test']['question'][0]
        generated = infer_answer(candidate_model, question)
        print(f"Query:\n{question}")
        print(f"Original Answer:\n{dataset['test']['answer'][0]}")
        print(f"Generated Answer:\n{generated}")

    dataset = load_dataset("openai/gsm8k", "main")
    train_data = dataset["train"].map(format_dataset, remove_columns=['question', 'answer'])
    eval_data = dataset["test"].map(format_dataset, remove_columns=['question', 'answer'])

    training_args = SFTConfig(
        dataset_text_field="prompt",
        max_seq_length=1024,
        output_dir="/tmp",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        learning_rate=5e-7,
        logging_dir="/logs",
        eval_strategy="epoch",
        save_strategy="no",
        fsdp="full_shard",
        fsdp_config={
            "fsdp_state_dict_type": "SHARDED_STATE_DICT",
            "fsdp_sharding_strategy": "FULL_SHARD",
        },
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=eval_data,
        tokenizer=tokenizer,
        args=training_args,
    )

    # Check model response for sample test data to see how untrained model responds
    if rank == 0:
        print_sample_answer(model)

    trainer.train()

    # Switch to a full (unsharded) state dict so a regular checkpoint is written.
    # https://github.com/huggingface/transformers/issues/30491
    if trainer.is_fsdp_enabled:
        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

    # Every rank must call save_model: under FSDP, assembling the full state dict
    # is a collective operation, so calling it from rank 0 alone can block forever.
    # The Trainer itself restricts the actual file writes to the main process.
    trainer.save_model(save_model_path)

    if rank == 0:
        # Store trained model on AWS.
        # The trailing slash on the source copies the directory *contents* into
        # s3_path, so a re-run overwrites the previous upload instead of nesting
        # a second "saved-model" directory inside the existing one.
        s3 = s3fs.S3FileSystem()
        s3_path = os.environ["AWS_S3_BUCKET"] + '/saved-model'
        s3.put(save_model_path + "/", s3_path, recursive=True)

        # Check model response for sample test data to see how trained model responds
        trained_model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=save_model_path,
        )
        print_sample_answer(trained_model)

In [27]:
from kubeflow.training import TrainingClient
from kubernetes import client
from kubernetes.client import (
    V1EnvVar,
    V1EnvVarSource,
    V1SecretKeySelector
)

job_name = "pytorch-fsdp"

# Must match the name of the AWS connection defined in the Data Science project
# in which this Workbench is running; it is used as the secret name below.
aws_connection_name = "workbench-aws"

# Option 1: authenticate explicitly with an API URL and a token that has all needed rights.
# On OpenShift, you can retrieve the token by running `oc whoami -t`,
# and the server with `oc cluster-info`.

# token = ""
# openshift_api_url = ""

# api_key = {"authorization": "Bearer " + token}
# config = client.Configuration(host=openshift_api_url, api_key=api_key)
# config.verify_ssl = False
# tc = TrainingClient(client_configuration=config)


# Option 2 (used here): grant the edit role to the user running this Notebook using oc CLI:
# oc adm policy add-role-to-user edit system:serviceaccount:<namespace>:<workbench name> -n <namespace>
tc = TrainingClient()

# Submit train_func as a PyTorchJob. Credentials are injected from secrets:
# HF_TOKEN from the "hf-token" secret, the AWS_* values from the connection secret.
tc.create_job(
    job_kind="PyTorchJob",
    name=job_name,
    train_func=train_func,
    num_workers=2,
    num_procs_per_worker="auto",
    resources_per_worker={"gpu": 2},
    base_image="quay.io/modh/training:py311-cuda121-torch241",
    env_vars=[
        V1EnvVar(
            name="HF_TOKEN",
            value_from=V1EnvVarSource(
                secret_key_ref=V1SecretKeySelector(key="HF_TOKEN", name="hf-token")
            ),
        ),
        V1EnvVar(name="NCCL_DEBUG", value="INFO"),
        # One env var per key of the AWS connection secret; the variable name equals the key.
        *(
            V1EnvVar(
                name=secret_key,
                value_from=V1EnvVarSource(
                    secret_key_ref=V1SecretKeySelector(key=secret_key, name=aws_connection_name)
                ),
            )
            for secret_key in (
                "AWS_ACCESS_KEY_ID",
                "AWS_S3_BUCKET",
                "AWS_S3_ENDPOINT",
                "AWS_SECRET_ACCESS_KEY",
            )
        ),
    ],
    packages_to_install=[
        "s3fs",
    ],
)

In [None]:
# Fetch the logs of the job named by job_name; follow=True is passed so the
# logs are followed while the job runs. The call returns a pair, of which only
# the first element is kept (the second is discarded).
logs, _ = tc.get_job_logs(job_name, follow=True)

[Pod pytorch-fsdp-master-0]: ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
[Pod pytorch-fsdp-master-0]: datasets 3.3.2 requires fsspec[http]<=2024.12.0,>=2023.1.0, but you have fsspec 2025.2.0 which is incompatible.
[Pod pytorch-fsdp-master-0]: W0225 11:56:28.146000 140310728419136 torch/distributed/run.py:779] 
[Pod pytorch-fsdp-master-0]: W0225 11:56:28.146000 140310728419136 torch/distributed/run.py:779] *****************************************
[Pod pytorch-fsdp-master-0]: W0225 11:56:28.146000 140310728419136 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
[Pod pytorch-fsdp-master-0]: W0225 11:56:28.146000 140310728419136 torch/distributed/run.py:779] **************

In [22]:
# Delete the training job identified by job_name.
tc.delete_job(name=job_name)

In [23]:
import s3fs
import os

# Download trained model into local filesystem.
# The trailing slash on the source makes the *contents* of the remote directory
# land directly in ./saved-model, so re-running this cell overwrites the files in
# place instead of creating a nested ./saved-model/saved-model directory.
# The trailing semicolon hides the list of per-file return values.
s3 = s3fs.S3FileSystem()
s3_path = os.environ["AWS_S3_BUCKET"] + '/saved-model'
s3.get(s3_path + "/", "./saved-model", recursive=True);

[None, None, None, None, None, None, None, None, None, None]

In [24]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    pipeline,
)
from datasets import load_dataset

# Restore the fine-tuned model from the local checkpoint directory and move it to the GPU.
model = AutoModelForCausalLM.from_pretrained("./saved-model").to("cuda")

# The tokenizer comes from the same directory; padding reuses the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained("./saved-model")
tokenizer.pad_token = tokenizer.eos_token

# GSM8K ("main" configuration) supplies the sample question used for the check below.
dataset = load_dataset("openai/gsm8k", "main")

def infer_answer(model, question):
    """Generate the model's reply to a single user question.

    Builds a text-generation pipeline on device 0 from ``model`` and the
    notebook-level ``tokenizer``, sends ``question`` as a one-message chat and
    returns the content of the last message of the generated conversation.

    Args:
        model: the causal language model to query.
        question: the user question text.

    Returns:
        The ``content`` field of the final generated message.
    """
    messages = [
        {"role": "user", "content": question},
    ]
    # The chat messages are passed to the pipeline as-is; no prompt string is rendered by hand.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0, temperature = 0.01)
    outputs = pipe(messages, max_new_tokens=256)
    return outputs[0]['generated_text'][-1]['content']

# Ask the fine-tuned model the first test question and show it next to the reference answer.
test_prompt_answer = infer_answer(model, dataset['test']['question'][0])
print(
    f"Query:\n{dataset['test']['question'][0]}",
    f"Original Answer:\n{dataset['test']['answer'][0]}",
    f"Generated Answer:\n{test_prompt_answer}",
    sep="\n",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Query:
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Original Answer:
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
Generated Answer:
She eats 3 eggs for breakfast every morning, so she eats 3 eggs/day * 16 eggs/day = <<3*16=48>>48 eggs/day
She bakes muffins for her friends every day with 4 muffins, so she bakes 4 muffins/day * 16 muffins/day = <<4*16=64>>64 muffins/day
She sells the remainder at the farmers' market daily for $2 per fresh duck egg, so she sells 64 muffins/day - 48 eggs/day = <<64-48=16>>16 muffins/day
She sells 16 muffins/day at the farmers' market for $2 per muffin, so she makes 16 muffins/day * $2/muffin = $<<16*2=32>>32/day
#### 32


In [25]:
# Unload the model from GPU memory
import gc
import torch

# Drop the notebook-level reference to the model first, so the collection below can reclaim it.
del model

# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()