In [17]:
import subprocess, json

NS = "ray-finetune-llm-deepspeed002"

def run(cmd):
    """Run `cmd` (an argv list) and return its stdout as stripped text.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True)
    return completed.stdout.strip()

# Report which identity the oc CLI is currently authenticated as.
print("=== oc whoami ===")
current_user = run(["oc", "whoami"])
print(current_user)

# Fetch the RayCluster named "ray" in NS and report whether its head pod
# tolerates the nvidia.com/gpu NoSchedule taint. Any failure in this section
# (lookup, JSON parsing, missing keys) is reported instead of raised.
print("\n=== RayCluster status ===")
try:
    raycluster_json = run(["oc", "get", "raycluster", "ray", "-n", NS, "-o", "json"])
    data = json.loads(raycluster_json)
    head_pod_spec = data["spec"]["headGroupSpec"]["template"]["spec"]
    head_tols = head_pod_spec.get("tolerations", [])

    has_gpu_tol = False
    for toleration in head_tols:
        if toleration.get("key") != "nvidia.com/gpu":
            continue
        if toleration.get("effect") == "NoSchedule":
            has_gpu_tol = True
            break

    print("RayCluster found. GPU toleration on head:", has_gpu_tol)
except Exception as e:
    print("❌ RayCluster not ready. Please contact your instructor.")
    print(e)


=== oc whoami ===
system:serviceaccount:ray-finetune-llm-deepspeed002:notebook

=== RayCluster status ===
RayCluster found. GPU toleration on head: False


In [18]:
import subprocess
from codeflare_sdk import TokenAuthentication

def _oc_whoami(flag):
    """Return the stripped stdout of `oc whoami <flag>`."""
    return subprocess.check_output(["oc", "whoami", flag]).decode().strip()


# Reuse the credentials and API endpoint the oc CLI is already configured with.
token = _oc_whoami("-t")
server = _oc_whoami("--show-server=true")

# skip_tls=True: set True if your cluster TLS is self-signed and noisy
auth = TokenAuthentication(token=token, server=server, skip_tls=True)
auth.login()
print("Authenticated to OpenShift API:", server)


Authenticated to OpenShift API: https://172.30.0.1:443


In [20]:
from ray.job_submission import JobSubmissionClient

NS = "ray-finetune-llm-deepspeed002"

# In-cluster DNS name of the Ray head service, port 8265.
ray_head_host = f"ray-head-svc.{NS}.svc.cluster.local"
ray_head_addr = f"http://{ray_head_host}:8265"

client = JobSubmissionClient(ray_head_addr)
print("Connected to Ray dashboard at:", ray_head_addr)


Connected to Ray dashboard at: http://ray-head-svc.ray-finetune-llm-deepspeed002.svc.cluster.local:8265


In [5]:
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication
import os
import sys

In [6]:
import sys

# Install exact, pinned versions of numpy, pyarrow and datasets using the
# interpreter that runs this kernel (sys.executable), bypassing pip's cache.
!{sys.executable} -m pip install --upgrade --no-cache-dir \
    "numpy==1.26.4" \
    "pyarrow==15.0.2" \
    "datasets==2.18.0"


Collecting pyarrow==15.0.2
  Downloading pyarrow-15.0.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting datasets==2.18.0
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Downloading pyarrow-15.0.2-cp311-cp311-manylinux_2_28_x86_64.whl (38.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m194.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading datasets-2.18.0-py3-none-any.whl (510 kB)
Installing collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 22.0.0
    Uninstalling pyarrow-22.0.0:
      Successfully uninstalled pyarrow-22.0.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.4.1
    Uninstalling datasets-4.4.1:
      Successfully uninstalled datasets-4.4.1
Successfully installed datasets-2.18.0 pyarrow-15.0.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24

In [None]:
# Create the training and evaluation datasets.
# This can be run only once.

!{sys.executable} -m pip install --upgrade --no-cache-dir "datasets>=2.18.0" "pyarrow>=12.0.0"
from mlforeng.llm_finetune import create_dataset
# import create_dataset
create_dataset.gsm8k_qa_no_tokens_template()

Collecting datasets>=2.18.0
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=12.0.0
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m157.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 15.0.2
    Uninstalling pyarrow-15.0.2:
      Successfully uninstalled pyarrow-15.0.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.18.0
    Uninstalling datasets-2.18.0:
      Successfully uninstalled datasets-2.18.0
Successfully installed datasets-4.4.1 pyarrow-22.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24

In [8]:
import subprocess
from codeflare_sdk import TokenAuthentication

# Ask the oc CLI for your current token and API server
oc_whoami = ["oc", "whoami"]
token = subprocess.check_output([*oc_whoami, "-t"]).decode().strip()
server = subprocess.check_output([*oc_whoami, "--show-server=true"]).decode().strip()

print("Using server:", server)

# skip_tls=True avoids SSL errors on clusters with self-signed certificates;
# switch it to False when the cluster's certificates are fully trusted.
auth = TokenAuthentication(token=token, server=server, skip_tls=True)
auth.login()


Using server: https://172.30.0.1:443


'Logged into https://172.30.0.1:443'

In [9]:
from codeflare_sdk import Cluster, ClusterConfiguration

# Sizing applied identically to the head pod and to every worker pod.
POD_CPU = 4
POD_MEMORY_GIB = 24   # GiB

cluster_cfg = ClusterConfiguration(
    name="ray",
    namespace="ray-finetune-llm-deepspeed002",   # your DS project namespace
    image="quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26",

    # Small, schedulable cluster: head + 2 workers = 3 GPUs total
    num_workers=2,

    # Head pod: requests equal limits, plus 1 GPU
    head_cpu_requests=POD_CPU,
    head_cpu_limits=POD_CPU,
    head_memory_requests=POD_MEMORY_GIB,
    head_memory_limits=POD_MEMORY_GIB,
    head_extended_resource_requests={"nvidia.com/gpu": 1},

    # Worker pods: same sizing, 1 GPU each
    worker_cpu_requests=POD_CPU,
    worker_cpu_limits=POD_CPU,
    worker_memory_requests=POD_MEMORY_GIB,
    worker_memory_limits=POD_MEMORY_GIB,
    worker_extended_resource_requests={"nvidia.com/gpu": 1},

    # REQUIRED by the Kueue validating policy
    labels={"kueue.x-k8s.io/queue-name": "local-queue-ray"},
)

# Build the cluster object; leaving it as the last expression renders it.
cluster = Cluster(cluster_cfg)
cluster


Yaml resources loaded for ray


VBox(children=(HBox(children=(Button(description='Cluster Up', icon='play', style=ButtonStyle(), tooltip='Crea…

Output()

<codeflare_sdk.ray.cluster.cluster.Cluster at 0x7fd9cf88e9d0>

In [11]:
# Create the Ray cluster described by `cluster`.
# The SDK's own success message recommends deleting the cluster when it is no
# longer in use; the final cell (`cluster.down()`) is there for that purpose.
cluster.apply()

Ray Cluster: 'ray' has successfully been applied. For optimal resource management, you should delete this Ray Cluster when no longer in use.


In [None]:
# Display details of the Ray cluster (no output was recorded for this cell;
# presumably this reports its current status and resources - TODO confirm).
cluster.details()

In [22]:
from ray.job_submission import JobSubmissionClient

# Use cluster config (you already have this from earlier cells)
namespace = cluster.config.namespace
cluster_name = cluster.config.name

# Construct the in-cluster dashboard URL from the config. KubeRay names the
# head service "<cluster name>-head-svc", so deriving the host from
# `cluster_name` keeps the URL correct if the cluster is renamed. For
# name="ray" this yields the same "ray-head-svc" host as before.
ray_url = f"http://{cluster_name}-head-svc.{namespace}.svc.cluster.local:8265"

print(f"Cluster: {cluster_name}")
print(f"Namespace: {namespace}")
print(f"Ray URL: {ray_url}")

# Create client
client = JobSubmissionClient(ray_url)
print("✓ Client connected!")

# Verify the connection by listing the jobs already known to the cluster
jobs = client.list_jobs()
print(f"✓ Found {len(jobs)} existing jobs")

Cluster: ray
Namespace: ray-finetune-llm-deepspeed002
Ray URL: http://ray-head-svc.ray-finetune-llm-deepspeed002.svc.cluster.local:8265
✓ Client connected!
✓ Found 0 existing jobs


In [None]:
# Storage configuration
# Base directory used for job output and the Hugging Face cache
# (later cells build paths under it).
storage_path = '/opt/app-root/src'

# The S3 bucket in which to store checkpoints.
# NOTE: the S3 logic below is commented out, so this value is not read by any
# code visible in this notebook; everything is written under `storage_path`.
s3_bucket = ''  # Empty string for local storage

# S3 logic intentionally disabled to keep things simple. When enabled, it
# falls back to the AWS_S3_BUCKET environment variable from the configured
# data connection and switches `storage_path` to the bucket.
# if not s3_bucket:
#     s3_bucket = os.environ.get('AWS_S3_BUCKET')
# if s3_bucket:
#     storage_path = f's3://{s3_bucket}'

print(f"Using local storage: {storage_path}")

In [None]:
# Submit Ray job

# Command-line pieces for the training script. Each piece is followed by a
# single space when the entrypoint string is assembled.
entrypoint_parts = [
    "python ray_finetune_llm_deepspeed.py",
    "--model-name=meta-llama/Meta-Llama-3.1-8B",
    "--lora",
    "--num-devices=2",
    "--num-epochs=1",
    "--max-steps=5",
    "--ds-config=./deepspeed_configs/zero_3_offload_optim_param.json",
    f"--storage-path={storage_path}/ray_finetune_llm_deepspeed/",
    "--batch-size-per-device=1",
    "--eval-batch-size-per-device=1",
]
job_entrypoint = "".join(f"{part} " for part in entrypoint_parts)

job_env_vars = {
    # Set the following variables if using AWS S3 as storage
    # 'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID'),
    # 'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY'),
    # 'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION'),
    'HF_HOME': f'{storage_path}/.cache',
}

# The working directory is shipped with the job, minus docs, notebooks and
# markdown files; dependencies come from requirements.txt.
job_runtime_env = {
    "env_vars": job_env_vars,
    'pip': 'requirements.txt',
    'working_dir': './',
    "excludes": ["/docs/", "*.ipynb", "*.md"],
}

submission_id = client.submit_job(
    entrypoint=job_entrypoint,
    runtime_env=job_runtime_env,
)
print(submission_id)

In [None]:
import time

# Poll the submitted Ray job until it reaches a terminal state or the polling
# budget is exhausted, echoing only log lines that look training-related.
POLL_INTERVAL_S = 30   # seconds between polls
MAX_POLLS = 60         # 60 polls x 30 s = 30 minutes
LOG_KEYWORDS = ('step', 'epoch', 'loss', 'loading', 'error', 'training')
TERMINAL_STATES = ("SUCCEEDED", "FAILED", "STOPPED")

print("Monitoring training progress...")
print("-" * 60)

prev_log_length = 0  # number of complete log lines already examined
for _ in range(MAX_POLLS):
    # Read the status BEFORE the logs: if the job finishes between the two
    # calls, the log fetch below still includes its final lines.
    status = client.get_job_status(submission_id)
    finished = status in TERMINAL_STATES

    logs = client.get_job_logs(submission_id)
    lines = logs.split('\n')
    # The last element of the split is either '' (log ends with a newline) or a
    # line still being written. Counting it as "seen" would skip the first new
    # line of the next poll, so hold it back until the job has finished.
    complete_lines = lines if finished else lines[:-1]

    # Only show new lines
    for line in complete_lines[prev_log_length:]:
        lowered = line.lower()
        if any(keyword in lowered for keyword in LOG_KEYWORDS):
            print(line)
    prev_log_length = len(complete_lines)

    if finished:
        # Do not print a success tick for FAILED / STOPPED jobs.
        marker = "✓" if status == "SUCCEEDED" else "✗"
        print(f"\n{marker} Job finished with status: {status}")
        break

    time.sleep(POLL_INTERVAL_S)
else:
    # The loop ran out of polls without seeing a terminal state.
    print(
        f"\nStopped monitoring after {MAX_POLLS * POLL_INTERVAL_S // 60} minutes; "
        f"last status: {status}. Re-run this cell to keep watching."
    )

In [28]:
# Ask Ray to stop the job submitted above (the recorded run returned True).
client.stop_job(submission_id)

True

In [None]:
# Bring the Ray cluster down once you are finished with it
# (presumably this deletes the RayCluster and releases its CPUs/GPUs - TODO confirm).
cluster.down()