In [None]:
import os
import yaml

# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication
from codeflare_sdk.ray.client import RayJobClient
from time import sleep

In [None]:
# Show the codeflare-sdk package metadata as installed in this kernel's
# environment, so the version in use is recorded in the notebook output.
%pip show codeflare-sdk

In [None]:
#parameters
# Run-time inputs for this notebook. Every "has to be specified" value is a
# placeholder that must be overridden before the remaining cells can work.

# Connection details used to log in to the cluster API.
openshift_api_url = "has to be specified"
kubernetes_user_bearer_token = "has to be specified"

# Where the Ray cluster is created and which container image it runs.
namespace = "default"
ray_image = "has to be specified"

# GPUs requested per worker; converted with int() where the cluster is defined.
num_gpus = "has to be specified"

In [None]:
# Log in to the cluster API with the bearer token supplied in the parameters
# cell. NOTE(review): skip_tls=True presumably disables TLS certificate
# verification for the API server -- confirm this is acceptable outside tests.
auth = TokenAuthentication(
    server=openshift_api_url,
    token=kubernetes_user_bearer_token,
    skip_tls=True,
)
auth.login()

In [None]:
# Define the Ray cluster "mnisttest": one head node without GPUs and a single
# worker that requests `num_gpus` NVIDIA GPUs.
cluster = Cluster(
    ClusterConfiguration(
        name="mnisttest",
        namespace=namespace,
        image=ray_image,
        # Head node resources.
        head_cpu_requests=2,
        head_cpu_limits=2,
        head_memory_requests=6,
        head_memory_limits=8,
        head_extended_resource_requests={"nvidia.com/gpu": 0},
        # Worker group resources.
        num_workers=1,
        worker_cpu_requests=1,
        worker_cpu_limits=1,
        worker_memory_requests=1,
        worker_memory_limits=4,
        worker_extended_resource_requests={"nvidia.com/gpu": int(num_gpus)},
        # The generated definition is written to disk (write_to_file=True);
        # a later cell reads and patches that file.
        write_to_file=True,
        verify_tls=False,
    )
)

In [None]:
# Load the cluster definition from disk (presumably the file produced by
# write_to_file=True when the cluster object was created -- verify the path
# against the SDK version in use).
directory_path = os.path.expanduser("~/.codeflare/resources/")
outfile = os.path.join(directory_path, "mnisttest.yaml")
with open(outfile) as f:
    cluster_yaml = yaml.load(f, Loader=yaml.FullLoader)

# Add toleration for GPU nodes to Ray cluster worker pod.
# This replaces whatever "tolerations" list the first worker group had.
gpu_toleration = {"key": "nvidia.com/gpu", "value": "NONE", "effect": "NoSchedule"}
worker_pod_spec = cluster_yaml["spec"]["workerGroupSpecs"][0]["template"]["spec"]
worker_pod_spec["tolerations"] = [gpu_toleration]

# Write the patched definition back to the same file.
with open(outfile, "w") as f:
    yaml.dump(cluster_yaml, f, default_flow_style=False)

In [None]:
# Bring up the cluster defined (and patched on disk) in the previous cells.
cluster.up()
# Block until the cluster reports ready. The dashboard check is skipped
# because the route naming changed in the kuberay operator, so the check
# would not find the dashboard route.
cluster.wait_ready(dashboard_check=False)

In [None]:
# Query the SDK for the cluster's current status; the call's result is the
# cell output.
cluster.status()

In [None]:
# Query the SDK for the cluster's details; the call's result is the cell
# output.
cluster.details()

In [None]:
# Access dashboard directly via internal service (notebook runs inside the cluster)
# The service mnisttest-head-svc exposes the Ray dashboard on port 8265
ray_dashboard = f"http://mnisttest-head-svc.{namespace}.svc.cluster.local:8265"
print(f"Ray dashboard URL: {ray_dashboard}")

# The job client authenticates with the same bearer token used for login.
header = {"Authorization": f"Bearer {kubernetes_user_bearer_token}"}
ray_client = RayJobClient(address=ray_dashboard, headers=header, verify=False)

# Environment for the job. Ray requires every env_vars value to be a string,
# so the pip index settings are forwarded only when they are actually set in
# the notebook's environment (os.environ.get would otherwise yield None).
job_env_vars = {"NCCL_DEBUG": "INFO"}
for pip_var in ("PIP_INDEX_URL", "PIP_TRUSTED_HOST"):
    pip_value = os.environ.get(pip_var)
    if pip_value is not None:
        job_env_vars[pip_var] = pip_value

submission_id = ray_client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={
        "env_vars": job_env_vars,
        "working_dir": "/opt/app-root/notebooks/..data",
        "pip": "/opt/app-root/notebooks/requirements.txt",
    },
    # num_gpus may arrive as a string parameter; convert it exactly as the
    # cluster definition does so a number is always submitted.
    entrypoint_num_gpus=int(num_gpus),
)

In [None]:
import requests

# Poll the job once per second until it reaches a terminal state, tolerating
# up to `max_errors` consecutive transient errors from the status endpoint.
status = None
error_count = 0
max_errors = 60  # Max consecutive errors before giving up

while status != "SUCCEEDED":
    sleep(1)
    try:
        status = ray_client.get_job_status(submission_id)
    except (RuntimeError, requests.exceptions.ConnectionError, ConnectionError) as e:
        error_count += 1
        print(f"Transient error ({error_count}/{max_errors}) checking job status: {type(e).__name__}: {e}")
        if error_count >= max_errors:
            print("Too many consecutive errors, giving up")
            break
    else:
        error_count = 0  # Reset on success
        print(f"Job status: {status}")
        if status == "FAILED":
            print("Job failed!")
            break
        if status == "STOPPED":
            # STOPPED is terminal too: a stopped job never reaches SUCCEEDED,
            # so polling must end here instead of looping forever.
            print("Job was stopped!")
            break

if status == "SUCCEEDED":
    print("Job completed Successfully !")
else:
    print(f"Job did not succeed. Final status: {status}")

sleep(10) # Brief pause before cleanup

In [None]:
# Cleanup: tear down the Ray cluster that was brought up earlier in this
# notebook.
cluster.down()