In [None]:
import os
import yaml

# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication
from codeflare_sdk.job import RayJobClient
from time import sleep

In [None]:
# Show the metadata (e.g. version) of the codeflare-sdk package installed in this kernel's environment.
%pip show codeflare-sdk

In [None]:
#parameters
# Run-time inputs for this notebook. Every "has to be specified" value is a placeholder
# that must be replaced before the notebook is run (presumably injected by the
# notebook runner — TODO confirm how these are supplied).
namespace = "default"  # namespace handed to ClusterConfiguration
ray_image = "has to be specified"  # container image handed to ClusterConfiguration (image=)
local_queue = "has to be specified"  # local queue name handed to ClusterConfiguration (local_queue=)
openshift_api_url = "has to be specified"  # API server URL used by TokenAuthentication (server=)
kubernetes_user_bearer_token = "has to be specified"  # bearer token used for login and for the Ray job client's Authorization header
num_gpus = "has to be specified"  # GPU count for the worker request and the job entrypoint; must be convertible with int()

In [None]:
# Authenticate against the cluster API server using the bearer token from the
# parameters cell, then log in so later SDK calls can reach the cluster.
# NOTE(review): skip_tls=True presumably turns off TLS verification for the
# API connection — confirm this is only used against test clusters.
auth = TokenAuthentication(
    server=openshift_api_url,
    token=kubernetes_user_bearer_token,
    skip_tls=True,
)
auth.login()

In [None]:
# Describe the Ray cluster used for the MNIST run: one head node without GPUs
# and a single worker that requests `num_gpus` GPUs.
cluster_config = ClusterConfiguration(
    # identity and placement
    name="mnisttest",
    namespace=namespace,
    image=ray_image,
    local_queue=local_queue,
    # head node resources
    head_cpus=1,
    head_memory=4,
    head_extended_resource_requests={"nvidia.com/gpu": 0},
    # worker group resources
    num_workers=1,
    worker_cpu_requests=1,
    worker_cpu_limits=1,
    worker_memory_requests=1,
    worker_memory_limits=4,
    worker_extended_resource_requests={"nvidia.com/gpu": int(num_gpus)},
    # write_to_file=True: a later cell reads the generated spec back from
    # ~/.codeflare/resources/ in order to patch it
    write_to_file=True,
    verify_tls=False,
)
cluster = Cluster(cluster_config)

In [None]:
# Load the cluster spec file from the CodeFlare resources directory so it can
# be patched in place (presumably the file written because the cluster was
# configured with write_to_file=True, and presumably picked up by cluster.up()
# — verify against the SDK).
directory_path = os.path.expanduser("~/.codeflare/resources/")
outfile = os.path.join(directory_path, "mnisttest.yaml")
cluster_yaml = None
# NOTE(review): FullLoader can build more than plain data types; if this file
# is always plain YAML, yaml.safe_load would be the safer choice — confirm.
with open(outfile) as spec_file:
    cluster_yaml = yaml.load(spec_file, Loader=yaml.FullLoader)

# Give the first worker group's pod spec a toleration for the GPU taint.
# This replaces any tolerations already present on that pod spec.
gpu_toleration = {"key": "nvidia.com/gpu", "value": "NONE", "effect": "NoSchedule"}
worker_pod_spec = cluster_yaml["spec"]["workerGroupSpecs"][0]["template"]["spec"]
worker_pod_spec["tolerations"] = [gpu_toleration]

# Write the patched spec back over the original file.
with open(outfile, "w") as spec_file:
    yaml.dump(cluster_yaml, spec_file, default_flow_style=False)

In [None]:
# Bring up the cluster
# (must run after the spec file has been patched in the previous cell)
cluster.up()
# Wait until status is updated
# (blocks this cell until the SDK reports the cluster as ready)
cluster.wait_ready()

In [None]:
# Display the cluster's current status as the cell output.
cluster.status()

In [None]:
# Display the cluster's details as the cell output.
cluster.details()

In [None]:
# Create a Ray job client that talks to the cluster's dashboard, authenticating
# with the same bearer token that was used to log in.
ray_dashboard = cluster.cluster_dashboard_uri()
header = {"Authorization": f"Bearer {kubernetes_user_bearer_token}"}
# NOTE(review): verify=False presumably disables TLS certificate verification
# for dashboard requests — confirm this is only used against test clusters.
client = RayJobClient(address=ray_dashboard, headers=header, verify=False)

# Submit the MNIST training script. The working directory and the pip
# requirements file are paths expected to exist in the notebook's environment.
submission_id = client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={
        "working_dir": "/opt/app-root/notebooks/..data",
        "pip": "/opt/app-root/notebooks/requirements.txt",
    },
    # num_gpus is declared as a string in the parameters cell; convert it to an
    # integer here, the same way the worker GPU request does, so Ray receives
    # a numeric GPU count rather than a string.
    entrypoint_num_gpus=int(num_gpus),
)

In [None]:
# Poll the job once per second until it reaches a terminal state.
# Waiting only for "SUCCEEDED" would loop forever when the job fails or is
# stopped, and would make the failure branch below unreachable.
terminal_states = ("SUCCEEDED", "FAILED", "STOPPED")
finished = False
while not finished:
    sleep(1)
    status = client.get_job_status(submission_id)
    finished = status in terminal_states

# Report the outcome based on the final status, not on the loop flag.
if status == "SUCCEEDED":
    print("Job completed Successfully !")
else:
    print("Job failed !")

In [None]:
# Tear down the cluster that was brought up earlier in this notebook.
cluster.down()