In [None]:
%pip show kubeflow-training  # to check the installed kubeflow-training SDK version during a test run

In [42]:
import sys
sys.path.append("../notebooks")  # add ../notebooks to the import path so kfto_sdk_mnist can be imported
from kfto_sdk_mnist import train_func
from kubeflow.training import TrainingClient
from kubernetes import client as c
import time

In [41]:
# parameters
# NOTE(review): every value below is a placeholder string; presumably it is
# substituted with the real value before the notebook is executed — TODO confirm.
num_gpus = "${num_gpus}"  # passed as the "gpu" entry of resources_per_worker
openshift_api_url = "${api_url}"  # host of the Kubernetes API used for the LocalQueue lookup
namespace = "${namespace}"  # namespace of the LocalQueue lookup and of the training job
token = "${password}"  # sent as a Bearer token; note the placeholder name differs from the variable name
training_image= "${training_image}"  # base image of the training job

In [None]:
def GetDefaultLocalQueue(namespace: str):
    """
    Return the name of the default Kueue LocalQueue in the given namespace.

    Lists the "localqueues" custom objects (group "kueue.x-k8s.io", version
    "v1beta1") in the namespace and selects the one whose annotation
    "kueue.x-k8s.io/default-queue" == "true".

    The API client is built from the notebook-level ``openshift_api_url`` and
    ``token`` values, with TLS certificate verification disabled.

    Args:
        namespace: namespace whose LocalQueues are inspected.

    Returns:
        The ``metadata.name`` of the single LocalQueue annotated as default.

    Raises:
        RuntimeError: if no LocalQueue, or more than one LocalQueue, is
            annotated as default in the namespace.
    """
    group   = "kueue.x-k8s.io"
    version = "v1beta1"
    plural  = "localqueues"

    conf = c.Configuration()
    conf.host = openshift_api_url
    conf.verify_ssl = False
    conf.api_key = {"authorization": f"Bearer {token}"}

    api_client = c.ApiClient(configuration=conf)
    api = c.CustomObjectsApi(api_client)

    resp = api.list_namespaced_custom_object(
        group=group, version=version, namespace=namespace, plural=plural
    )

    default_q = None
    for item in resp.get("items", []):
        # "annotations" may be absent or explicitly None; treat both as empty.
        ann = item.get("metadata", {}).get("annotations") or {}
        if ann.get("kueue.x-k8s.io/default-queue") == "true":
            if default_q is not None:
                # default_q already holds the first queue's name (a string),
                # so it is interpolated directly rather than indexed.
                raise RuntimeError(
                    f"multiple LocalQueues annotated as default in {namespace}: "
                    f"{default_q} and {item['metadata']['name']}"
                )
            default_q = item['metadata']['name']

    if default_q is None:
        raise RuntimeError(f"no LocalQueue annotated as default in namespace {namespace}")

    return default_q

In [None]:
# Authorization header in "Bearer <token>" form.
# NOTE(review): api_key is not passed to anything in this cell.
api_key = {"authorization": f"Bearer {token}"}
# Disabled attempt at an explicit client configuration; TrainingClient() below
# is created without arguments instead.
# config = c.Configuration(host=openshift_api_url, api_key=token)
# config.verify_ssl = False
tc = TrainingClient()

# Name of the namespace's default LocalQueue; used as the
# "kueue.x-k8s.io/queue-name" label when the job is created.
default_local_queue=GetDefaultLocalQueue(namespace)

In [None]:
import os

# Package-index settings are read from the notebook's environment.
# os.environ.get() returns None for an unset variable; unset (or empty) values
# are left out instead of being forwarded as None to the job.
pip_index_url = os.environ.get("PIP_INDEX_URL")
pip_trusted_host = os.environ.get("PIP_TRUSTED_HOST")

job_env_vars = {
   "NCCL_DEBUG": "INFO",
   "TORCH_DISTRIBUTED_DEBUG": "DETAIL",
}
if pip_index_url:
   job_env_vars["DEFAULT_PIP_INDEX_URL"] = pip_index_url
if pip_trusted_host:
   job_env_vars["PIP_TRUSTED_HOST"] = pip_trusted_host

# Only override the index URL when one is configured; otherwise the argument is
# omitted so that create_job applies its own default.
pip_index_kwargs = {"pip_index_url": pip_index_url} if pip_index_url else {}

# Submit the "pytorch-ddp" job running train_func on a single worker, labelled
# with the default LocalQueue name so that it is submitted through that queue.
tc.create_job(
   name="pytorch-ddp",
   namespace=namespace,
   train_func=train_func,
   num_workers=1,
   resources_per_worker={"gpu": num_gpus},
   base_image=training_image,
   packages_to_install=["torchvision==0.19.0","minio==7.2.13"],
   env_vars=job_env_vars,
   labels={
       "kueue.x-k8s.io/queue-name": default_local_queue,
   },
   **pip_index_kwargs,
)

In [None]:
# Poll once per second until the job reports success. A failed job never
# becomes "succeeded", so check for failure on every iteration and raise
# instead of looping forever.
# NOTE(review): no overall timeout is applied here; a job that stays pending
# is only bounded by whatever limit the caller of this notebook enforces.
while not tc.is_job_succeeded(name="pytorch-ddp", namespace=namespace):
    if tc.is_job_failed(name="pytorch-ddp", namespace=namespace):
        raise RuntimeError(f"PyTorchJob pytorch-ddp failed in namespace {namespace}")
    time.sleep(1)
print("PytorchJob Succeeded!")

In [None]:
# NOTE(review): fixed 10-second pause after the job has succeeded; the reason
# is not evident from this notebook — TODO confirm what it is waiting for.
time.sleep(10)