In [None]:
def load_script_from_url(
    source: str,
    dest: str,
):
    """Copy or download ``source`` to the local path ``dest``.

    Imports are kept inside the function body so the function is
    self-contained when it is wrapped as a pipeline component.

    Args:
        source: URL of the script. ``file`` URLs are copied from the local
            filesystem (a directory is copied recursively); ``http``,
            ``https``, ``ftp`` and ``ftps`` URLs are handed to
            ``urllib.request.urlretrieve``.
        dest: Path to write to. Missing parent directories are created. A
            bare file name (no directory component) is written relative to
            the current working directory.

    Raises:
        ValueError: If ``source`` uses any other URL scheme.
    """
    import os
    import shutil
    from urllib.parse import urlparse
    from urllib.request import urlretrieve

    # Make target directories if needed. dirname() returns "" for a bare
    # file name and os.makedirs("") raises, so only create a parent when
    # there is one. exist_ok avoids a race with concurrent creators.
    parent_dirs = os.path.dirname(dest)
    if parent_dirs:
        os.makedirs(parent_dirs, exist_ok=True)

    source_details = urlparse(source)

    if source_details.scheme == "file":
        if os.path.isdir(source_details.path):
            shutil.copytree(source_details.path, dest)
        else:
            shutil.copyfile(source_details.path, dest)
    elif source_details.scheme in ("http", "https", "ftp", "ftps"):
        # NOTE(review): the standard urllib openers may not handle "ftps";
        # confirm that scheme actually works before relying on it.
        urlretrieve(source, filename=dest)
    else:
        raise ValueError("source does not use a supported url scheme")


# Wrap load_script_from_url as a pipeline component that runs on BASE_IMAGE.
load_script_from_url_comp = kfp.components.create_component_from_func(
    func=load_script_from_url,
    base_image=BASE_IMAGE,
)

In [None]:
def train_model_distributed(
    model: OutputPath(str),
    results: OutputPath(str),
    epochs: int,
    training_pvc_name: str,
    timeout: int = 60 * 60 * 8,
    base_image: str = "quay.io/ntlawrence/pytorchv1.13:1.0",
    replicas: int = 6,
    model_cfg_path: kfp.components.InputPath(str) = None,
    initial_weights_path: kfp.components.InputPath(str) = None,
    training_pvc_mount: str = None,
    dataset_pvc_name: str = None,
    dataset_subpath: str = None,
    batch_size: int = None,
    gpus: int = None,
):
    """Run yolov5 training as a PyTorchJob and collect the trained model.

    The component copies the model config and initial weights onto the
    training PVC, submits a PyTorchJob whose workers run ``train.py`` through
    ``torch.distributed.run``, streams the worker logs, waits for the job to
    finish and finally copies ``best.pt`` and ``results.csv`` from the PVC to
    the ``model`` and ``results`` output paths.

    NOTE(review): ``model_cfg_path``, ``initial_weights_path``,
    ``training_pvc_mount``, ``dataset_pvc_name``, ``dataset_subpath``,
    ``batch_size`` and ``gpus`` were read by the body but were not parameters,
    so the function could not run. They are appended with ``None`` defaults
    purely to keep the existing positional signature working; all but
    ``dataset_subpath`` are validated as required. The two ``*_path`` inputs
    are assumed to be path-style inputs -- confirm the passing style and the
    input names against the pipeline that calls this component.

    Args:
        model: Output path that receives the trained weights (``best.pt``).
        results: Output path that receives ``results.csv``.
        epochs: Value passed to ``train.py`` as ``--epochs``.
        training_pvc_name: Claim name of the PVC shared with the workers.
        timeout: Seconds to wait for the job to succeed or fail once the log
            stream has ended.
        base_image: Container image used for every worker replica.
            NOTE(review): the body previously referenced an undefined
            ``yolov5_base_image``; confirm the default image here provides
            ``/yolov5``.
        replicas: Number of worker replicas (also the fixed min/max size of
            the elastic policy).
        model_cfg_path: Local path of the model config, copied to
            ``input/config.yaml`` on the training PVC. Required.
        initial_weights_path: Local path of the initial weights, copied to
            ``input/weights.pt`` on the training PVC. Required.
        training_pvc_mount: Path inside this component under which the
            training PVC is reachable. Required.
        dataset_pvc_name: Claim name of the PVC mounted at ``/dataset`` in
            the workers. Required.
        dataset_subpath: Optional sub path of the dataset PVC to mount.
        batch_size: Multiplied by ``replicas`` to form the ``--batch-size``
            passed to ``train.py``. Required.
        gpus: GPUs requested per worker; also the number of processes per
            node. Required.

    Raises:
        ValueError: If a required input listed above was not supplied.
        RuntimeError: If the PyTorchJob is reported as failed.
    """
    import os
    import subprocess
    import shutil

    # Fail fast, before installing packages or touching the cluster.
    required_inputs = {
        "model_cfg_path": model_cfg_path,
        "initial_weights_path": initial_weights_path,
        "training_pvc_mount": training_pvc_mount,
        "dataset_pvc_name": dataset_pvc_name,
        "batch_size": batch_size,
        "gpus": gpus,
    }
    missing_inputs = sorted(
        name for name, value in required_inputs.items() if value is None
    )
    if missing_inputs:
        raise ValueError(
            "train_model_distributed is missing required inputs: "
            + ", ".join(missing_inputs)
        )

    # Install kubeflow-training, hack/fix other dependents
    # The uninstall is best effort (no check=True); the installs must succeed.
    subprocess.run("pip uninstall -y kfp", shell=True)
    subprocess.run("pip install kubernetes==26.1.0", shell=True, check=True)
    subprocess.run("pip install kubeflow-training==1.6.0", shell=True, check=True)

    from kubernetes.client import (
        V1PodTemplateSpec,
        V1ObjectMeta,
        V1PodSpec,
        V1Container,
        V1EnvVar,
        V1ResourceRequirements,
        V1VolumeMount,
        V1Volume,
        V1PersistentVolumeClaimVolumeSource,
        V1EmptyDirVolumeSource,
        V1OwnerReference,
        V1ObjectFieldSelector,
        V1EnvVarSource,
    )

    from kubeflow.training import (
        V1ReplicaSpec,
        KubeflowOrgV1PyTorchJob,
        KubeflowOrgV1PyTorchJobSpec,
        KubeflowOrgV1ElasticPolicy,
        V1RunPolicy,
        TrainingClient,
    )

    from kubeflow.training.constants import constants
    from kubernetes import client, config, watch

    def copyf(source: str, dest: str):
        """
        Copies a file or directory,
        creating destination parent dirs as needed
        """
        parent_dirs = os.path.dirname(dest)
        if parent_dirs:
            os.makedirs(parent_dirs, exist_ok=True)

        if os.path.isdir(source):
            shutil.copytree(source, dest)
        else:
            shutil.copyfile(source, dest)

    def wait_for_pod_ready(name: str, namespace: str = "{{workflow.namespace}}"):
        """Waits for a Pod to leave the Pending phase.
        At that point all containers have been started.
        Returns quietly if the pod is deleted first or the watch
        ends (120 second watch timeout).
        """
        config.load_incluster_config()
        w = watch.Watch()
        core_v1 = client.CoreV1Api()

        # Watching a specific pod is done with a field selector on the name.
        # https://github.com/kubernetes-client/python/issues/467
        for event in w.stream(
            func=core_v1.list_namespaced_pod,
            namespace=namespace,
            field_selector=f"metadata.name={name}",
            timeout_seconds=120,
        ):
            # Phases: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
            if event["object"].status.phase not in {"Pending"}:
                w.stop()
                return
            # event.type: ADDED, MODIFIED, DELETED
            if event["type"] == "DELETED":
                print(f" {name} deleted before it started")
                w.stop()
                return

    # The "training" pvc is used to share data between the torch jobs and
    # this component. We need to copy the config and initial weights to it.
    copyf(model_cfg_path, os.path.join(training_pvc_mount, "input", "config.yaml"))
    copyf(
        initial_weights_path, os.path.join(training_pvc_mount, "input", "weights.pt")
    )

    # Create a directory to store the output of training, we'll
    # copy data from here to output paths
    output_dir = os.path.join(training_pvc_mount, "output")
    os.makedirs(output_dir, exist_ok=True)

    ###########################################
    # PyTorchJob template for the training job
    ###########################################

    # Define GPU device list and job name
    devices = ",".join([str(d) for d in range(gpus)])
    job_name = "{{workflow.name}}-training"
    # A separate local keeps the batch_size input untouched.
    total_batch_size = batch_size * replicas

    # An owner reference for the workflow
    # When the workflow is deleted, the torch job is
    # garbage collected.
    # NOTE(review): Argo Workflow objects normally use the
    # argoproj.io/v1alpha1 API version -- confirm that "v1" is intended here.
    workflow_ownership = V1OwnerReference(
        api_version="v1",
        kind="Workflow",
        name="{{workflow.name}}",
        uid="{{workflow.uid}}",
    )

    # Pod definition for each worker replica
    # Defines the container image, command, and volume mounts
    # yolov5 handles the distributed training for us, but
    # there is an example of how to write your own
    # training code here: https://github.com/kubeflow/training-operator/blob/master/examples/pytorch/elastic/imagenet/
    # https://github.com/ultralytics/yolov5/issues/475
    # https://github.com/pytorch/pytorch/issues/65992#issuecomment-954392973
    pod_template = V1PodTemplateSpec(
        metadata=V1ObjectMeta(
            name=job_name,
            namespace="{{workflow.namespace}}",
            owner_references=[workflow_ownership],
            annotations={"sidecar.istio.io/inject": "false"},
        ),
        spec=V1PodSpec(
            containers=[
                V1Container(
                    name=constants.PYTORCHJOB_CONTAINER,
                    image=base_image,
                    image_pull_policy="IfNotPresent",
                    working_dir="/yolov5",
                    command=[
                        "python",
                        "-m",
                        "torch.distributed.run",
                        # "--nproc_per_node",
                        # f"{gpus}",
                        # "--nnodes",
                        # "$(WORLD_SIZE)",
                        # "--node_rank",
                        # "$(RANK)",
                        # "--rdzv_backend",
                        # "etcd",
                        # "--rdzv_endpoint",
                        # f"etcd-server-ntl:2379",
                        "train.py",
                        f"--device={devices}",
                        "--img=640",
                        f"--batch-size={total_batch_size}",
                        "--noplots",
                        f"--epochs={epochs}",
                        "--weights=/input/weights.pt",
                        "--cache=/home/jovyan/cache",
                        "--cfg=/input/config.yaml",
                        "--data=/dataset/data.yaml",
                        "--optimizer=Adam",
                    ],
                    env=[
                        V1EnvVar(name="LOGLEVEL", value="DEBUG"),
                        V1EnvVar(name="NCCL_DEBUG", value="DEBUG"),
                        V1EnvVar(name="TORCH_CPP_LOG_LEVEL", value="INFO"),
                        # V1EnvVar(name="TORCH_DISTRIBUTED_DEBUG", value="DETAIL"),
                        V1EnvVar(name="C10D_DEBUG_MODE", value="DETAIL"),
                        V1EnvVar(name="PET_VERBOSE", value="1"),
                        # V1EnvVar(
                        #    name="RANK",
                        #    value_from=V1EnvVarSource(
                        #        field_ref=V1ObjectFieldSelector(
                        #            field_path="metadata.labels['replica-index']"
                        #        )
                        #    ),
                        # ),
                        # V1EnvVar(name="WORLD_SIZE", value=f"{replicas}"),
                    ],
                    # Allocate GPUs to each pod
                    resources=V1ResourceRequirements(
                        limits={"nvidia.com/gpu": f"{gpus}"}
                    ),
                    volume_mounts=[
                        # Mount the input files directory from the training pvc
                        V1VolumeMount(
                            mount_path="/input", name="training", sub_path="input"
                        ),
                        # This mounts the output directory of the training pvc
                        # to a path where yolov5 stores the model and results.
                        # And then we can copy those files back into kubeflow,
                        # Because this component has that directory accessible
                        # to it.
                        V1VolumeMount(
                            mount_path="/yolov5/runs/",
                            name="training",
                            sub_path="output",
                        ),
                        # PyTorch requires shared memory on each pod
                        V1VolumeMount(mount_path="/dev/shm", name="dshm"),
                        # Mount the dataset pvc
                        V1VolumeMount(
                            mount_path="/dataset",
                            sub_path=dataset_subpath,
                            name="dataset",
                        ),
                    ],
                )
            ],
            volumes=[
                V1Volume(
                    name="training",
                    persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                        claim_name=training_pvc_name
                    ),
                ),
                V1Volume(
                    name="dataset",
                    persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                        claim_name=dataset_pvc_name
                    ),
                ),
                V1Volume(
                    name="dshm", empty_dir=V1EmptyDirVolumeSource(medium="Memory")
                ),
            ],
            service_account_name="default-editor",
        ),
    )

    # The PyTorchJob supports the concept of a master replica, but if one is
    # not specified the first worker takes that role. Having only workers
    # simplifies the template a bit. This is how the imagenet example is
    # setup. https://github.com/kubeflow/training-operator/blob/master/examples/pytorch/elastic/imagenet/
    worker_spec = V1ReplicaSpec(
        replicas=replicas, restart_policy="Never", template=pod_template
    )

    # The owner references and replica spec are used to define the torch job
    pytorchjob = KubeflowOrgV1PyTorchJob(
        api_version=f"{constants.KUBEFLOW_GROUP}/{constants.OPERATOR_VERSION}",
        kind=constants.PYTORCHJOB_KIND,
        metadata=V1ObjectMeta(name=job_name, owner_references=[workflow_ownership]),
        spec=KubeflowOrgV1PyTorchJobSpec(
            # c10d is the most commonly used because it doesn't require additional
            # packages. The primary advantage that elastic solutions offer is the
            # ability to use cheap hardware in the cloud that can be taken away
            # at any time. That doesn't apply so much for Power servers that are
            # running on-premise. Here we default to a fixed size of the
            # number of replicas.
            elastic_policy=KubeflowOrgV1ElasticPolicy(
                rdzv_backend="c10d",
                # rdzv_host="etcd-client.kubeflow-ntl.svc.cluster.local",
                # rdzv_port=2379,
                rdzv_id=job_name,
                n_proc_per_node=gpus,
                min_replicas=replicas,
                max_replicas=replicas,
                max_restarts=0,
            ),
            run_policy=V1RunPolicy(clean_pod_policy="None"),
            pytorch_replica_specs={"Worker": worker_spec},
        ),
    )

    ##########################
    # Submit training job
    ##########################
    training_client = TrainingClient()
    training_client.create_pytorchjob(pytorchjob)

    def wait_for_job(expected_conditions, wait_timeout):
        """Waits until the job reports one of the expected conditions.

        A RuntimeError from the client is reported and otherwise ignored
        (see the linked issue); the final job state is checked separately
        once all waiting is done.
        """
        try:
            training_client.wait_for_job_conditions(
                job_name,
                expected_conditions=expected_conditions,
                job_kind=constants.PYTORCHJOB_KIND,
                timeout=wait_timeout,
            )
        except RuntimeError as err:
            # https://github.com/kubeflow/training-operator/issues/1806#issue-1708084586
            print(f"Ignoring RuntimeError while waiting for {job_name}: {err}")

    # Give the job up to two minutes to start (or finish/fail right away)
    wait_for_job(
        {
            constants.JOB_CONDITION_RUNNING,
            constants.JOB_CONDITION_SUCCEEDED,
            constants.JOB_CONDITION_FAILED,
        },
        120,
    )

    # Wait for pods to be ready, must do this before reading logs
    pod_names = training_client.get_job_pod_names(name=job_name, is_master=None)
    for pod in pod_names:
        wait_for_pod_ready(pod)

    # stream logs for all workers
    # (most of the interesting stuff is in worker 0)
    training_client.get_job_logs(
        name=job_name,
        is_master=False,
        container=constants.PYTORCHJOB_CONTAINER,
        follow=True,
    )

    # No more logs means workers have finished, wait for the rest of the job
    wait_for_job(
        {
            constants.JOB_CONDITION_SUCCEEDED,
            constants.JOB_CONDITION_FAILED,
        },
        timeout,
    )

    if training_client.is_job_failed(name=job_name, job_kind=constants.PYTORCHJOB_KIND):
        raise RuntimeError(f"Job {job_name} Failed!")

    ########################################################
    # Copy trained model and results to output parameters
    ########################################################
    # Built from the same base as the input copies above, so the files are
    # read from the PVC directory the workers wrote to.
    experiment_dir = os.path.join(output_dir, "train", "exp")
    os.makedirs(os.path.dirname(model), exist_ok=True)
    os.makedirs(os.path.dirname(results), exist_ok=True)
    shutil.copyfile(os.path.join(experiment_dir, "weights", "best.pt"), model)
    shutil.copyfile(os.path.join(experiment_dir, "results.csv"), results)


# Wrap train_model_distributed as a pipeline component that runs on BASE_IMAGE.
train_model_comp = kfp.components.create_component_from_func(
    func=train_model_distributed, base_image=BASE_IMAGE
)

In [None]:
# Upload model