In [None]:
# https://medium.com/engineering-at-netbook/running-yolo-v5-with-ddp-on-aws-8a4f07a77cf
# https://medium.com/pytorch/reduce-time-and-cost-by-running-distributed-elastic-pytorch-jobs-on-kubernetes-4f7ac3986307
# https://github.com/pytorch/elastic/issues/117#issuecomment-966877263
# https://v0-6.kubeflow.org/docs/reference/pytorchjob/v1/pytorch/#kubeflow.org
# https://github.com/kubeflow/training-operator/blob/master/docs/api/kubeflow.org_v1_generated.asciidoc#kubefloworgv1
# https://github.com/lehrig/kubeflow-ppc64le-components/blob/main/model-building/train-model-job/train-model-job.ipynb
# https://github.com/kubeflow/pipelines/issues/6880

In [None]:
# Clean-up cell: deletes EVERY PyTorchJob (`--all`), not only the one this
# notebook creates. No `-n`/`--namespace` flag is given, so kubectl's currently
# configured namespace is the one affected.
# NOTE(review): presumably run so that re-submitting the fixed-name job below
# does not collide with a previous run -- confirm before using in a shared namespace.
!kubectl delete pytorchjobs --all

In [None]:
from typing import Dict, Any, Any

# Container image used for the training worker pods; it is the default value
# of the `base_image` parameter of `train_model_distributed`.
BASE_IMAGE = "quay.io/ntlawrence/yolov5:pt1.12.1-yolov7.0-v2.0"

def train_model_distributed(
    shared_pvc_name: str,
    replicas: int,
    epochs: int = 100,
    base_image: str = BASE_IMAGE,
    namespace: str = "kubeflow-ntl",
    job_name: str = "yolov5-training",
) -> Dict[Any, Any]:
    """Create a Kubeflow ``PyTorchJob`` that runs YOLOv5 ``train.py`` on several workers.

    The manifest is assembled as a plain Python dict instead of being rendered
    into YAML text and parsed back. Caller-supplied values therefore reach the
    API with their original type: a PVC name such as ``"123"`` stays a string,
    and quotes or colons in an image reference cannot corrupt the document.

    Every worker pod runs ``python -m torch.distributed.run --nproc_per_node=1
    train.py ...`` in ``/yolov5``, requests one ``nvidia.com/gpu``, mounts the
    shared PVC at ``/shared`` and a memory-backed ``emptyDir`` at ``/dev/shm``,
    and has Istio sidecar injection disabled via annotation.

    Args:
        shared_pvc_name: Name of the PersistentVolumeClaim mounted at
            ``/shared``; the ``--cfg`` and ``--data`` paths of the training
            command point into it.
        replicas: Number of worker pods. Must be at least 1.
        epochs: Value passed to ``train.py`` as ``--epochs``.
        base_image: Container image for the worker pods.
        namespace: Namespace in which the job is created. Defaults to the
            namespace that used to be hard-coded.
        job_name: ``metadata.name`` of the job. Defaults to the name that used
            to be hard-coded.

    Returns:
        Whatever ``CustomObjectsApi.create_namespaced_custom_object`` returns
        for the created job.

    Raises:
        ValueError: If ``replicas`` is smaller than 1 (or cannot be converted
            to an integer).

    Errors raised by the kubernetes client while loading the in-cluster
    configuration or while calling the API propagate unchanged.
    """
    # Imports are kept function-local, as in the original definition, so the
    # function stays self-contained.
    from kubernetes import client, config

    # The previous text-template implementation accepted anything that
    # rendered as an integer (e.g. the string "2"); keep accepting that.
    replicas = int(replicas)
    if replicas < 1:
        raise ValueError(f"replicas must be at least 1, got {replicas}")

    # Elastic bounds: 1..4 as before, but never let the upper bound fall below
    # the number of workers actually requested.
    elastic_min_replicas = 1
    elastic_max_replicas = max(4, replicas)

    # Applied to both the job and the pod template.
    istio_opt_out = {"sidecar.istio.io/inject": "false"}

    train_command = [
        "python",
        "-m",
        "torch.distributed.run",
        "--nproc_per_node=1",
        "train.py",
        "--img=640",
        "--batch-size=96",
        "--noplots",
        f"--epochs={epochs}",
        "--weights=https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt",
        "--cache=/home/jovyan/cache",
        # NOTE(review): "/shared/shared/" is kept exactly as it was; confirm
        # the doubled directory is the real layout on the PVC.
        "--cfg=/shared/shared/yolov5s.yaml",
        "--data=/shared/bee_dataset/data.yaml",
        "--optimizer=Adam",
    ]

    worker_container = {
        "name": "pytorch",
        "image": base_image,
        "imagePullPolicy": "IfNotPresent",
        "workingDir": "/yolov5",
        "env": [{"name": "LOGLEVEL", "value": "DEBUG"}],
        "command": train_command,
        "resources": {"limits": {"nvidia.com/gpu": 1}},
        "securityContext": {"runAsGroup": 0},
        "volumeMounts": [
            {"mountPath": "/shared", "name": "shared"},
            {"mountPath": "/dev/shm", "name": "dshm"},
        ],
    }

    worker_volumes = [
        {
            "name": "shared",
            "persistentVolumeClaim": {"claimName": shared_pvc_name},
        },
        # Memory-backed scratch volume mounted over /dev/shm.
        {"name": "dshm", "emptyDir": {"medium": "Memory"}},
    ]

    # https://v0-6.kubeflow.org/docs/reference/pytorchjob/v1/pytorch/#kubeflow.org
    manifest = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "PyTorchJob",
        "metadata": {
            "name": job_name,
            "annotations": dict(istio_opt_out),
        },
        "spec": {
            "cleanPodPolicy": "Running",
            "elasticPolicy": {
                "rdzvBackend": "c10d",
                "minReplicas": elastic_min_replicas,
                "maxReplicas": elastic_max_replicas,
                "maxRestarts": 0,
            },
            "pytorchReplicaSpecs": {
                "worker": {
                    "replicas": replicas,
                    "restartPolicy": "Never",
                    "template": {
                        "metadata": {"annotations": dict(istio_opt_out)},
                        "spec": {
                            "containers": [worker_container],
                            "volumes": worker_volumes,
                        },
                    },
                },
            },
        },
    }

    # Uses the service-account credentials available inside a pod.
    config.load_incluster_config()
    custom_objects = client.CustomObjectsApi()

    return custom_objects.create_namespaced_custom_object(
        "kubeflow.org", "v1", namespace, "pytorchjobs", manifest, pretty=True
    )


# Submit the training job: two workers, 100 epochs, the default image, and the
# "yolov5-work" PVC as the shared volume. The call is the last expression of
# the cell, so the returned API response is shown as the cell output.
train_model_distributed(
    shared_pvc_name="yolov5-work",
    replicas=2,
    epochs=100,
    base_image=BASE_IMAGE,
)