In [1]:
%load_ext lab_black

In [2]:
import kfp
from kfp.components import InputPath, OutputPath
import kfp.dsl as dsl
from kfp.dsl import PipelineConf, data_passing_methods
from kubernetes.client.models import V1Volume, V1PersistentVolumeClaimVolumeSource
import numpy as np
import os
from typing import List, NamedTuple

# Container image passed as ``base_image`` to every
# ``create_component_from_func`` call in this notebook.
# NOTE(review): the tag suggests a ppc64le build with Python 3.9 — confirm it
# matches the architecture of the target cluster.
BASE_IMAGE = (
    "quay.io/ibm/kubeflow-notebook-image-ppc64le:elyra3.14.1-py3.9-tf2.9.2-pt1.12.1-v0"
)

In [3]:
def prepare_shared_storage():
    """Populate ``/workspace`` with the MNIST data set and the training script.

    Both the training and the test split of MNIST are downloaded through
    torchvision into ``/workspace``. Afterwards ``mnist.py`` is fetched from
    GitHub and stored as ``/workspace/mnist.py``.

    All imports are local to the function; it is converted into a pipeline
    component right after its definition, so it presumably has to be
    self-contained.
    """
    from torchvision.datasets import MNIST

    # ``import urllib`` on its own does not guarantee that the ``request``
    # submodule is loaded, so import the submodule explicitly.
    import urllib.request

    # Download training data (instantiating the data set triggers the
    # download; the resulting objects themselves are not needed).
    MNIST("/workspace", download=True, train=True)
    MNIST("/workspace", download=True, train=False)

    # Download python script
    urllib.request.urlretrieve(
        "https://raw.githubusercontent.com/ntl-ibm/kubeflow-ppc64le-examples/multi-gpu-yolov5/pytorch_distributed/mnist/mnist.py",
        "/workspace/mnist.py",
    )


# Wrap the function as a pipeline component that runs inside BASE_IMAGE.
prepare_shared_storage_comp = kfp.components.create_component_from_func(
    prepare_shared_storage, base_image=BASE_IMAGE
)

In [4]:
def train_and_test_model(
    shared_pvc_name: str, mlpipeline_ui_metadata_path: OutputPath(str)
):
    """Run the distributed MNIST training job and publish its UI metadata.

    The helper package ``pytorch_distributed_kf_tools`` is installed from
    GitHub, then used to launch a PyTorch job that executes
    ``/workspace/mnist.py`` through ``torch.distributed.run``. When the job
    call returns, the metadata file written to the shared volume is copied to
    the component's output path.

    Args:
        shared_pvc_name: Name of the PVC mounted at ``/workspace`` in the
            training job.
        mlpipeline_ui_metadata_path: Output location that receives a copy of
            ``/workspace/metadata.json``.

    Raises:
        subprocess.CalledProcessError: If installing the helper package fails.
    """
    import shutil
    import subprocess
    import sys

    # Install the helper package into the interpreter that is running this
    # code. ``check=True`` makes a failed install stop here instead of
    # surfacing later as an ImportError; passing an argument list avoids
    # going through a shell.
    subprocess.run(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "pytorch_distributed_kf_tools @ "
            "git+https://github.com/ntl-ibm/kubeflow-ppc64le-examples@multi-gpu-yolov5#"
            "subdirectory=pytorch_distributed/pytorch_distributed_kf_tools",
        ],
        check=True,
    )

    # Importable only after the install above has succeeded.
    from pytorch_distributed_kf_tools.deploy import (
        run_pytorch_job,
        PvcMount,
        OwningWorkFlow,
    )

    # NOTE(review): the double-brace workflow strings below are presumably
    # placeholders substituted by the workflow engine before this code
    # runs — confirm. They must stay byte-identical.
    run_pytorch_job(
        owning_workflow=OwningWorkFlow(
            name="{{workflow.name}}", uid="{{workflow.uid}}"
        ),
        namespace="{{workflow.namespace}}",
        pytorch_job_name="{{workflow.name}}",
        pvcs=[
            PvcMount(
                pvc_name=shared_pvc_name,
                mount_path="/workspace",
            )
        ],
        command=[
            "python",
            "-m",
            "torch.distributed.run",
            "/workspace/mnist.py",
            "--root_dir=/workspace",
            "--data_dir=/workspace",
            "--model=/workspace/mnist_model.pt",
            "--kubeflow_ui_metadata=/workspace/metadata.json",
            "--max_epochs=10",
        ],
        num_workers=6,
        gpus_per_worker=1,
        worker_image="quay.io/ntlawrence/pytorchv1.13:1.1",
    )

    # Hand the metadata produced on the shared volume to the pipeline output.
    shutil.copyfile("/workspace/metadata.json", mlpipeline_ui_metadata_path)


# Wrap the training function as a pipeline component that runs inside BASE_IMAGE.
train_and_test_model_comp = kfp.components.create_component_from_func(
    train_and_test_model, base_image=BASE_IMAGE
)

In [5]:
def copy_data(source: str, dest: str):
    """Copy a file or a directory tree from ``source`` to ``dest``.

    Args:
        source: Path of the existing file or directory to copy.
        dest: Path the copy is written to. Missing parent directories of
            this path are created first.
    """
    import os
    import shutil

    # Make target directories if needed. The directory to create is the
    # *parent* of dest (its dirname), not its final path component. The
    # dirname is empty when dest is a bare name, in which case there is
    # nothing to create.
    parent_dir = os.path.dirname(dest)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if os.path.isdir(source):
        shutil.copytree(source, dest)
    else:
        shutil.copyfile(source, dest)


# Wrap the copy helper as a pipeline component that runs inside BASE_IMAGE.
copy_data_comp = kfp.components.create_component_from_func(
    copy_data, base_image=BASE_IMAGE
)

In [6]:
from kubernetes.client import V1VolumeMount


@dsl.pipeline(
    name="Handwritten digit classification",
    description="An example pipeline that trains using distributed pytorch",
)
def mnist_pipeline(notebook_pvc_name: str = "pytorch-minst-volume"):
    # Pipeline layout:
    #   1. create a shared read-write-many volume,
    #   2. fill it with the data set and training script,
    #   3. train and test the model on that volume,
    #   4. copy the trained model to the PVC named by notebook_pvc_name.
    # NOTE(review): the default PVC name is spelled "minst" — confirm it
    # matches the PVC that actually exists in the namespace.

    # 1. Shared volume used by every later step.
    shared_volume_op = dsl.VolumeOp(
        name="Create shared volume for training",
        resource_name="shared-pvc",
        modes=dsl.VOLUME_MODE_RWM,
        size="4Gi",
        set_owner_reference=True,
    )
    shared_volume = shared_volume_op.volume

    # 2. Download data and script onto the shared volume.
    prepare_task = prepare_shared_storage_comp()
    prepare_task.add_pvolumes({"/workspace": shared_volume})

    # 3. Train and test; the job needs the claim name of the shared volume.
    training_task = train_and_test_model_comp(
        shared_pvc_name=shared_volume.persistent_volume_claim.claim_name
    )
    training_task.add_pvolumes({"/workspace": shared_volume})
    training_task.set_display_name("Train and Test Model")
    training_task.after(prepare_task)

    # 4. Copy the model from the shared volume to the target PVC, which is
    #    mounted at /target.
    export_task = copy_data_comp(
        source="/workspace/mnist_model.pt", dest="/target/mnist_model.pt"
    )
    export_task.add_pvolumes({"/workspace": shared_volume})
    target_claim = V1PersistentVolumeClaimVolumeSource(claim_name=notebook_pvc_name)
    export_task.add_volume(
        V1Volume(name=notebook_pvc_name, persistent_volume_claim=target_claim)
    )
    export_task.add_volume_mount(
        V1VolumeMount(name=notebook_pvc_name, mount_path="/target")
    )
    export_task.set_display_name("Copy Model to target PVC")
    export_task.after(training_task)

In [7]:
# Name under which the pipeline is uploaded; also used as the stem of the
# compiled "<name>.yaml" package file.
PIPELINE_NAME = "MNIST Classification Pipeline"

In [8]:
# Disable Caching
def disable_cache_transformer(op: dsl.ContainerOp):
    """Set the maximum cache staleness of ``op`` to ``"P0D"`` and return it.

    Container ops get the value through their execution options; any other
    kind of op receives it as a pod annotation instead.
    """
    if not isinstance(op, dsl.ContainerOp):
        op.add_pod_annotation(
            name="pipelines.kubeflow.org/max_cache_staleness", value="P0D"
        )
        return op

    op.execution_options.caching_strategy.max_cache_staleness = "P0D"
    return op


# Pipeline-wide configuration: apply the transformer above to every op.
pipeline_conf = kfp.dsl.PipelineConf()
pipeline_conf.add_op_transformer(disable_cache_transformer)

In [9]:
# Compile the pipeline function into "<PIPELINE_NAME>.yaml", applying the
# op transformers registered on pipeline_conf.
kfp.compiler.Compiler().compile(
    pipeline_func=mnist_pipeline,
    package_path=f"{PIPELINE_NAME}.yaml",
    pipeline_conf=pipeline_conf,
)

In [10]:
def delete_pipeline(pipeline_name: str):
    """Delete every listed pipeline whose name equals ``pipeline_name``.

    Only the first page of up to 999 pipelines is inspected.
    """
    kfp_client = kfp.Client()
    listed = kfp_client.list_pipelines(page_size=999).pipelines or []

    # Collect the ids first, then delete, so listing and deletion stay separate.
    doomed_ids = []
    for pipeline in listed:
        if pipeline.name == pipeline_name:
            doomed_ids.append(pipeline.id)

    for pipeline_id in doomed_ids:
        kfp_client.delete_pipeline(pipeline_id)

In [11]:
def get_experiment_id(experiment_name: str) -> str:
    """Look up the id of the named experiment, creating the experiment if absent.

    Only the first page of up to 999 experiments is searched; the first
    experiment with a matching name wins.
    """
    kfp_client = kfp.Client()
    listed = kfp_client.list_experiments(page_size=999).experiments or []

    for experiment in listed:
        if experiment.name == experiment_name:
            return experiment.id

    # No match: create the experiment and report the id of the new one.
    return kfp_client.create_experiment(experiment_name).id

In [12]:
# Pipeline names need to be unique, so before we upload,
# check for and delete any pipeline with the same name
delete_pipeline(PIPELINE_NAME)

# Upload the compiled package under PIPELINE_NAME. Both `client` and
# `uploaded_pipeline` are used again by the cell that starts the run.
client = kfp.Client()
uploaded_pipeline = client.upload_pipeline(f"{PIPELINE_NAME}.yaml", PIPELINE_NAME)

In [13]:
# Start a run of the uploaded pipeline in the "mnist" experiment
# (get_experiment_id creates the experiment when it does not exist yet).
run = client.run_pipeline(
    experiment_id=get_experiment_id("mnist"),
    job_name="mnist-classification-pipeline",
    pipeline_id=uploaded_pipeline.id,
)