In [122]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


# Honey Bee Computer Vision Example

This example trains a YOLOv5 model to detect honey bees using v7.0 of https://github.com/ultralytics/yolov5. The purpose of the example is to explain how to train a model using Kubeflow, YOLOv5, and sample data.

The classes detected are:
* Bees (workers or foragers)
* Bees carrying pollen
* Drones
* Queens


For our purposes, we've included a sample of training data to use for this exercise. 
We also included initial weights calculated from a much larger version of this dataset with 500 epochs (This takes many hours to train).
This should allow us to experiment with the pipeline design without needing to wait hours for training to complete.

The dataset was sampled from here: https://universe.roboflow.com/matt-nudi/honey-bee-detection-model-zgjnb (creative commons license).


The assumption is that this notebook has a data volume mounted (you can set this up when you create the notebook). The PVC should have been created with an access mode of ReadWriteMany. 
This allows other pods to mount the volume.

The data set will be extracted to the data volume in these next few cells. The data will be loaded into the pipeline via a volume, rather than a download. This simulates a use case where the training data has been preloaded onto SCOUT, and is large enough that a download is expensive.

The volume name is defined in this next cell, as is the path to the extracted Roboflow data set for the bees. To keep things simple, the mount point is the same for both the notebook server, and also for the containers that mount the volume.

In [123]:
# Name of the PersistentVolumeClaim that holds the training data.
VOLUME_CLAIM_NAME = "yolov5-work"
# Directory where that volume is mounted in this notebook server.
MOUNT_POINT = "/home/jovyan/vol-1"
# Directory on the volume that the bee data set archive is extracted into.
BEE_DATA_SET_PATH = f"{MOUNT_POINT}/bee_dataset"

In [124]:
# Create the target directory and unpack the sample data set into it,
# dropping the archive's top-level directory (--strip-components 1).
!mkdir -p {BEE_DATA_SET_PATH}
!tar -xf data/dataset.tar.gz -C {BEE_DATA_SET_PATH} --strip-components 1
# Copy the bundled weights file onto the volume as well.
!cp data/model.pt {MOUNT_POINT}/pre_trained_initial_weights.pt

## Imports and constants

The base image is an image that has been built to include the libraries for yolov5. The docker file is included in the "Notebook Container Image Source" directory. You can build this from the command line on SCOUT, but not from within a Jupyter notebook.

In [125]:
import kfp
from kfp import dsl
from kfp.components import InputPath, OutputPath
from kubernetes.client.models import (
    V1Volume,
    V1VolumeMount,
    V1PersistentVolumeClaimVolumeSource,
)
import os
from typing import Optional

# Container image used for every Python-function component in this notebook.
BASE_IMAGE = "quay.io/ntlawrence/yolov5:pt1.12.1-yolo7.0-v1.1"
# Folder holding reusable component definitions. expanduser() is used rather
# than os.getenv("HOME") so that an unset HOME cannot produce the literal
# path "None/components".
COMPONENT_CATALOG_FOLDER = os.path.join(os.path.expanduser("~"), "components")

## Load Data component
The first component in the pipeline copies data from a source URL (a `file://` path on a mounted volume, or an `http(s)`/`ftp` download) to an output path.

Essentially this moves the data from the volume into the pipeline.

In [136]:
def load_from_url(
    source: str,
    dest: OutputPath(str),
):
    """Copy or download ``source`` to the output path ``dest``.

    Args:
        source: URL of the data to load. ``file://`` URLs are copied from the
            local file system (a directory is copied recursively);
            ``http``, ``https``, ``ftp`` and ``ftps`` URLs are passed to
            ``urlretrieve``. An empty string produces an empty output file,
            which downstream steps can treat as "nothing supplied".
        dest: Path the data is written to.

    Raises:
        ValueError: If ``source`` uses any other URL scheme.
    """
    import os
    import shutil
    from urllib.request import urlretrieve
    from urllib.parse import urlparse

    # Make sure the directory that will contain dest exists
    parent_dir = os.path.dirname(dest)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Option to use an empty file to indicate no weights
    if not source:
        with open(dest, "w"):
            pass
        # Nothing to fetch; without this return the empty URL would fall
        # through to the scheme check below and raise.
        return

    source_details = urlparse(source)

    if source_details.scheme == "file":
        if os.path.isdir(source_details.path):
            shutil.copytree(source_details.path, dest)
        else:
            shutil.copyfile(source_details.path, dest)
    elif source_details.scheme in ("http", "https", "ftp", "ftps"):
        # NOTE(review): confirm that urlretrieve can actually handle "ftps".
        urlretrieve(source, filename=dest)
    else:
        raise ValueError("source does not use a supported url scheme")


# Wrap load_from_url as a pipeline component that runs in BASE_IMAGE.
load_from_url_comp = kfp.components.create_component_from_func(
    load_from_url, base_image=BASE_IMAGE
)

## Train Model component

The training component has several steps to it:

* Read the data set description from the `data.yaml` on the mounted data volume (the step that rewrites its train, test, and validation paths is currently commented out in the code).
* Run the python train.py CLI to train the model
* Convert the trained model to ONNX

When the model is converted to ONNX, it is quantized from FP32 to int8. A subset of the training data is used in the quantization.

The training is initialized with the weights from yolov5s.pt.

Performance could be improved by using distributed training.

In [137]:
def train_model(
    onnx_model: OutputPath(str),
    pt_model: OutputPath(str),
    results: OutputPath(str),
    model_cfg: InputPath(str),
    initial_weights: InputPath(str),
    epochs: int,
):
    """Train YOLOv5 on the data set mounted at /dataset and export to ONNX.

    Runs ``train.py`` and then ``export.py`` from the /yolov5 directory and
    copies the resulting artefacts to the output paths.

    Args:
        onnx_model: Output path that receives the exported ``best.onnx``.
        pt_model: Output path that receives the trained ``best.pt``.
        results: Output path that receives the training ``results.csv``.
        model_cfg: Path to the model configuration passed as ``--cfg``.
        initial_weights: Path to the starting weights. An empty (zero byte)
            file means "no initial weights" and is passed on as an empty
            ``--weights`` value.
        epochs: Number of training epochs.

    Raises:
        subprocess.CalledProcessError: If training or export exits non-zero.
    """
    import os
    import shutil
    import subprocess

    yolo_dir = "/yolov5"
    data_yaml = "/dataset/data.yaml"
    # NOTE(review): assumes this is the first training run in the container,
    # so that train.py writes to runs/train/exp — confirm.
    run_dir = f"{yolo_dir}/runs/train/exp"
    best_pt = f"{run_dir}/weights/best.pt"

    # Option to pass an empty file to train from scratch
    weights = initial_weights if os.path.getsize(initial_weights) > 0 else ""

    # The arguments are passed as a list (no shell) so that an empty weights
    # value stays a real, empty argument. In a shell command string it would
    # disappear and "--weights" would be left without a value.
    subprocess.run(
        [
            "python",
            "train.py",
            "--img",
            "640",
            "--batch",
            "-1",
            "--noplots",
            "--epochs",
            str(epochs),
            "--cache",
            "ram",
            f"--cfg={model_cfg}",
            "--data",
            data_yaml,
            "--weights",
            weights,
            "--workers=0",
            "--device=0",
            "--optimizer=Adam",
        ],
        check=True,
        cwd=yolo_dir,
    )

    subprocess.run(
        [
            "python",
            "export.py",
            "--img",
            "640",
            "--include=onnx",
            "--int8",
            "--data",
            data_yaml,
            "--weights",
            best_pt,
            "--device=0",
        ],
        check=True,
        cwd=yolo_dir,
    )

    os.makedirs(os.path.dirname(pt_model), exist_ok=True)
    os.makedirs(os.path.dirname(onnx_model), exist_ok=True)
    os.makedirs(os.path.dirname(results), exist_ok=True)

    shutil.copyfile(best_pt, pt_model)
    shutil.copyfile(f"{run_dir}/weights/best.onnx", onnx_model)
    shutil.copyfile(f"{run_dir}/results.csv", results)


# Wrap train_model as a pipeline component that runs in BASE_IMAGE.
train_model_comp = kfp.components.create_component_from_func(
    train_model, base_image=BASE_IMAGE
)

In [138]:
def evaluate_model(
    model: InputPath(str),
    model_format: str = "onnx",  # onnx, pt, tf ....
    conf_thres: float = 0.001,
    iou_thres: float = 0.6,
    max_det: int = 300,
):
    """Evaluate a trained model with YOLOv5's ``val.py``.

    The data set is read from ``/dataset/data.yaml``; results are written to
    the process output by ``val.py``.

    Args:
        model: Path to the model file to evaluate.
        model_format: File extension that identifies the model type to
            ``val.py`` (for example ``onnx`` or ``pt``).
        conf_thres: Value passed as ``--conf-thres``.
        iou_thres: Value passed as ``--iou-thres``.
        max_det: Value passed as ``--max-det``.

    Raises:
        subprocess.CalledProcessError: If any of the commands exits non-zero.
    """
    import subprocess
    import os
    import torch

    print(f"The size of the model is {os.path.getsize(model)}")

    if model_format == "onnx" and not torch.cuda.is_available():
        # The base image is built with an onnxruntime for GPU.
        # This should work for both CPU and GPU, but val.py
        # does its own checking for the CPU-only onnxruntime.
        # Since that's not installed and not on PyPI for ppc64le,
        # the script won't work unless we change up the version.
        subprocess.run(
            "mamba install -c rocketce onnxruntime=1.13.1=hea80eff_cpu_py39_pb3.19_1 -y",
            check=True,
            shell=True,
        )

    # val.py uses the file name to determine the type of model.
    # model is an input path whose name is generated by kubeflow, so
    # link it under a name with the expected extension.
    named_model = f"/tmp/{os.path.basename(model)}.{model_format}"
    if os.path.lexists(named_model):
        # Replace a link left behind by an earlier call.
        os.remove(named_model)
    os.symlink(model, named_model)

    # Arguments are passed as a list (no shell) so that parameter values
    # cannot be reinterpreted by a shell.
    subprocess.run(
        [
            "python",
            "val.py",
            "--weights",
            named_model,
            "--data",
            "/dataset/data.yaml",
            "--img",
            "640",
            "--conf-thres",
            str(conf_thres),
            "--iou-thres",
            str(iou_thres),
            "--max-det",
            str(max_det),
            "--workers=0",
        ],
        check=True,
        cwd="/yolov5",
    )

    # Diagnostic listing of everything under /yolov5 after the run.
    subprocess.run("find . -print", cwd="/yolov5", shell=True, check=True)


evaluate_model_comp = kfp.components.create_component_from_func(
    evaluate_model, base_image=BASE_IMAGE
)

## Upload the ONNX model, using a previously defined component

In [139]:
# Component definition taken from the shared component catalog folder.
UPLOAD_MODEL_COMPONENT = (
    f"{COMPONENT_CATALOG_FOLDER}/model-building/upload-model/component.yaml"
)

# Load the component definition so it can be used as a pipeline step.
upload_model_comp = kfp.components.load_component_from_file(UPLOAD_MODEL_COMPONENT)

## Deploy Model Component

In [140]:
# Component definition stored next to this notebook (plain string; the
# original f-string had no placeholders).
DEPLOY_MODEL_COMPONENT = "./deploy_inference_service_component.yaml"
deploy_model_comp = kfp.components.load_component_from_file(DEPLOY_MODEL_COMPONENT)

## Pipeline Definition

In [150]:
@dsl.pipeline(name="bee-yolov5")
def bee_yolov5(
    model_config_url: str,
    initial_weights_url: str,
    data_vol_pvc_name: str,
    data_vol_subpath: str,
    blackboard: str = "mlpipeline-artefacts",
    epochs: int = 750,
    minio_url="minio-service.kubeflow:9000",
    model_version: int = 1,
):
    """Train, evaluate, upload and deploy the bee detection model.

    Args:
        model_config_url: URL of the model configuration; loaded and passed to
            training as ``model_cfg``.
        initial_weights_url: URL of the starting weights; loaded and passed to
            training as ``initial_weights``.
        data_vol_pvc_name: Name of the PersistentVolumeClaim mounted at
            ``/dataset`` in the training and evaluation tasks.
        data_vol_subpath: Sub path of that volume to mount.
        blackboard: Resource name of the volume created by the VolumeOp.
        epochs: Number of epochs passed to the training component.
        minio_url: Passed to the upload and deploy components.
        model_version: Passed to the upload component.
    """

    def mount_volume(task, pvc_name, mount_path, volume_subpath):
        # Attach the claim to the task as a volume named "vol" and mount the
        # requested sub path of it at mount_path.
        task.add_volume(
            V1Volume(
                name="vol",
                persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(pvc_name),
            )
        )

        task.add_volume_mount(
            V1VolumeMount(name="vol", mount_path=mount_path, sub_path=volume_subpath)
        )

    # NOTE(review): no task in this function mounts the volume created here;
    # the two load tasks are only ordered after its creation. Confirm whether
    # it is still needed.
    create_blackboard = dsl.VolumeOp(
        name="Create Artefacts Blackboard",
        resource_name=blackboard,
        modes=dsl.VOLUME_MODE_RWO,
        size="4Gi",
        set_owner_reference=True,
    )

    # Load config Tasks
    model_config_task = load_from_url_comp(model_config_url)
    model_config_task.after(create_blackboard)
    initial_weights_task = load_from_url_comp(initial_weights_url)
    initial_weights_task.after(create_blackboard)

    # Train Model, also converts to ONNX
    train_model_task = train_model_comp(
        model_cfg=model_config_task.outputs["dest"],
        initial_weights=initial_weights_task.outputs["dest"],
        epochs=epochs,
    )
    train_model_task.set_gpu_limit(1)
    train_model_task.set_memory_limit("30G")
    mount_volume(train_model_task, data_vol_pvc_name, "/dataset", data_vol_subpath)

    # Evaluate
    # The ONNX evaluation relies on the component's default model_format.
    evaluate_onnx_model_task = evaluate_model_comp(
        model=train_model_task.outputs["onnx_model"],
    )
    evaluate_onnx_model_task.set_display_name("Evaluate ONNX")
    mount_volume(
        evaluate_onnx_model_task, data_vol_pvc_name, "/dataset", data_vol_subpath
    )

    evaluate_pt_model_task = evaluate_model_comp(
        model=train_model_task.outputs["pt_model"],
        model_format="pt",
    )
    evaluate_pt_model_task.set_display_name("Evaluate with best weights")
    mount_volume(
        evaluate_pt_model_task, data_vol_pvc_name, "/dataset", data_vol_subpath
    )

    # Upload ONNX model
    upload_model_task = upload_model_comp(
        train_model_task.outputs["onnx_model"],
        minio_url=minio_url,
        export_bucket="{{workflow.namespace}}-bee",
        model_format="onnx",
        model_name="bee",
        model_version=model_version,
    )

    # Deploy Inference Service
    # NOTE(review): storage_uri presumably matches the layout written by the
    # upload component — verify against that component's definition.
    deploy_model_task = deploy_model_comp(
        name="bee",
        rm_existing=True,
        storage_uri="s3://{{workflow.namespace}}-bee/onnx",
        minio_url=minio_url,
        predictor_protocol="v2",
    )
    # The deploy step takes no output of the upload step, so the ordering is
    # declared explicitly.
    deploy_model_task.after(upload_model_task)

In [151]:
# Display name of the pipeline; also used as the compiled package file name.
PIPELINE_NAME = "Bee detector pipeline"

# Compile the pipeline function into a YAML package in the working directory.
kfp.compiler.Compiler().compile(
    pipeline_func=bee_yolov5,
    package_path=f"{PIPELINE_NAME}.yaml",
)

In [152]:
def delete_pipeline(pipeline_name: str):
    """Remove every uploaded pipeline whose name equals ``pipeline_name``."""

    kfp_client = kfp.Client()
    listed = kfp_client.list_pipelines(page_size=999).pipelines

    # The listing can be empty (or None) when nothing has been uploaded yet.
    doomed_ids = []
    for candidate in listed or []:
        if candidate.name == pipeline_name:
            doomed_ids.append(candidate.id)

    for pipeline_id in doomed_ids:
        kfp_client.delete_pipeline(pipeline_id)


def get_experiment_id(experiment_name: str) -> str:
    """Look up the id of the named experiment, creating it when absent."""
    kfp_client = kfp.Client()
    listed = kfp_client.list_experiments(page_size=999).experiments

    # The first experiment with a matching name wins.
    for candidate in listed or []:
        if candidate.name == experiment_name:
            return candidate.id

    # No match (or no experiments at all): create one and report its id.
    return kfp_client.create_experiment(experiment_name).id

In [153]:
# Pipeline names need to be unique, so before we upload,
# check for and delete any pipeline with the same name
# Pipeline names must be unique, so remove any previously uploaded pipeline
# with this name before uploading the freshly compiled package.
delete_pipeline(PIPELINE_NAME)

# upload
# `client` is reused by the cells below to start and monitor the run.
client = kfp.Client()
uploaded_pipeline = client.upload_pipeline(f"{PIPELINE_NAME}.yaml", PIPELINE_NAME)

In [155]:
# Arguments for the bee_yolov5 pipeline parameters. The data volume settings
# point at the claim and sub directory populated earlier in this notebook.
pipeline_params = {
    "data_vol_pvc_name": VOLUME_CLAIM_NAME,
    "data_vol_subpath": "bee_dataset",
    "model_config_url": "https://github.com/ultralytics/yolov5/raw/v7.0/models/yolov5s.yaml",
    "initial_weights_url": "https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt",
    "blackboard": "mlpipeline-artefacts",
    # A single epoch keeps this demonstration run short.
    "epochs": "1",
    "model_version": "1",
}

# Start a run of the uploaded pipeline in the "bee-exp" experiment.
run = client.run_pipeline(
    experiment_id=get_experiment_id("bee-exp"),
    job_name="bees",
    pipeline_id=uploaded_pipeline.id,
    params=pipeline_params,
)

In [121]:
# Maximum time to wait for the run, in seconds.
TWENTY_MIN = 20 * 60
# Block until the run finishes or the timeout elapses.
result = client.wait_for_run_completion(run.id, timeout=TWENTY_MIN)
# Summary of the finished run, displayed as the cell output.
{
    "status": result.run.status,
    "error": result.run.error,
    "time": str(result.run.finished_at - result.run.created_at),
    "metrics": result.run.metrics,
}

{'status': 'Failed', 'error': None, 'time': '0:03:49', 'metrics': None}

In [None]:
# Test image from the extracted data set. The path is derived from
# BEE_DATA_SET_PATH so that it follows the directory the archive was
# unpacked into earlier in this notebook.
IMAGE = f"{BEE_DATA_SET_PATH}/test/images/DLQueenIMG_8012-680x538_jpg.rf.aa539ec13ba2b9c5bf7b4de6107f23cd.jpg"

In [None]:
!python /home/jovyan/vol-1/yolov5/detect.py --weights=http://bee.kubeflow-ntl.svc.cluster.local/v2/models/bee/infer --data=/home/jovyan/vol-1/bee_data/data.yaml --source=$IMAGE --conf-thres=.7 --iou-thres=.2 --max-det=500

In [None]:
from matplotlib import pyplot as plt
import cv2

# Annotated copy of the test image written by detect.py.
# NOTE(review): "exp21" is the output directory of one particular detect.py
# run; update it to match the run you want to display.
DETECTION_IMAGE = "/home/jovyan/vol-1/yolov5/runs/detect/exp21/DLQueenIMG_8012-680x538_jpg.rf.aa539ec13ba2b9c5bf7b4de6107f23cd.jpg"

img = cv2.imread(DETECTION_IMAGE)
# cv2.imread returns None rather than raising when the file cannot be read;
# fail here with a clear message instead of inside cvtColor.
if img is None:
    raise FileNotFoundError(f"Could not read detection output: {DETECTION_IMAGE}")

fig, ax = plt.subplots()
# OpenCV loads images in BGR channel order; convert to RGB for matplotlib.
ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))