In [1]:
%load_ext lab_black

# Optimizing Inference for Image Classification

This notebook shows how to deploy an inference service on a GPU with auto scaling.
The pipeline trains and deploys a model that classifies images of monkeys into one of 10 species.

**References**

This notebook is inspired by the prior work of Sebastian Lehrig and Marvin Giessing. 
https://github.com/lehrig/kubeflow-ppc64le-examples/blob/main/computer-vision/monkey-classification/Monkey%20Classification.ipynb

Comments and questions should be addressed to Nick Lawrence (ntl@us.ibm.com)

**License**

Apache-2.0 License

In [2]:
import json
import kfp
from kfp.components import InputPath, OutputPath
import kfp.dsl as dsl
from kfp.dsl import PipelineConf, data_passing_methods
from kubernetes.client.models import V1Volume, V1PersistentVolumeClaimVolumeSource
import numpy as np
import os
from typing import List, NamedTuple

In [3]:
# Folder under $HOME from which the reusable component definitions are loaded.
# NOTE: os.getenv returns None when HOME is unset, which would silently produce
# the path "None/components".
COMPONENT_CATALOG_FOLDER = f"{os.getenv('HOME')}/components"
# Alternative components folder under $HOME; not referenced by the cells shown here.
UPDATED_COMPONENT_CATALOG_FOLDER = (
    f"{os.getenv('HOME')}/kf-inference-example/components"
)
# COMPONENT_CATALOG_GIT = "https://github.com/lehrig/kubeflow-ppc64le-components.git"
# COMPONENT_CATALOG_RELEASE = "main"

# Path to the KServe deployment component definition inside the catalog folder;
# not referenced by the cells shown here.
DEPLOY_MODEL_WITH_KSERVE_COMPONENT = f"{COMPONENT_CATALOG_FOLDER}/model-deployment/deploy-model-with-kserve/component.yaml"

# Container image passed as `base_image` to every function-based component below.
BASE_IMAGE = (
    "quay.io/ibm/kubeflow-notebook-image-ppc64le:elyra3.14.1-py3.9-tf2.9.2-pt1.12.1-v0"
)

## Import directories of reusable Kubeflow components

In [4]:
# if not os.path.exists(COMPONENT_CATALOG_FOLDER):
#    !git clone --branch $COMPONENT_CATALOG_RELEASE $COMPONENT_CATALOG_GIT $COMPONENT_CATALOG_FOLDER

## Load and prepare dataset component

This component will load the data set from Hugging Face, preprocess it, and save it as TensorFlow datasets for training.

The preprocessing step converts the images to tensors and expands the class labels to one-hot vectors. The training images are all of the same size and can be batched without needing to resize here. Resizing and rescaling are instead done by layers of the model itself.

We want the model to include resize and rescale layers for two reasons:

* Inferencing can reuse these layers later without needing additional code
* The model training component will run on the GPU, and the performance of these layers will be enhanced.

This component also outputs the class labels in a file for later use. These are not needed for training, but the inference service can use them to present a more readable response to the client.

This component also splits the data into train/validation/test data sets.

In [5]:
def load_test_train_dataset_for_tf(
    train_dataset_dir: OutputPath(str),
    validation_dataset_dir: OutputPath(str),
    test_dataset_dir: OutputPath(str),
    class_names: OutputPath(str),
    dataset_url: str = "Lehrig/Monkey-Species-Collection",
    dataset_configuration: str = "downsized",
):
    """Load the image dataset and save train/validation/test splits as TensorFlow datasets.

    The images are converted to arrays and the integer class labels are expanded
    to one-hot vectors. The dataset's "test" split is divided in half: the first
    half becomes the validation set and the second half the test set.

    Args:
        train_dataset_dir: Output directory for the saved training dataset.
        validation_dataset_dir: Output directory for the saved validation dataset.
        test_dataset_dir: Output directory for the saved test dataset.
        class_names: Output file that receives the class names as a JSON list.
        dataset_url: Dataset identifier passed to `datasets.load_dataset`.
        dataset_configuration: Dataset configuration name passed to
            `datasets.load_dataset`.
    """
    # Imports live inside the function because it is packaged as a
    # stand-alone Kubeflow component.
    import json
    import os

    import datasets
    import numpy as np
    import tensorflow as tf

    dataset = datasets.load_dataset(dataset_url, dataset_configuration)

    class_labels = dataset["train"].features["label"].names
    # Row i of the identity matrix is the one-hot encoding of class i.
    one_hot_matrix = np.identity(len(class_labels))

    def process(examples):
        """Convert a batch of images to arrays and labels to one-hot vectors."""
        examples["image"] = [np.array(img) for img in examples["image"]]
        examples["label"] = [one_hot_matrix[label] for label in examples["label"]]
        return examples

    def save_as_tf_dataset(split, directory):
        """Persist one split as a batched (image, label) TensorFlow dataset."""
        os.makedirs(directory, exist_ok=True)
        tf.data.experimental.save(
            split.to_tf_dataset(
                batch_size=16, columns=["image"], label_cols=["label"]
            ),
            directory,
        )

    dataset = dataset.shuffle(seed=42)

    # The input dataset is known to have images of shape (224, 224, 3)
    # The images are all of the same size, and so they can be batched
    # together. The model will have a resizing layer so that inference will
    # resize the input images if necessary, but there is an assumption
    # that images in the batch are all the same size for both
    # training and inferencing. We assume that is true with the
    # hugging face data and don't resize here.
    dataset = dataset.map(
        process,
        batched=True,
        batch_size=16,
        features=datasets.Features(
            {
                "image": datasets.Array3D(dtype="uint8", shape=(224, 224, 3)),
                "label": datasets.Sequence(
                    feature=datasets.Value(dtype="int32"), length=len(class_labels)
                ),
            }
        ),
        num_proc=8,
    )

    # The data was already shuffled above, so split the "test" portion in order.
    dev_test_dataset = dataset["test"].train_test_split(test_size=0.5, shuffle=False)

    save_as_tf_dataset(dataset["train"], train_dataset_dir)
    save_as_tf_dataset(dev_test_dataset["train"], validation_dataset_dir)
    save_as_tf_dataset(dev_test_dataset["test"], test_dataset_dir)

    os.makedirs(os.path.dirname(class_names), exist_ok=True)
    with open(class_names, "w") as f:
        json.dump(class_labels, f)

In [6]:
# Wrap the data-loading function as a Kubeflow component that runs in BASE_IMAGE.
load_data_comp = kfp.components.create_component_from_func(
    func=load_test_train_dataset_for_tf, base_image=BASE_IMAGE
)

## Model training component

Several important optimizations are included here.

* Resizing and Rescaling are done here to take advantage of the GPU.
* Image augmentation layers are included here for accuracy. These also benefit from the GPU. 
* The dataset is cached and prefetched, so that the GPU does not have to wait for data during training.

In [7]:
def train_model(
    train_dataset_dir: InputPath(str),
    validation_dataset_dir: InputPath(str),
    model_dir: OutputPath(str),
    epochs: int = 100,
    batch_size: int = 32,
):
    """Train an image classifier with transfer learning and save it to `model_dir`.

    A frozen InceptionV3 backbone (ImageNet weights) is preceded by resizing,
    rescaling and random augmentation layers and followed by a small dense
    head ending in a 10-way softmax.

    Args:
        train_dataset_dir: Directory holding the saved training dataset.
        validation_dataset_dir: Directory holding the saved validation dataset.
        model_dir: Output directory the trained model is saved to.
        epochs: Maximum number of training epochs; early stopping may end
            training sooner.
        batch_size: Currently unused. The datasets are loaded as saved and are
            not re-batched here; the parameter is kept so the component
            interface stays unchanged.
    """
    import os
    import tensorflow as tf
    from tensorflow.keras import Sequential
    from tensorflow.keras.applications import InceptionV3
    from tensorflow.keras.layers import (
        BatchNormalization,
        Dense,
        Dropout,
        GlobalAveragePooling2D,
        Resizing,
        Rescaling,
        RandomContrast,
        RandomBrightness,
        RandomRotation,
        Input,
    )

    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

    def build_model():
        """Assemble preprocessing, augmentation, frozen backbone and dense head."""
        backbone = InceptionV3(include_top=False, weights="imagenet")

        # Transfer learning: only the layers added after the backbone are trained.
        for layer in backbone.layers:
            layer.trainable = False

        model = Sequential()
        # Accept uint8 images of any height/width; the next layer resizes them.
        model.add(Input(shape=(None, None, 3), name="image", dtype=tf.uint8))
        model.add(Resizing(height=224, width=224, interpolation="nearest"))
        model.add(Rescaling(scale=1.0 / 255))
        model.add(RandomContrast(factor=0.2, seed=42))
        model.add(RandomBrightness(factor=0.2, value_range=(0.0, 1.0), seed=42))
        model.add(RandomRotation(factor=0.2, seed=42))
        model.add(backbone)
        model.add(GlobalAveragePooling2D())
        model.add(Dense(128, activation="relu"))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))
        model.add(Dense(64, activation="relu"))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))
        # The output width is fixed at 10 classes.
        model.add(Dense(10, activation="softmax", name="scores"))

        return model

    callbacks = [
        EarlyStopping(monitor="val_loss", patience=20, verbose=0, mode="min"),
        ReduceLROnPlateau(
            monitor="val_loss",
            factor=0.1,
            patience=7,
            verbose=1,
            min_delta=0.0001,
            mode="min",
        ),
    ]

    # Datasets were saved in batches so no need to batch again
    train_dataset = (
        tf.data.experimental.load(train_dataset_dir).cache().prefetch(tf.data.AUTOTUNE)
    )

    validation_dataset = (
        tf.data.experimental.load(validation_dataset_dir)
        .cache()
        .prefetch(tf.data.AUTOTUNE)
    )

    model = build_model()

    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["categorical_accuracy"],
    )

    hist = model.fit(
        train_dataset,
        validation_data=validation_dataset,
        epochs=epochs,
        callbacks=callbacks,
    )

    print("Model train history:")
    print(hist.history)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(model_dir, exist_ok=True)
    model.save(model_dir)
    print(f"Model saved to: {model_dir}")

In [8]:
# Wrap the training function as a Kubeflow component that runs in BASE_IMAGE.
train_model_comp = kfp.components.create_component_from_func(
    func=train_model, base_image=BASE_IMAGE
)

## Evaluate model component
The model is evaluated against the test dataset using a tensorflow evaluate call.

The results are output from the component.

In [9]:
# Pipeline metrics have problems in Kubeflow 1.6.0
# so using ui metadata here instead
# https://github.com/kubeflow/pipelines/issues/8356
def evaluate_model(
    test_dataset_dir: InputPath(str),
    mlpipeline_ui_metadata_path: OutputPath(str),
    model_dir: InputPath(str),
    batch_size: int = 20,
):
    """Evaluate the saved model on the test dataset and report loss and accuracy.

    The two values are printed and also written as an inline CSV table to the
    UI metadata file.

    Args:
        test_dataset_dir: Directory holding the saved test dataset.
        mlpipeline_ui_metadata_path: Output file that receives the UI metadata JSON.
        model_dir: Directory holding the saved Keras model.
        batch_size: Currently unused. The test dataset is evaluated as loaded;
            the parameter is kept so the component interface stays unchanged.
    """
    import json
    import tensorflow as tf

    test_dataset = tf.data.experimental.load(test_dataset_dir)
    model = tf.keras.models.load_model(model_dir)
    # Unpacks exactly two values; presumably the loss followed by the single
    # metric the model was compiled with (confirm against the training step).
    (loss, accuracy) = model.evaluate(test_dataset)

    print((loss, accuracy))

    metadata = {
        "outputs": [
            {
                "type": "table",
                "storage": "inline",
                "format": "csv",
                "header": ["Loss", "Accuracy"],
                "source": f"{loss},{accuracy}",
            }
        ]
    }

    with open(mlpipeline_ui_metadata_path, "w") as f:
        json.dump(metadata, f)

In [10]:
# Wrap the evaluation function as a Kubeflow component that runs in BASE_IMAGE.
evaluate_model_comp = kfp.components.create_component_from_func(
    func=evaluate_model, base_image=BASE_IMAGE
)

## Convert the model to ONNX component
Once the model is in ONNX format, we can use common tools without a dependency on TensorFlow technologies.

In [11]:
# Path to the ONNX conversion component definition inside the catalog folder.
CONVERT_MODEL_TO_ONNX_COMPONENT = (
    f"{COMPONENT_CATALOG_FOLDER}/model-building/convert-to-onnx/component.yaml"
)

# Load the component from its YAML definition; the file must exist at this path.
convert_model_to_onnx_comp = kfp.components.load_component_from_file(
    CONVERT_MODEL_TO_ONNX_COMPONENT
)

## Upload model to MinIO component

S3 buckets are the most common way to store ML models. We use MinIO, which is installed on the cluster.

In [12]:
# Path to the model upload component definition inside the catalog folder.
UPLOAD_MODEL_COMPONENT = (
    f"{COMPONENT_CATALOG_FOLDER}/model-building/upload-model/component.yaml"
)

# Load the component from its YAML definition; the file must exist at this path.
upload_model_comp = kfp.components.load_component_from_file(UPLOAD_MODEL_COMPONENT)

## Create Configmap with class names

The config map is used to set environment variables for the transformer

In [13]:
def create_config_map(name: str, namespace: str, class_names: InputPath(str)):
    """Create a ConfigMap holding the class labels, replacing any existing one.

    The content of the `class_names` file is stored unchanged under the
    `CLASS_LABELS` key. An existing ConfigMap with the same name is deleted
    first (best effort) and a new one is then created.

    Args:
        name: Name of the ConfigMap.
        namespace: Namespace in which the ConfigMap is created.
        class_names: Path of the file whose content becomes `CLASS_LABELS`.
    """
    from kubernetes import client, config
    from kubernetes.client.rest import ApiException

    config.load_incluster_config()

    with open(class_names, "r") as f:
        class_names_json = f.read()

    metadata = client.V1ObjectMeta(name=name, namespace=namespace)
    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ConfigMap.md
    configmap = client.V1ConfigMap(
        api_version="v1",
        kind="ConfigMap",
        metadata=metadata,
        data={"CLASS_LABELS": class_names_json},
    )

    api_instance = client.CoreV1Api(client.ApiClient())
    try:
        api_instance.delete_namespaced_config_map(name=name, namespace=namespace)
    except Exception as ex:
        # A 404 simply means there was nothing to delete. Anything else is
        # reported instead of being hidden; the create call below still runs
        # and will raise if the problem is real.
        not_found = isinstance(ex, ApiException) and ex.status == 404
        if not not_found:
            print(
                f"Warning: could not delete existing ConfigMap {namespace}/{name}: {ex}"
            )

    api_instance.create_namespaced_config_map(
        namespace=namespace,
        body=configmap,
    )


# Wrap the ConfigMap function as a Kubeflow component that runs in BASE_IMAGE.
create_config_map_comp = kfp.components.create_component_from_func(
    func=create_config_map, base_image=BASE_IMAGE
)

## Component to deploy the inference service

This component builds the K8S resource templates for our inference service. It has been customized to include a transformer and autoscaling options.

In [55]:
# Path to the Triton inference service deployment component definition under $HOME.
# NOTE: os.getenv returns None when HOME is unset, which would silently produce
# a path starting with "None/".
DEPLOY_INFERENCE_SERVICE_COMPONENT = f"{os.getenv('HOME')}/kubeflow-ppc64le-examples/deploy_triton_inference_service_component/deploy_triton_inference_service_component.yaml"

# Load the component from its YAML definition; the file must exist at this path.
deploy_inference_service_comp = kfp.components.load_component_from_file(
    DEPLOY_INFERENCE_SERVICE_COMPONENT
)

## Define the Kubeflow Pipeline

This pipeline uses MinIO for parameter passing. This allows it to cache the results of training the model, which makes testing the deployment faster when the model has not changed since the last run.

In [64]:
@dsl.pipeline(
    name="End-to-end monkey species classification pipeline",
    description="An example pipeline that performs an image classification and determines different monkey species",
)
def monkey_pipeline(
    model_name: str = "monkey-classification",
    model_version: int = 1,
    minio_url: str = "minio-service.kubeflow:9000",
):
    """Load data, train, evaluate, convert to ONNX, upload and deploy the model.

    Args:
        model_name: Name used for the uploaded model, the inference service and
            the prefix of the transformer's ConfigMap.
        model_version: Version passed to the model upload step.
        minio_url: MinIO endpoint used by both the upload and the deployment steps.
    """
    load_dataset_task = load_data_comp()

    train_model_task = train_model_comp(
        train_dataset_dir=load_dataset_task.outputs["train_dataset_dir"],
        validation_dataset_dir=load_dataset_task.outputs["validation_dataset_dir"],
    )
    train_model_task.set_gpu_limit(1)

    evaluate_model_task = evaluate_model_comp(
        test_dataset_dir=load_dataset_task.outputs["test_dataset_dir"],
        model_dir=train_model_task.outputs["model_dir"],
    )

    convert_model_to_onnx_task = convert_model_to_onnx_comp(
        model_dir=train_model_task.outputs["model_dir"]
    )

    upload_model_task = upload_model_comp(
        convert_model_to_onnx_task.outputs["onnx_model_dir"],
        # Use the pipeline parameter so upload and deployment always target
        # the same MinIO endpoint.
        minio_url=minio_url,
        export_bucket="{{workflow.namespace}}-models",
        model_format="onnx",
        model_name=model_name,
        model_version=model_version,
    )

    create_config_map_task = create_config_map_comp(
        name=f"{model_name}-transformer-env",
        namespace="{{workflow.namespace}}",
        class_names=load_dataset_task.outputs["class_names"],
    )

    # The transformer reads its environment from the ConfigMap created above.
    transformer_specification = {
        "image": "quay.io/ntlawrence/monkeytransform:v4.0",
        "env_configmap": f"{model_name}-transformer-env",
        "minReplicas": 1,
        "maxReplicas": 4,
    }

    deploy_model_task = deploy_inference_service_comp(
        name=model_name,
        rm_existing=True,
        storage_uri="s3://{{workflow.namespace}}-models/onnx/",
        minio_url=minio_url,
        concurrency_target=4,  # soft limit, may be exceeded for short periods of time
        predictor_min_replicas=0,  # min_replicas supports scale to 0
        predictor_max_replicas=4,  # We don't want to scale till we consume all the available GPUs
        predictor_protocol="grpc-v2",
        # Uncomment these next two lines to use the GPU runtime and allocate GPUs
        #triton_runtime_version="21.08-py3-gpu",
        #predictor_gpu_allocation=1,  # gpu per replica
        transformer_specification=transformer_specification,
    )
    # For testing, we might need to force just the deployment to run, even
    # if it would otherwise be cached. This next line will force
    # that to happen.
    deploy_model_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    # Deployment has no data dependency on these tasks, so order it explicitly.
    deploy_model_task.after(upload_model_task)
    deploy_model_task.after(create_config_map_task)

In [65]:
# Name used both for the compiled package file and for the uploaded pipeline.
PIPELINE_NAME = "Monkey Classification Pipeline"

In [66]:
# Compile the pipeline function into a package file named after the pipeline.
kfp.compiler.Compiler().compile(monkey_pipeline, f"{PIPELINE_NAME}.yaml")

## Helper Functions
These two functions make it a little easier to manage pipelines and experiments that have already been created in Kubeflow.

In [67]:
def delete_pipeline(pipeline_name: str):
    """Delete every uploaded pipeline whose name equals `pipeline_name`.

    Only the first page of up to 999 pipelines is inspected.
    """
    client = kfp.Client()
    # `.pipelines` may be empty or None when nothing has been uploaded yet.
    pipelines = client.list_pipelines(page_size=999).pipelines or []
    for pipeline in pipelines:
        if pipeline.name == pipeline_name:
            client.delete_pipeline(pipeline.id)

In [68]:
def get_experiment_id(experiment_name: str) -> str:
    """Return the id of the named experiment, creating the experiment if it is missing.

    Only the first page of up to 999 experiments is searched; the first
    experiment with a matching name wins.
    """
    client = kfp.Client()
    # `.experiments` may be empty or None when no experiment exists yet.
    experiments = client.list_experiments(page_size=999).experiments or []
    for experiment in experiments:
        if experiment.name == experiment_name:
            return experiment.id

    return client.create_experiment(experiment_name).id

## Upload the Pipeline
This will make the pipeline we just compiled appear in the Pipelines panel.

In [69]:
# Pipeline names must be unique, so remove any earlier upload
# carrying this name before uploading again.
delete_pipeline(PIPELINE_NAME)

# Upload the compiled package under that name.
client = kfp.Client()
uploaded_pipeline = client.upload_pipeline(
    pipeline_package_path=f"{PIPELINE_NAME}.yaml", pipeline_name=PIPELINE_NAME
)

## Run the pipeline
Runs the pipeline that was just uploaded. Clicking on the returned link will show the progress of the run.

In [70]:
# Start a run of the uploaded pipeline inside the named experiment;
# get_experiment_id creates the experiment when it does not exist yet.
run = client.run_pipeline(
    experiment_id=get_experiment_id("monkey-classification-exp"),
    job_name="monkey-classification-pipeline",
    pipeline_id=uploaded_pipeline.id,
)

## Wait for completion

Wait until the pipeline finishes, for up to 20 minutes.

In [71]:
# The timeout is expressed in seconds.
TWENTY_MIN = 20 * 60
result = client.wait_for_run_completion(run.id, timeout=TWENTY_MIN)
# Summary displayed as the cell output; "time" is finished_at minus created_at.
{
    "status": result.run.status,
    "error": result.run.error,
    "time": str(result.run.finished_at - result.run.created_at),
    "metrics": result.run.metrics,
}

{'status': 'Succeeded', 'error': None, 'time': '0:02:59', 'metrics': None}

## Next steps

The inference service is now deployed!

You can move on to the Inference notebook to learn how to use it.