In [4]:
# Notebook convenience only: loads the `lab_black` IPython extension
# (presumably auto-formats code cells with black - TODO confirm).
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [1]:
from kfp.components import create_component_from_func, InputPath
from typing import Dict, Union
# Container image passed as `base_image` when the deployment function is
# turned into a pipeline component further down in this notebook.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:elyra3.13.0-py3.8-tf2.9.2-pt1.12.1-v0"

In [2]:
def deploy_inference_service(
    name: str,
    storage_uri: str,
    minio_url: str,
    class_labels: InputPath(str),
    transformer_image: str = "quay.io/ntlawrence/monkeytransform:latest",
    rm_existing: bool = False,
    minio_credential_secret="mlpipeline-minio-artifact",
    min_replicas: int = None,
    max_replicas: int = None,
    pred_gpus: int = 0,
    concurrency_target: int = 1,
    triton_runtime_version: str = "22.03-py3",
    transformer_specification: Dict[str, Union[str, int]] = None,
):
    """Deploy a KServe InferenceService (transformer + Triton predictor) via kubectl.

    Steps, in order:

    1. Read ``accesskey`` / ``secretkey`` from the ``minio_credential_secret``
       Secret and copy them into a ``minio-credentials`` Secret annotated with
       the S3 endpoint settings.
    2. Apply the ``kserve-inference-sa`` ServiceAccount referencing that Secret.
    3. If ``rm_existing`` is set, delete any InferenceService called ``name``
       (failures of the delete are ignored).
    4. Apply the InferenceService manifest and wait up to 600s for it to
       report ``Ready``.

    All imports and helpers are kept inside the function body so the function
    stays self-contained when packaged as a pipeline component.
    NOTE(review): assumes ``kubectl`` and PyYAML are available in the runtime
    image and that kubectl is already authorised for the target namespace.

    Args:
        name: Name of the InferenceService; also used as the prefix of the
            transformer container name.
        storage_uri: Model location, passed to the predictor as ``storageUri``
            and to the transformer as the ``STORAGE_URI`` environment variable.
        minio_url: Value of the ``serving.kserve.io/s3-endpoint`` annotation.
        class_labels: Path to a text file whose content is handed to the
            transformer in the ``CLASS_LABELS`` environment variable
            (presumably JSON - confirm against the transformer image).
        transformer_image: Container image for the transformer.
        rm_existing: Delete an existing InferenceService of the same name first.
        minio_credential_secret: Name of the Secret holding the MinIO keys.
        min_replicas: Optional ``minReplicas`` for transformer and predictor.
        max_replicas: Optional ``maxReplicas`` for transformer and predictor.
        pred_gpus: Number of ``nvidia.com/gpu`` to put in the predictor's
            resource limits; no limits are set when this is 0.
        concurrency_target: Value of the ``autoscaling.knative.dev/target``
            annotation.
        triton_runtime_version: ``runtimeVersion`` of the Triton predictor.
        transformer_specification: Accepted for interface compatibility; not
            used by this function.

    Raises:
        subprocess.CalledProcessError: If any kubectl call other than the
            optional delete exits non-zero (this includes the readiness wait
            timing out).
        KeyError: If the source Secret lacks ``data.accesskey`` or
            ``data.secretkey``.
    """
    import subprocess
    import yaml

    # https://kserve.github.io/website/modelserving/storage/s3/s3/#predict-on-a-inferenceservice-with-a-saved-model-on-s3
    # https://kserve.github.io/website/reference/api/
    # https://kserve.github.io/website/modelserving/autoscaling/autoscaling/
    # https://kserve.github.io/website/modelserving/v1beta1/triton/torchscript/#inference-with-http-endpoint
    # https://github.com/kserve/kserve/blob/master/docs/samples/multimodelserving/triton/README.md
    # https://kserve.github.io/website/modelserving/v1beta1/triton/torchscript/#inference-with-grpc-endpoint

    def apply_manifest(manifest, redact_data=False):
        """Print a manifest (optionally masking its ``data`` values) and `kubectl apply` it."""
        shown = manifest
        if redact_data:
            # Secret payloads are only base64-encoded, so never echo them to the logs.
            shown = dict(
                manifest,
                data={key: "<redacted>" for key in manifest.get("data", {})},
            )
        print(yaml.safe_dump(shown, default_flow_style=False))
        subprocess.run(
            ["kubectl", "apply", "-f", "-"],
            input=yaml.safe_dump(manifest, default_flow_style=False),
            check=True,
            text=True,
        )

    # It happens that the credentials for the minio user name and password are
    # already in a secret. This just loads them so that we can create our own
    # secret to store the S3 connection information for the inference service.
    existing = subprocess.run(
        ["kubectl", "get", "secret", minio_credential_secret, "-oyaml"],
        stdout=subprocess.PIPE,
        check=True,
        text=True,
    )
    secret = yaml.safe_load(existing.stdout)

    # Manifests are built as plain dicts and serialised with yaml.safe_dump so
    # that every caller-supplied value is quoted/escaped correctly instead of
    # being pasted into a hand-indented YAML template.
    s3_credentials_spec = {
        "apiVersion": "v1",
        "kind": "Secret",
        "metadata": {
            "name": "minio-credentials",
            "annotations": {
                "serving.kserve.io/s3-endpoint": str(minio_url),
                "serving.kserve.io/s3-usehttps": "0",
                "serving.kserve.io/s3-region": "us-west1",
                "serving.kserve.io/s3-useanoncredential": "false",
            },
        },
        "type": "Opaque",
        # Values read from a Secret's `data` are already base64-encoded, which
        # is the encoding `data` expects, so they are copied through unchanged.
        "data": {
            "AWS_ACCESS_KEY_ID": secret["data"]["accesskey"],
            "AWS_SECRET_ACCESS_KEY": secret["data"]["secretkey"],
        },
    }
    apply_manifest(s3_credentials_spec, redact_data=True)

    sa_spec = {
        "apiVersion": "v1",
        "kind": "ServiceAccount",
        "metadata": {"name": "kserve-inference-sa"},
        "secrets": [{"name": "minio-credentials"}],
    }
    apply_manifest(sa_spec)

    if rm_existing:
        # Best effort: a missing service is not an error, hence check=False.
        subprocess.run(["kubectl", "delete", "inferenceservice", name], check=False)

    with open(class_labels, "r") as labels_file:
        labels_json = labels_file.read()
    # Keep exactly one trailing newline, which is the shape a YAML `|` block
    # scalar gives the value; multi-line content is now preserved intact.
    class_labels_value = labels_json.rstrip("\n") + "\n"

    # Both bounds use the same `is not None` test so that an explicit 0 is
    # forwarded rather than silently dropped.
    replica_bounds = {}
    if min_replicas is not None:
        replica_bounds["minReplicas"] = min_replicas
    if max_replicas is not None:
        replica_bounds["maxReplicas"] = max_replicas

    transformer = {
        **replica_bounds,
        "serviceAccountName": "kserve-inference-sa",
        "containers": [
            {
                "image": transformer_image,
                "name": f"{name}-transformer",
                "command": ["python", "transform.py"],
                "args": ["--protocol=grpc-v2"],
                "env": [
                    {"name": "STORAGE_URI", "value": storage_uri},
                    {"name": "CLASS_LABELS", "value": class_labels_value},
                ],
            }
        ],
    }

    triton = {
        "runtimeVersion": triton_runtime_version,
        "args": ["--strict-model-config=false"],
        "storageUri": storage_uri,
        "ports": [{"containerPort": 9000, "name": "h2c", "protocol": "TCP"}],
        "env": [{"name": "OMP_NUM_THREADS", "value": "1"}],
    }
    if pred_gpus:
        triton["resources"] = {"limits": {"nvidia.com/gpu": pred_gpus}}

    predictor = {
        **replica_bounds,
        "serviceAccountName": "kserve-inference-sa",
        "triton": triton,
    }

    service_spec = {
        "apiVersion": "serving.kserve.io/v1beta1",
        "kind": "InferenceService",
        "metadata": {
            "name": name,
            "annotations": {
                "sidecar.istio.io/inject": "false",
                # https://knative.dev/docs/serving/autoscaling/concurrency/#soft-limit
                # Annotation values must be strings, hence the explicit str().
                "autoscaling.knative.dev/target": str(concurrency_target),
                "autoscaling.knative.dev/metric": "concurrency",
            },
        },
        "spec": {"transformer": transformer, "predictor": predictor},
    }
    apply_manifest(service_spec)

    print("Waiting for inference service to become available")
    subprocess.run(
        [
            "kubectl",
            "wait",
            "--for=condition=Ready",
            f"inferenceservice/{name}",
            "--timeout=600s",
        ],
        check=True,
    )

In [3]:
# Wrap the deployment function as a pipeline component that runs in BASE_IMAGE;
# the component definition is also written to the YAML file named below.
deploy_inference_service_comp = create_component_from_func(
    deploy_inference_service,
    base_image=BASE_IMAGE,
    output_component_file="deploy_inference_service_component.yaml",
)