In [3]:
%load_ext lab_black

## Defines a component to deploy an inference service using NVIDIA Triton

The component is intended to be reusable across multiple pipelines. Our NVIDIA Triton builds support ONNX models.

The component builds the InferenceService YAML, applies it to the cluster with `kubectl`, and then waits for the inference service to become ready.

In [4]:
from kfp.components import create_component_from_func, InputPath
from typing import Dict, Union

# Container image used as the base image of the generated component.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:elyra3.13.0-py3.8-tf2.9.2-pt1.12.1-v0"

In [11]:
def deploy_inference_service(
    name: str,
    storage_uri: str,
    minio_url: str,
    rm_existing: bool = False,
    minio_credential_secret="mlpipeline-minio-artifact",
    concurrency_target: int = None,
    predictor_min_replicas: int = None,
    predictor_max_replicas: int = None,
    predictor_gpu_allocation: int = 0,
    predictor_protocol: str = "v2",  # or grpc-v2
    triton_runtime_version: str = "22.03-py3",
    predictor_node_selector: str = "",  # Requires admin to enable the capability
    transformer_specification: Dict[str, Union[str, int]] = None,
):
    """Deploy a Triton-backed KServe InferenceService and wait until it is Ready.

    The function shells out to ``kubectl`` to (1) read the MinIO credentials from
    an existing secret, (2) create/update the ``minio-credentials`` secret and the
    ``kserve-inference-sa`` service account, (3) optionally delete an existing
    InferenceService with the same name, (4) apply the InferenceService manifest,
    and (5) block on ``kubectl wait`` for up to 600 seconds.

    Args:
        name: Name of the InferenceService; also the prefix of the transformer
            container name (``<name>-transformer``).
        storage_uri: Written to the predictor's ``storageUri`` and to the
            transformer's ``STORAGE_URI`` environment variable.
        minio_url: Value of the ``serving.kserve.io/s3-endpoint`` annotation on
            the ``minio-credentials`` secret.
        rm_existing: If true, delete any InferenceService called ``name`` before
            applying the new one. A failing delete is ignored.
        minio_credential_secret: Name of the existing secret whose
            ``data.accesskey`` / ``data.secretkey`` values are copied.
        concurrency_target: If truthy, sets the Knative autoscaling target
            annotation with the ``concurrency`` metric.
        predictor_min_replicas: Optional ``minReplicas`` for the predictor.
        predictor_max_replicas: Optional ``maxReplicas`` for the predictor.
        predictor_gpu_allocation: ``nvidia.com/gpu`` limit for the predictor;
            omitted when 0.
        predictor_protocol: Passed to the transformer as ``--protocol=...``;
            ``"grpc-v2"`` additionally exposes container port 9000 (``h2c``).
        triton_runtime_version: Value of the Triton ``runtimeVersion`` field.
        predictor_node_selector: Optional ``label: "value"`` string placed under
            the predictor's ``nodeSelector``.
        transformer_specification: Optional dict describing a transformer.
            ``image`` is required; ``command``, ``min_replicas``,
            ``max_replicas``, ``env_configmap`` and ``node_selector`` are
            optional.

    Raises:
        subprocess.CalledProcessError: If any ``kubectl`` call other than the
            optional delete exits with a non-zero status (including the wait
            timing out).
        KeyError: If the credential secret lacks ``accesskey``/``secretkey`` or
            ``transformer_specification`` is given without ``image``.
    """
    # Imports are kept local so the function stays self-contained when it is
    # packaged with create_component_from_func.
    import subprocess
    import yaml

    # https://kserve.github.io/website/modelserving/storage/s3/s3/#predict-on-a-inferenceservice-with-a-saved-model-on-s3
    # https://kserve.github.io/website/reference/api/
    # https://kserve.github.io/website/modelserving/autoscaling/autoscaling/
    # https://kserve.github.io/website/modelserving/v1beta1/triton/torchscript/#inference-with-http-endpoint
    # https://github.com/kserve/kserve/blob/master/docs/samples/multimodelserving/triton/README.md
    # https://kserve.github.io/website/modelserving/v1beta1/triton/torchscript/#inference-with-grpc-endpoint

    # Caution: If using nodeSelector, the nodeSelector capability must be enabled for knative by an admin
    # https://knative.dev/docs/serving/configuration/feature-flags/#kubernetes-node-selector
    # The default for our installs is FALSE
    # once enabled, the value should be 'label: "value"', to force the predictor/transformer to
    # run on specific hardware

    # It happens that the credentials for the minio user name and password are already in a secret
    # This just loads them so that we can create our own secret to store the S3 connection information
    # for the Inference service
    r = subprocess.run(
        ["kubectl", "get", "secret", minio_credential_secret, "-oyaml"],
        stdout=subprocess.PIPE,
        check=True,
        text=True,
    )
    secret = yaml.safe_load(r.stdout)

    # Plain template (not an f-string) so the same text can be rendered twice:
    # once with the real values for kubectl, once redacted for the log.
    s3_credentials_template = """
    apiVersion: v1
    kind: Secret
    metadata:
      name: minio-credentials
      annotations:
        serving.kserve.io/s3-endpoint: {minio_url}
        serving.kserve.io/s3-usehttps: "0"
        serving.kserve.io/s3-region: "us-west1"
        serving.kserve.io/s3-useanoncredential: "false"
    type: Opaque
    data:
      AWS_ACCESS_KEY_ID: {access_key}
      AWS_SECRET_ACCESS_KEY: {secret_key}
    """
    s3_credentials_spec = s3_credentials_template.format(
        minio_url=minio_url,
        access_key=secret["data"]["accesskey"],
        secret_key=secret["data"]["secretkey"],
    )

    # Base64 is an encoding, not encryption: never write the key material to the log.
    print(
        s3_credentials_template.format(
            minio_url=minio_url,
            access_key="<redacted>",
            secret_key="<redacted>",
        )
    )
    subprocess.run(
        ["kubectl", "apply", "-f", "-"],
        input=s3_credentials_spec,
        check=True,
        text=True,
    )

    sa_spec = """
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: kserve-inference-sa
    secrets:
    - name: minio-credentials
    """

    print(sa_spec)
    subprocess.run(
        ["kubectl", "apply", "-f", "-"], input=sa_spec, check=True, text=True
    )

    ### Remove existing InferenceService, if requested
    ### Ignores errors if the service does not already exist
    if rm_existing:
        subprocess.run(["kubectl", "delete", "inferenceservice", name], check=False)

    ####### Transformer Spec ######
    if transformer_specification:
        # f-strings (rather than "+") so integer replica counts render correctly.
        min_t_replicas = (
            f'minReplicas: {transformer_specification["min_replicas"]}'
            if "min_replicas" in transformer_specification
            else ""
        )
        # Each replica bound is gated on its own key.
        max_t_replicas = (
            f'maxReplicas: {transformer_specification["max_replicas"]}'
            if "max_replicas" in transformer_specification
            else ""
        )

        # EnvFrom allows all vars to be read from a config map
        # If a variable is defined by both env and envFrom,
        # env takes precedence. If a variable is defined twice
        # in envFrom, then the last one wins.
        if "env_configmap" in transformer_specification:
            env_from = f"""
          envFrom:
            - configMapRef:
                name: {transformer_specification["env_configmap"]}
          """
        else:
            env_from = ""

        # Node selector
        # https://kserve.github.io/website/0.9/modelserving/nodescheduling/inferenceservicenodescheduling/#usage
        if "node_selector" in transformer_specification:
            t_node_selector = (
                f'nodeSelector:\n          {transformer_specification["node_selector"]}'
            )
        else:
            t_node_selector = ""

        #### Transformer specification ####
        # The embedded indentation is significant: this fragment is spliced
        # under "spec:" of the InferenceService manifest below.
        transform_spec = f"""
      transformer:
        {min_t_replicas}
        {max_t_replicas}
        serviceAccountName: kserve-inference-sa
        {t_node_selector}
        containers:
        - image: "{transformer_specification["image"]}"
          name: "{name}-transformer"
          command: {transformer_specification.get("command", '["python", "transform.py"]')}
          args: ["--protocol={predictor_protocol}"]
          env:
            - name: STORAGE_URI
              value: {storage_uri}
          {env_from}
          """
    else:
        transform_spec = ""

    gpu_resources = (
        f"nvidia.com/gpu: {predictor_gpu_allocation}"
        if predictor_gpu_allocation
        else ""
    )

    min_p_replicas = (
        f"minReplicas: {predictor_min_replicas}"
        if predictor_min_replicas is not None
        else ""
    )
    max_p_replicas = (
        f"maxReplicas: {predictor_max_replicas}"
        if predictor_max_replicas is not None
        else ""
    )

    # Only the gRPC protocol needs an explicit port declaration.
    predictor_port_spec = (
        '[{"containerPort": 9000, "name": "h2c", "protocol": "TCP"}]'
        if predictor_protocol == "grpc-v2"
        else ""
    )

    if concurrency_target:
        autoscaling_target = f"""
        autoscaling.knative.dev/target: "{concurrency_target}"
        autoscaling.knative.dev/metric: "concurrency"
        """
    else:
        autoscaling_target = ""

    # Node selector
    # https://kserve.github.io/website/0.9/modelserving/nodescheduling/inferenceservicenodescheduling/#usage
    if predictor_node_selector:
        p_node_selector = f"nodeSelector:\n          {predictor_node_selector}"
    else:
        p_node_selector = ""

    ##### Inference Service Spec ####
    service_spec = f"""
    apiVersion: serving.kserve.io/v1beta1
    kind: InferenceService
    metadata:
      name: {name}
      annotations:
        sidecar.istio.io/inject: "false"
        # https://knative.dev/docs/serving/autoscaling/concurrency/#soft-limit
        {autoscaling_target}
    spec:
      {transform_spec}
      predictor:
        {min_p_replicas}
        {max_p_replicas}
        serviceAccountName: kserve-inference-sa
        {p_node_selector}
        triton:
          runtimeVersion: {triton_runtime_version}
          args: [ "--strict-model-config=false"]
          storageUri: {storage_uri}
          ports: {predictor_port_spec}
          env:
          - name: OMP_NUM_THREADS
            value: "1"
          resources:
            limits:
               {gpu_resources}
    """

    print(service_spec)
    subprocess.run(
        ["kubectl", "apply", "-f", "-"], input=service_spec, check=True, text=True
    )

    print("Waiting for inference service to become available")
    subprocess.run(
        [
            "kubectl",
            "wait",
            "--for=condition=Ready",
            f"inferenceservice/{name}",
            "--timeout=600s",
        ],
        check=True,
    )

In [12]:
# Wrap the deployment function as a pipeline component running on BASE_IMAGE,
# and save the component definition to a YAML file.
deploy_inference_service_comp = create_component_from_func(
    deploy_inference_service,
    base_image=BASE_IMAGE,
    output_component_file="deploy_triton_inference_service_component.yaml",
)