From ecff0797f50bb942fa5a86a2387bd7aa7019afd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edgar=20Hern=C3=A1ndez?= <23639005+israel-hdez@users.noreply.github.com> Date: Tue, 25 Jul 2023 11:51:39 -0600 Subject: [PATCH] Onboarding on openshift-ci MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These are the needed changes to have openshift-ci running the E2E tests successfully. There are several groups of E2E tests that can be deduced from the .github/workflows/e2e-test.yaml file: fast, slow, explainer, transformer-mms, qpext, grpc, helm, raw and kourier. For ODH, the `fast`, `slow` and `grpc` groups are the ones that cover the features that are going to be supported in the initial adoption of ODH. This commit contains the needed adaptations to the E2E tests of the `fast` and `slow` groups to successfully run them in an openshift cluster. It also adds a few scripts on test/scripts/openshift-ci to run these E2Es in the openshift-ci operator. Some of these changes should be seen as provisional and should be rolled back: * test/e2e/common/utils.py: because of the networking/DNS expectations, that are currently not covered in ODH's installation. * test/e2e/predictor/*: * In general all changes under this path should be seen as provisional. However, since ODH won't support all ServingRuntimes, it is possible that some of the tests will stay out. * There are some GRPC-related tests marked as skipped. Since this work is not enabling the `grpc` group, a subsequent commit/PR for enabling GRPC E2Es should remove/revert those skip marks. * Also, there are some tests skipped with the `Not testable in ODH at the moment` reason. The root cause of the failure should be investigated to re-enable these tests. * python/kserve/kserve/models/v1beta1_inference_service.py: This is injecting an annotation that is required given the specifics of OSSM/Maistra and OpenShift-Serverless that are used in ODH. 
This annotation is currently the user's responsibility and this was the cleanest way to add it in the E2Es. Being platform-specific, it's been discussed that this (and some other) annotations should be injected by some controller to relieve the user of this responsibility. If this happens, this change should be reverted. Also, ideally, changes to the following files should be contributed back to upstream. Those changes are not required in upstream and should have no effect, but they become required in openshift-ci because a different builder image is being used: * Dockerfile * agent.Dockerfile Signed-off-by: Edgar Hernández <23639005+israel-hdez@users.noreply.github.com> --- Dockerfile | 2 +- agent.Dockerfile | 2 +- .../models/v1beta1_inference_service.py | 2 + test/e2e/common/utils.py | 19 +- test/e2e/predictor/test_paddle.py | 1 + test/e2e/predictor/test_sklearn.py | 5 + test/e2e/predictor/test_tensorflow.py | 3 + test/e2e/predictor/test_torchserve.py | 2 + test/e2e/predictor/test_triton.py | 3 + test/scripts/openshift-ci/deploy.ossm.sh | 160 ++++++++++++++++ .../scripts/openshift-ci/deploy.serverless.sh | 174 ++++++++++++++++++ test/scripts/openshift-ci/run-e2e-tests.sh | 111 +++++++++++ 12 files changed, 473 insertions(+), 11 deletions(-) create mode 100755 test/scripts/openshift-ci/deploy.ossm.sh create mode 100755 test/scripts/openshift-ci/deploy.serverless.sh create mode 100755 test/scripts/openshift-ci/run-e2e-tests.sh diff --git a/Dockerfile b/Dockerfile index 5066adcc20..934ea8fc88 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ COPY cmd/ cmd/ COPY pkg/ pkg/ # Build -RUN CGO_ENABLED=0 GOOS=linux go build -a -o manager ./cmd/manager +RUN CGO_ENABLED=0 GOOS=linux GOFLAGS=-mod=mod go build -a -o manager ./cmd/manager # Copy the controller-manager into a thin image FROM gcr.io/distroless/static:nonroot diff --git a/agent.Dockerfile b/agent.Dockerfile index a70ee0fb09..7b22f4fbda 100644 --- a/agent.Dockerfile +++ b/agent.Dockerfile @@ -12,7 +12,7 @@ COPY
pkg/ pkg/ COPY cmd/ cmd/ # Build -RUN CGO_ENABLED=0 GOOS=linux go build -a -o agent ./cmd/agent +RUN CGO_ENABLED=0 GOOS=linux GOFLAGS=-mod=mod go build -a -o agent ./cmd/agent # Copy the inference-agent into a thin image FROM gcr.io/distroless/static:nonroot diff --git a/python/kserve/kserve/models/v1beta1_inference_service.py b/python/kserve/kserve/models/v1beta1_inference_service.py index 0f64aabf8b..d6bd3f89f2 100644 --- a/python/kserve/kserve/models/v1beta1_inference_service.py +++ b/python/kserve/kserve/models/v1beta1_inference_service.py @@ -150,6 +150,8 @@ def metadata(self, metadata): :param metadata: The metadata of this V1beta1InferenceService. # noqa: E501 :type: V1ObjectMeta """ + if metadata is not None: + metadata.annotations = {"serving.knative.openshift.io/enablePassthrough": "true"} self._metadata = metadata diff --git a/test/e2e/common/utils.py b/test/e2e/common/utils.py index 058c4c6c0a..4a2bbca8a0 100644 --- a/test/e2e/common/utils.py +++ b/test/e2e/common/utils.py @@ -80,22 +80,23 @@ def predict_str(service_name, input_json, protocol_version="v1", ) # temporary sleep until this is fixed https://github.com/kserve/kserve/issues/604 time.sleep(10) - cluster_ip = get_cluster_ip() - host = urlparse(isvc["status"]["url"]).netloc - path = urlparse(isvc["status"]["url"]).path + # cluster_ip = get_cluster_ip() + host = urlparse(isvc["status"]["components"]["predictor"]["url"]).netloc + path = urlparse(isvc["status"]["components"]["predictor"]["url"]).path + cluster_ip = host headers = {"Host": host, "Content-Type": "application/json"} if model_name is None: model_name = service_name - url = f"http://{cluster_ip}{path}/v1/models/{model_name}:predict" + url = f"https://{cluster_ip}{path}/v1/models/{model_name}:predict" if protocol_version == "v2": - url = f"http://{cluster_ip}{path}/v2/models/{model_name}/infer" + url = f"https://{cluster_ip}{path}/v2/models/{model_name}/infer" logging.info("Sending Header = %s", headers) logging.info("Sending url = %s", 
url) logging.info("Sending request data: %s", input_json) - response = requests.post(url, input_json, headers=headers) + response = requests.post(url, input_json, headers=headers, verify=False) logging.info("Got response code %s, content %s", response.status_code, response.content) if response.status_code == 200: preds = json.loads(response.content.decode("utf-8")) @@ -118,7 +119,7 @@ def predict_ig(ig_name, input_json, protocol_version="v1", ) cluster_ip = get_cluster_ip() - host = urlparse(ig["status"]["url"]).netloc + host = urlparse(ig["status"]["components"]["predictor"]["url"]).netloc headers = {"Host": host} url = f"http://{cluster_ip}" @@ -154,7 +155,7 @@ def explain_response(service_name, input_json): # temporary sleep until this is fixed https://github.com/kserve/kserve/issues/604 time.sleep(10) cluster_ip = get_cluster_ip() - host = urlparse(isvc["status"]["url"]).netloc + host = urlparse(isvc["status"]["components"]["predictor"]["url"]).netloc url = "http://{}/v1/models/{}:explain".format(cluster_ip, service_name) headers = {"Host": host} with open(input_json) as json_file: @@ -217,7 +218,7 @@ def predict_grpc(service_name, payload, parameters=None, version=constants.KSERV namespace=KSERVE_TEST_NAMESPACE, version=version, ) - host = urlparse(isvc["status"]["url"]).netloc + host = urlparse(isvc["status"]["components"]["predictor"]["url"]).netloc if ":" not in cluster_ip: cluster_ip = cluster_ip + ":80" diff --git a/test/e2e/predictor/test_paddle.py b/test/e2e/predictor/test_paddle.py index 687fec741a..5fb0b8aa92 100644 --- a/test/e2e/predictor/test_paddle.py +++ b/test/e2e/predictor/test_paddle.py @@ -162,6 +162,7 @@ def test_paddle_v2_kserve(): @pytest.mark.slow +@pytest.mark.skip("GRPC tests are failing in ODH at the moment") def test_paddle_v2_grpc(): service_name = "isvc-paddle-v2-grpc" model_name = "paddle" diff --git a/test/e2e/predictor/test_sklearn.py b/test/e2e/predictor/test_sklearn.py index 58546d31af..cb3f9efa67 100644 --- 
a/test/e2e/predictor/test_sklearn.py +++ b/test/e2e/predictor/test_sklearn.py @@ -206,6 +206,7 @@ def test_sklearn_v2(): @pytest.mark.slow +@pytest.mark.skip("GRPC tests are failing in ODH at the moment") def test_sklearn_v2_grpc(): service_name = "isvc-sklearn-v2-grpc" model_name = "sklearn" @@ -254,7 +255,10 @@ def test_sklearn_v2_grpc(): kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) +# In ODH, this test generates the following response: +# Code 500 - 'ColumnTransformer' object has no attribute '_name_to_fitted_passthrough' @pytest.mark.slow +@pytest.mark.skip("Not testable in ODH at the moment") def test_sklearn_v2_mixed(): service_name = "isvc-sklearn-v2-mixed" predictor = V1beta1PredictorSpec( @@ -291,6 +295,7 @@ def test_sklearn_v2_mixed(): @pytest.mark.slow +@pytest.mark.skip("GRPC tests are failing in ODH at the moment") def test_sklearn_v2_mixed_grpc(): service_name = "isvc-sklearn-v2-mixed-grpc" model_name = "sklearn" diff --git a/test/e2e/predictor/test_tensorflow.py b/test/e2e/predictor/test_tensorflow.py index 457d6fd95d..e1c060dbdb 100644 --- a/test/e2e/predictor/test_tensorflow.py +++ b/test/e2e/predictor/test_tensorflow.py @@ -59,7 +59,10 @@ def test_tensorflow_kserve(): kserve_client.delete(service_name, namespace=KSERVE_TEST_NAMESPACE) +# In ODH, this test generates the following response: +# 502 Server Error: Bad Gateway for url @pytest.mark.slow +@pytest.mark.skip("Not testable in ODH at the moment") def test_tensorflow_runtime_kserve(): service_name = 'isvc-tensorflow-runtime' predictor = V1beta1PredictorSpec( diff --git a/test/e2e/predictor/test_torchserve.py b/test/e2e/predictor/test_torchserve.py index 6ca35a5b7a..5b537a4d7e 100644 --- a/test/e2e/predictor/test_torchserve.py +++ b/test/e2e/predictor/test_torchserve.py @@ -32,6 +32,8 @@ from ..common.utils import KSERVE_TEST_NAMESPACE from ..common import inference_pb2 +pytest.skip("ODH does not support torchserve at the moment", allow_module_level=True) + @pytest.mark.slow def 
test_torchserve_kserve(): diff --git a/test/e2e/predictor/test_triton.py b/test/e2e/predictor/test_triton.py index 0029344509..b5f5aab565 100644 --- a/test/e2e/predictor/test_triton.py +++ b/test/e2e/predictor/test_triton.py @@ -70,7 +70,10 @@ def test_triton(): kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) +# Not testable in ODH until the following issue is solved: +# https://github.com/opendatahub-io/odh-model-controller/issues/59 @pytest.mark.fast +@pytest.mark.skip(reason="Not testable in ODH at the moment") def test_triton_runtime_with_transformer(): service_name = 'isvc-triton-runtime' predictor = V1beta1PredictorSpec( diff --git a/test/scripts/openshift-ci/deploy.ossm.sh b/test/scripts/openshift-ci/deploy.ossm.sh new file mode 100755 index 0000000000..2e3876a39c --- /dev/null +++ b/test/scripts/openshift-ci/deploy.ossm.sh @@ -0,0 +1,160 @@ +#!/usr/bin/env bash +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -eu + +waitforpodlabeled() { + local ns=${1?namespace is required}; shift + local podlabel=${1?pod label is required}; shift + + echo "Waiting for pod -l $podlabel to be created" + until oc get pod -n "$ns" -l $podlabel -o=jsonpath='{.items[0].metadata.name}' >/dev/null 2>&1; do + sleep 1 + done +} + +waitpodready() { + local ns=${1?namespace is required}; shift + local podlabel=${1?pod label is required}; shift + + waitforpodlabeled "$ns" "$podlabel" + echo "Waiting for pod -l $podlabel to become ready" + oc wait --for=condition=ready --timeout=180s pod -n $ns -l $podlabel +} + + +# Deploy Distributed tracing operator (Jaeger) +cat </dev/null 2>&1; do + sleep 1 + done +} + +waitpodready() { + local ns=${1?namespace is required}; shift + local podlabel=${1?pod label is required}; shift + + waitforpodlabeled "$ns" "$podlabel" + sleep 10 + oc get pod -n $ns -l $podlabel + + echo "Waiting for pod -l $podlabel to become ready" + oc wait --for=condition=ready --timeout=600s pod -n $ns -l $podlabel || (oc get pod -n $ns -l $podlabel && false) +} + +# Deploy Serverless operator +cat < /dev/null; then + echo "Installing Kustomize" + curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash -s -- 5.0.1 $HOME/.local/bin +fi + +# If minio CLI is not installed, install it +if ! command -v mc &> /dev/null; then + echo "Installing Minio CLI" + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o $HOME/.local/bin/mc + chmod +x $HOME/.local/bin/mc +fi + +# +echo "Installing KServe Python SDK ..." 
+pushd $PROJECT_ROOT >/dev/null + ./test/scripts/gh-actions/setup-poetry.sh + ./test/scripts/gh-actions/check-poetry-lockfile.sh +popd +pushd $PROJECT_ROOT/python/kserve >/dev/null + poetry install --with=test --no-interaction +popd + +# Install KServe stack +echo "Installing OSSM" +$MY_PATH/deploy.ossm.sh +echo "Installing Serverless" +$MY_PATH/deploy.serverless.sh + +echo "Installing KServe with Minio" +kustomize build $PROJECT_ROOT/config/overlays/test | \ + sed "s|kserve/storage-initializer:latest|${STORAGE_INITIALIZER_IMAGE}|" | \ + sed "s|kserve/agent:latest|${KSERVE_AGENT_IMAGE}|" | \ + sed "s|kserve/router:latest|${KSERVE_ROUTER_IMAGE}|" | \ + sed "s|kserve/kserve-controller:latest|${KSERVE_CONTROLLER_IMAGE}|" | \ + oc apply -f - +oc wait --for=condition=ready pod -l control-plane=kserve-controller-manager -n kserve --timeout=300s + +echo "Add testing models to minio storage ..." # Reference: config/overlays/test/minio/minio-init-job.yaml +curl -L https://storage.googleapis.com/kfserving-examples/models/sklearn/1.0/model/model.joblib -o /tmp/sklearn-model.joblib +oc expose service minio-service -n kserve && sleep 5 +MINIO_ROUTE=$(oc get routes -n kserve minio-service -o jsonpath="{.spec.host}") +mc alias set storage http://$MINIO_ROUTE minio minio123 +mc mb storage/example-models +mc cp /tmp/sklearn-model.joblib storage/example-models/sklearn/model.joblib +oc delete route -n kserve minio-service + +# +echo "Prepare CI namespace and install ServingRuntimes" +cat </dev/null + ./test/scripts/gh-actions/run-e2e-tests.sh "$1" +popd