From c12abdb30301a1119febc1f6456213f6f9b2d05a Mon Sep 17 00:00:00 2001 From: Zheng Feng Date: Wed, 18 Mar 2026 14:34:29 +0800 Subject: [PATCH 1/3] Add e2e-aws-spot periodic job for HyperShift spot instance verification (OCPSTRAT-1677) --- ...ft-hypershift-release-4.22__periodics.yaml | 7 + ...ift-hypershift-release-4.22-periodics.yaml | 83 +++++ ...rshift-aws-e2e-spot-workflow.metadata.json | 15 + .../hypershift-aws-e2e-spot-workflow.yaml | 21 ++ .../step-registry/hypershift/aws/spot/OWNERS | 8 + ...ypershift-aws-spot-sqs-cleanup-commands.sh | 50 +++ ...ift-aws-spot-sqs-cleanup-ref.metadata.json | 15 + .../hypershift-aws-spot-sqs-cleanup-ref.yaml | 23 ++ .../hypershift-aws-spot-sqs-setup-commands.sh | 90 +++++ ...shift-aws-spot-sqs-setup-ref.metadata.json | 15 + .../hypershift-aws-spot-sqs-setup-ref.yaml | 25 ++ .../hypershift-aws-spot-verify-commands.sh | 323 ++++++++++++++++++ ...pershift-aws-spot-verify-ref.metadata.json | 15 + .../hypershift-aws-spot-verify-ref.yaml | 21 ++ 14 files changed, 711 insertions(+) create mode 100644 ci-operator/step-registry/hypershift/aws/e2e/spot/hypershift-aws-e2e-spot-workflow.metadata.json create mode 100644 ci-operator/step-registry/hypershift/aws/e2e/spot/hypershift-aws-e2e-spot-workflow.yaml create mode 100644 ci-operator/step-registry/hypershift/aws/spot/OWNERS create mode 100755 ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-commands.sh create mode 100644 ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.metadata.json create mode 100644 ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml create mode 100755 ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-commands.sh create mode 100644 ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.metadata.json create mode 100644 ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml create mode 100755 ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-commands.sh create mode 100644 ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-ref.metadata.json create mode 100644 ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-ref.yaml diff --git a/ci-operator/config/openshift/hypershift/openshift-hypershift-release-4.22__periodics.yaml b/ci-operator/config/openshift/hypershift/openshift-hypershift-release-4.22__periodics.yaml index bd9abd4362712..bcc9e0a1e10ab 100644 --- a/ci-operator/config/openshift/hypershift/openshift-hypershift-release-4.22__periodics.yaml +++ b/ci-operator/config/openshift/hypershift/openshift-hypershift-release-4.22__periodics.yaml @@ -381,6 +381,13 @@ tests: LVM_OPERATOR_SUB_SOURCE: lvm-catalogsource TECH_PREVIEW_NO_UPGRADE: "true" workflow: hypershift-openstack-nested-conformance +- as: e2e-aws-spot + cron: 0 6 * * 1 + steps: + cluster_profile: hypershift-aws + env: + CI_TESTS_RUN: ^TestNodePool$/HostedCluster0/TestSpotTerminationHandler$ + workflow: hypershift-aws-e2e-spot zz_generated_metadata: branch: release-4.22 org: openshift diff --git a/ci-operator/jobs/openshift/hypershift/openshift-hypershift-release-4.22-periodics.yaml b/ci-operator/jobs/openshift/hypershift/openshift-hypershift-release-4.22-periodics.yaml index 861428672e41e..8550c25a52394 100644 --- a/ci-operator/jobs/openshift/hypershift/openshift-hypershift-release-4.22-periodics.yaml +++ b/ci-operator/jobs/openshift/hypershift/openshift-hypershift-release-4.22-periodics.yaml @@ -830,6 +830,89 @@ periodics: - name: result-aggregator secret: secretName: result-aggregator +- agent: kubernetes + cluster: build01 + cron: 0 6 * * 1 + decorate: true + decoration_config: + skip_cloning: true + extra_refs: + - base_ref: release-4.22 + org: openshift + repo: hypershift + labels: + ci-operator.openshift.io/cloud: hypershift-aws + ci-operator.openshift.io/cloud-cluster-profile: hypershift-aws + ci-operator.openshift.io/variant: periodics + ci.openshift.io/generator: prowgen + job-release: "4.22" + pj-rehearse.openshift.io/can-be-rehearsed: "true" + name: periodic-ci-openshift-hypershift-release-4.22-periodics-e2e-aws-spot + spec: + containers: + - args: + - --gcs-upload-secret=/secrets/gcs/service-account.json + - --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson + - --lease-server-credentials-file=/etc/boskos/credentials + - --report-credentials-file=/etc/report/credentials + - --secret-dir=/secrets/ci-pull-credentials + - --target=e2e-aws-spot + - --variant=periodics + command: + - ci-operator + env: + - name: HTTP_SERVER_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay-proxy.ci.openshift.org/openshift/ci:ci_ci-operator_latest + imagePullPolicy: Always + name: "" + ports: + - containerPort: 8080 + name: http + resources: + requests: + cpu: 10m + volumeMounts: + - mountPath: /etc/boskos + name: boskos + readOnly: true + - mountPath: /secrets/ci-pull-credentials + name: ci-pull-credentials + readOnly: true + - mountPath: /secrets/gcs + name: gcs-credentials + readOnly: true + - mountPath: /secrets/manifest-tool + name: manifest-tool-local-pusher + readOnly: true + - mountPath: /etc/pull-secret + name: pull-secret + readOnly: true + - mountPath: /etc/report + name: result-aggregator + readOnly: true + serviceAccountName: ci-operator + volumes: + - name: boskos + secret: + items: + - key: credentials + path: credentials + secretName: boskos-credentials + - name: ci-pull-credentials + secret: + secretName: ci-pull-credentials + - name: manifest-tool-local-pusher + secret: + secretName: manifest-tool-local-pusher + - name: pull-secret + secret: + secretName: registry-pull-credentials + - name: result-aggregator + secret: + secretName: result-aggregator - agent: kubernetes cluster: build01 cron: 0 */12 * * * diff --git a/ci-operator/step-registry/hypershift/aws/e2e/spot/hypershift-aws-e2e-spot-workflow.metadata.json b/ci-operator/step-registry/hypershift/aws/e2e/spot/hypershift-aws-e2e-spot-workflow.metadata.json new file mode 100644 index 0000000000000..06de44e807116 --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/e2e/spot/hypershift-aws-e2e-spot-workflow.metadata.json @@ -0,0 +1,15 @@ +{ + "path": "hypershift/aws/e2e/spot/hypershift-aws-e2e-spot-workflow.yaml", + "owners": { + "approvers": [ + "csrwng", + "enxebre", + "sjenning" + ], + "reviewers": [ + "csrwng", + "enxebre", + "sjenning" + ] + } +} diff --git a/ci-operator/step-registry/hypershift/aws/e2e/spot/hypershift-aws-e2e-spot-workflow.yaml b/ci-operator/step-registry/hypershift/aws/e2e/spot/hypershift-aws-e2e-spot-workflow.yaml new file mode 100644 index 0000000000000..8b05a83849ef6 --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/e2e/spot/hypershift-aws-e2e-spot-workflow.yaml @@ -0,0 +1,21 @@ +workflow: + as: hypershift-aws-e2e-spot + documentation: |- + The HyperShift AWS spot instance e2e workflow creates an SQS queue and + EventBridge rules, then executes spot instance tests against a new + ephemeral HyperShift cluster. + + This workflow extends hypershift-aws-e2e-external with SQS infrastructure + setup/cleanup for the AWS Node Termination Handler spot instance test. + steps: + pre: + - ref: ipi-install-rbac + - chain: hypershift-setup-nested-management-cluster + - ref: hypershift-install + - ref: hypershift-aws-spot-sqs-setup + test: + - ref: hypershift-aws-run-e2e-external + - ref: hypershift-aws-spot-verify + post: + - ref: hypershift-aws-spot-sqs-cleanup + - chain: hypershift-destroy-nested-management-cluster diff --git a/ci-operator/step-registry/hypershift/aws/spot/OWNERS b/ci-operator/step-registry/hypershift/aws/spot/OWNERS new file mode 100644 index 0000000000000..9cd67cb3b6b08 --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/spot/OWNERS @@ -0,0 +1,8 @@ +approvers: + - csrwng + - enxebre + - sjenning +reviewers: + - csrwng + - enxebre + - sjenning diff --git a/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-commands.sh b/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-commands.sh new file mode 100755 index 0000000000000..5d66b8cf7f109 --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-commands.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -o nounset +set -o pipefail +set -o xtrace + +# This step cleans up the SQS queue and EventBridge rules created by +# hypershift-aws-spot-sqs-setup. Best-effort cleanup; errors are logged +# but do not fail the job. + +AWS_CREDS_FILE="/etc/hypershift-pool-aws-credentials/credentials" +AWS_REGION="${AWS_REGION:-us-east-1}" + +export AWS_SHARED_CREDENTIALS_FILE="${AWS_CREDS_FILE}" +export AWS_DEFAULT_REGION="${AWS_REGION}" + +# Read state from setup step +QUEUE_URL="" +RULE_PREFIX="" + +if [[ -f "${SHARED_DIR}/spot_sqs_queue_url" ]]; then + QUEUE_URL=$(cat "${SHARED_DIR}/spot_sqs_queue_url") +fi + +if [[ -f "${SHARED_DIR}/spot_eventbridge_rule_prefix" ]]; then + RULE_PREFIX=$(cat "${SHARED_DIR}/spot_eventbridge_rule_prefix") +fi + +# Clean up EventBridge rules +if [[ -n "${RULE_PREFIX}" ]]; then + echo "Cleaning up EventBridge rules with prefix: ${RULE_PREFIX}" + + for SUFFIX in interruption rebalance; do + RULE_NAME="${RULE_PREFIX}-${SUFFIX}" + echo "Removing targets and deleting rule: ${RULE_NAME}" + aws events remove-targets --rule "${RULE_NAME}" --ids 1 --region "${AWS_REGION}" 2>/dev/null || true + aws events delete-rule --name "${RULE_NAME}" --region "${AWS_REGION}" 2>/dev/null || true + done + + echo "EventBridge rules cleaned up" +fi + +# Clean up SQS queue +if [[ -n "${QUEUE_URL}" ]]; then + echo "Deleting SQS queue: ${QUEUE_URL}" + aws sqs delete-queue --queue-url "${QUEUE_URL}" --region "${AWS_REGION}" 2>/dev/null || true + echo "SQS queue deleted" +fi + +echo "Spot SQS cleanup complete" diff --git a/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.metadata.json b/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.metadata.json new file mode 100644 index 0000000000000..f38a34fb0a092 --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.metadata.json @@ -0,0 +1,15 @@ +{ + "path": "hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml", + "owners": { + "approvers": [ + "csrwng", + "enxebre", + "sjenning" + ], + "reviewers": [ + "csrwng", + "enxebre", + "sjenning" + ] + } +} diff --git a/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml b/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml new file mode 100644 index 0000000000000..5b01fbc42f05b --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml @@ -0,0 +1,23 @@ +ref: + as: hypershift-aws-spot-sqs-cleanup + cli: latest + commands: hypershift-aws-spot-sqs-cleanup-commands.sh + credentials: + - mount_path: /etc/hypershift-pool-aws-credentials + name: hypershift-pool-aws-credentials + namespace: test-credentials + env: + - default: "us-east-1" + name: AWS_REGION + documentation: "AWS region for the SQS queue" + best_effort: true + from_image: + namespace: ocp + name: cli + tag: latest + grace_period: 5m0s + resources: + requests: + cpu: 100m + memory: 100Mi + timeout: 10m0s diff --git a/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-commands.sh b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-commands.sh new file mode 100755 index 0000000000000..da827fd6b8200 --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-commands.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -o pipefail +set -o xtrace + +# This step creates the SQS queue and EventBridge rules needed for the +# AWS Node Termination Handler spot instance e2e test. +# +# The queue name matches the hardcoded value in the hypershift e2e test: +# test/e2e/nodepool_spot_termination_handler_test.go +# +# Uses hypershift-pool-aws-credentials which is the same credential +# the e2e test binary uses to discover the queue. + +AWS_CREDS_FILE="/etc/hypershift-pool-aws-credentials/credentials" +AWS_REGION="${AWS_REGION:-us-east-1}" +QUEUE_NAME="${SQS_QUEUE_NAME:-agarcial-nth-queue}" + +export AWS_SHARED_CREDENTIALS_FILE="${AWS_CREDS_FILE}" +export AWS_DEFAULT_REGION="${AWS_REGION}" + +echo "Creating SQS queue: ${QUEUE_NAME} in region ${AWS_REGION}" + +# Create the queue (idempotent - returns existing queue if it already exists) +QUEUE_URL=$(aws sqs create-queue --queue-name "${QUEUE_NAME}" --region "${AWS_REGION}" --query 'QueueUrl' --output text) +echo "Queue URL: ${QUEUE_URL}" + +# Get the queue ARN +QUEUE_ARN=$(aws sqs get-queue-attributes --queue-url "${QUEUE_URL}" --attribute-names QueueArn --region "${AWS_REGION}" --query 'Attributes.QueueArn' --output text) +echo "Queue ARN: ${QUEUE_ARN}" + +# Set queue policy to allow EventBridge to send messages +POLICY=$(cat < "${SHARED_DIR}/spot_sqs_queue_url" +echo "${QUEUE_NAME}" > "${SHARED_DIR}/spot_sqs_queue_name" +echo "${RULE_PREFIX}" > "${SHARED_DIR}/spot_eventbridge_rule_prefix" + +echo "SQS queue and EventBridge rules setup complete" diff --git a/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.metadata.json b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.metadata.json new file mode 100644 index 0000000000000..7ab7428e9054a --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.metadata.json @@ -0,0 +1,15 @@ +{ + "path": "hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml", + "owners": { + "approvers": [ + "csrwng", + "enxebre", + "sjenning" + ], + "reviewers": [ + "csrwng", + "enxebre", + "sjenning" + ] + } +} diff --git a/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml new file mode 100644 index 0000000000000..43380a0b35649 --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml @@ -0,0 +1,25 @@ +ref: + as: hypershift-aws-spot-sqs-setup + cli: latest + commands: hypershift-aws-spot-sqs-setup-commands.sh + credentials: + - mount_path: /etc/hypershift-pool-aws-credentials + name: hypershift-pool-aws-credentials + namespace: test-credentials + env: + - default: "us-east-1" + name: AWS_REGION + documentation: "AWS region for the SQS queue" + - default: "agarcial-nth-queue" + name: SQS_QUEUE_NAME + documentation: "Name of the SQS queue for spot termination handler testing" + from_image: + namespace: ocp + name: cli + tag: latest + grace_period: 5m0s + resources: + requests: + cpu: 100m + memory: 100Mi + timeout: 10m0s diff --git a/ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-commands.sh b/ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-commands.sh new file mode 100755 index 0000000000000..a86f25249154c --- /dev/null +++ b/ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-commands.sh @@ -0,0 +1,323 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -o pipefail +set -o xtrace + +# Spot Instance Feature Verification Step +# Verifies all aspects of OCPSTRAT-1677 / CNTRLPLANE-1388 implementation. +# +# This step runs AFTER the e2e test has created a HostedCluster and spot NodePool. +# It performs comprehensive checks on the spot instance feature: +# - aws-node-termination-handler image in release payload +# - terminationHandlerQueueURL set on HostedCluster +# - aws-node-termination-handler deployment in HCP namespace +# - Termination handler runs management-side (not in guest cluster) +# - Web identity token auth (token-minter sidecar) +# - Spot NodePool resources (AWSMachineTemplate, MachineDeployment, MachineHealthCheck) +# - CEL validation rules for invalid configurations + +export KUBECONFIG="${KUBECONFIG:-${SHARED_DIR}/management_cluster_kubeconfig}" +AWS_REGION="${AWS_REGION:-us-east-1}" + +PASS_COUNT=0 +FAIL_COUNT=0 +SKIP_COUNT=0 + +pass() { + echo "[PASS] $1" + PASS_COUNT=$((PASS_COUNT + 1)) +} + +fail() { + echo "[FAIL] $1" + FAIL_COUNT=$((FAIL_COUNT + 1)) +} + +skip() { + echo "[SKIP] $1" + SKIP_COUNT=$((SKIP_COUNT + 1)) +} + +echo "============================================" +echo "Spot Instance Feature Verification" +echo "OCPSTRAT-1677 / CNTRLPLANE-1388" +echo "============================================" + +# Step 0: Check aws-node-termination-handler image in release payload +echo "" +echo "--- Step 0: Check NTH image in release payload ---" +RELEASE_IMAGE=$(oc get hostedcluster -A -o jsonpath='{.items[0].spec.release.image}' 2>/dev/null || true) +if [[ -n "${RELEASE_IMAGE}" ]]; then + NTH_IMAGE=$(oc adm release info "${RELEASE_IMAGE}" --image-for=aws-node-termination-handler 2>/dev/null || true) + if [[ -n "${NTH_IMAGE}" ]]; then + pass "aws-node-termination-handler image found in release payload: ${NTH_IMAGE}" + else + fail "aws-node-termination-handler image NOT found in release payload" + fi +else + skip "No HostedCluster found to check release image" +fi + +# Find a HostedCluster +HC_NAME=$(oc get hostedcluster -A -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) +HC_NAMESPACE=$(oc get hostedcluster -A -o jsonpath='{.items[0].metadata.namespace}' 2>/dev/null || true) + +if [[ -z "${HC_NAME}" ]]; then + echo "No HostedCluster found. Skipping remaining checks." + exit 0 +fi + +echo "Found HostedCluster: ${HC_NAMESPACE}/${HC_NAME}" +HCP_NAMESPACE=$(oc get hostedcluster "${HC_NAME}" -n "${HC_NAMESPACE}" -o jsonpath='{.status.controlPlaneNamespace}' 2>/dev/null || echo "${HC_NAMESPACE}-${HC_NAME}") +echo "HCP namespace: ${HCP_NAMESPACE}" + +# Step 1: Check terminationHandlerQueueURL on HostedCluster +echo "" +echo "--- Step 1: Check terminationHandlerQueueURL ---" +QUEUE_URL=$(oc get hostedcluster "${HC_NAME}" -n "${HC_NAMESPACE}" -o jsonpath='{.spec.platform.aws.terminationHandlerQueueURL}' 2>/dev/null || true) +if [[ -n "${QUEUE_URL}" ]]; then + pass "terminationHandlerQueueURL is set: ${QUEUE_URL}" +else + skip "terminationHandlerQueueURL not set on HostedCluster (may have been cleaned up by e2e test)" +fi + +# Step 2: Check aws-node-termination-handler deployment in HCP namespace +echo "" +echo "--- Step 2: Check NTH deployment in HCP namespace ---" +NTH_DEPLOY=$(oc get deployment aws-node-termination-handler -n "${HCP_NAMESPACE}" -o jsonpath='{.metadata.name}' 2>/dev/null || true) +if [[ -n "${NTH_DEPLOY}" ]]; then + pass "aws-node-termination-handler deployment exists in HCP namespace ${HCP_NAMESPACE}" + + # Check replicas + REPLICAS=$(oc get deployment aws-node-termination-handler -n "${HCP_NAMESPACE}" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0") + READY=$(oc get deployment aws-node-termination-handler -n "${HCP_NAMESPACE}" -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0") + if [[ "${REPLICAS}" -gt 0 ]]; then + pass "NTH deployment replicas: ${REPLICAS}, ready: ${READY}" + else + skip "NTH deployment has 0 replicas (may have been scaled down after test)" + fi + + # Check QUEUE_URL env var + DEPLOY_QUEUE_URL=$(oc get deployment aws-node-termination-handler -n "${HCP_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.containers[?(@.name=="aws-node-termination-handler")].env[?(@.name=="QUEUE_URL")].value}' 2>/dev/null || true) + if [[ -n "${DEPLOY_QUEUE_URL}" ]]; then + pass "NTH deployment has QUEUE_URL env set: ${DEPLOY_QUEUE_URL}" + else + skip "NTH deployment QUEUE_URL env is empty (may have been cleared after test)" + fi + + # Step 3: Check token-minter sidecar (web identity token auth) + echo "" + echo "--- Step 3: Check token-minter sidecar ---" + TOKEN_MINTER=$(oc get deployment aws-node-termination-handler -n "${HCP_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.containers[?(@.name=="token-minter-kube")].name}' 2>/dev/null || true) + if [[ "${TOKEN_MINTER}" == "token-minter-kube" ]]; then + pass "NTH deployment has token-minter-kube sidecar (web identity token auth)" + else + fail "NTH deployment missing token-minter-kube sidecar" + fi +else + skip "aws-node-termination-handler deployment not found in ${HCP_NAMESPACE} (may not have been deployed)" +fi + +# Step 4: Check NTH does NOT exist as a daemonset in guest cluster +echo "" +echo "--- Step 4: Check NTH is management-side only ---" +GUEST_KUBECONFIG="${SHARED_DIR}/guest_kubeconfig" +if [[ -f "${GUEST_KUBECONFIG}" ]]; then + GUEST_DS=$(KUBECONFIG="${GUEST_KUBECONFIG}" oc get daemonset -A -o name 2>/dev/null | grep -i termination || true) + if [[ -z "${GUEST_DS}" ]]; then + pass "No termination handler daemonset in guest cluster (management-side only)" + else + fail "Found termination handler daemonset in guest cluster: ${GUEST_DS}" + fi +else + # Try to extract guest kubeconfig + GUEST_KUBECONFIG_SECRET=$(oc get hostedcluster "${HC_NAME}" -n "${HC_NAMESPACE}" -o jsonpath='{.status.kubeConfig.name}' 2>/dev/null || true) + if [[ -n "${GUEST_KUBECONFIG_SECRET}" ]]; then + oc get secret "${GUEST_KUBECONFIG_SECRET}" -n "${HC_NAMESPACE}" -o jsonpath='{.data.kubeconfig}' 2>/dev/null | base64 -d > /tmp/guest_kubeconfig || true + if [[ -s /tmp/guest_kubeconfig ]]; then + GUEST_DS=$(KUBECONFIG=/tmp/guest_kubeconfig oc get daemonset -A -o name 2>/dev/null | grep -i termination || true) + if [[ -z "${GUEST_DS}" ]]; then + pass "No termination handler daemonset in guest cluster (management-side only)" + else + fail "Found termination handler daemonset in guest cluster: ${GUEST_DS}" + fi + else + skip "Could not extract guest kubeconfig" + fi + else + skip "No guest kubeconfig available" + fi +fi + +# Step 5: Check spot NodePool resources +echo "" +echo "--- Step 5: Check spot NodePool resources ---" + +# Find spot NodePool (look for one with spot annotation or marketType) +SPOT_NP=$(oc get nodepool -n "${HC_NAMESPACE}" -o json 2>/dev/null | \ + jq -r '.items[] | select(.metadata.annotations["hypershift.openshift.io/enable-spot"] != null or .spec.platform.aws.placement.marketType == "Spot") | .metadata.name' 2>/dev/null | head -1 || true) + +if [[ -n "${SPOT_NP}" ]]; then + pass "Found spot NodePool: ${SPOT_NP}" + + # Check spot MachineHealthCheck + SPOT_MHC="${SPOT_NP}-spot" + MHC_EXISTS=$(oc get machinehealthcheck "${SPOT_MHC}" -n "${HCP_NAMESPACE}" -o jsonpath='{.metadata.name}' 2>/dev/null || true) + if [[ "${MHC_EXISTS}" == "${SPOT_MHC}" ]]; then + pass "Spot MachineHealthCheck ${SPOT_MHC} exists" + + # Check MHC selector + MHC_LABEL=$(oc get machinehealthcheck "${SPOT_MHC}" -n "${HCP_NAMESPACE}" \ + -o jsonpath='{.spec.selector.matchLabels.hypershift\.openshift\.io/interruptible-instance}' 2>/dev/null || echo "NOT_FOUND") + if [[ "${MHC_LABEL}" != "NOT_FOUND" ]]; then + pass "Spot MHC has correct interruptible-instance label selector" + else + fail "Spot MHC missing interruptible-instance label selector" + fi + + # Check MHC maxUnhealthy + MAX_UNHEALTHY=$(oc get machinehealthcheck "${SPOT_MHC}" -n "${HCP_NAMESPACE}" \ + -o jsonpath='{.spec.maxUnhealthy}' 2>/dev/null || true) + if [[ "${MAX_UNHEALTHY}" == "100%" ]]; then + pass "Spot MHC maxUnhealthy is 100%" + else + fail "Spot MHC maxUnhealthy is '${MAX_UNHEALTHY}', expected '100%'" + fi + else + fail "Spot MachineHealthCheck ${SPOT_MHC} not found" + fi + + # Check MachineDeployment has interruptible-instance label + MD_LABEL=$(oc get machinedeployment -n "${HCP_NAMESPACE}" -o json 2>/dev/null | \ + jq -r '.items[] | select(.spec.template.metadata.labels["hypershift.openshift.io/interruptible-instance"] != null) | .metadata.name' 2>/dev/null | head -1 || true) + if [[ -n "${MD_LABEL}" ]]; then + pass "MachineDeployment ${MD_LABEL} has interruptible-instance label" + else + fail "No MachineDeployment found with interruptible-instance label" + fi + + # Check AWSMachineTemplate has spotMarketOptions + SPOT_AMT=$(oc get awsmachinetemplate -n "${HCP_NAMESPACE}" -o json 2>/dev/null | \ + jq -r '.items[] | select(.spec.template.spec.spotMarketOptions != null) | .metadata.name' 2>/dev/null | head -1 || true) + if [[ -n "${SPOT_AMT}" ]]; then + pass "AWSMachineTemplate ${SPOT_AMT} has spotMarketOptions configured" + else + skip "No AWSMachineTemplate with spotMarketOptions (test uses annotation-based spot, not real spot instances)" + fi +else + skip "No spot NodePool found (e2e test may have cleaned up)" +fi + +# Step 6: CEL validation rules +echo "" +echo "--- Step 6: CEL validation rules ---" + +# Test: Spot + Capacity Reservation should be rejected +CEL_RESULT=$(oc apply --dry-run=server -f - 2>&1 <&1 <&1 < Date: Wed, 18 Mar 2026 15:04:00 +0800 Subject: [PATCH 2/3] Fix from_image for spot steps: use ocp/4.22:upi-installer and remove cli: latest --- .../sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml | 5 ++--- .../spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml | 5 ++--- .../aws/spot/verify/hypershift-aws-spot-verify-ref.yaml | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml b/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml index 5b01fbc42f05b..1d6d8c887c996 100644 --- a/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml +++ b/ci-operator/step-registry/hypershift/aws/spot/sqs-cleanup/hypershift-aws-spot-sqs-cleanup-ref.yaml @@ -1,6 +1,5 @@ ref: as: hypershift-aws-spot-sqs-cleanup - cli: latest commands: hypershift-aws-spot-sqs-cleanup-commands.sh credentials: - mount_path: /etc/hypershift-pool-aws-credentials @@ -13,8 +12,8 @@ ref: best_effort: true from_image: namespace: ocp - name: cli - tag: latest + name: "4.22" + tag: upi-installer grace_period: 5m0s resources: requests: diff --git a/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml index 43380a0b35649..608d089f50192 100644 --- a/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml +++ b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-ref.yaml @@ -1,6 +1,5 @@ ref: as: hypershift-aws-spot-sqs-setup - cli: latest commands: hypershift-aws-spot-sqs-setup-commands.sh credentials: - mount_path: /etc/hypershift-pool-aws-credentials @@ -15,8 +14,8 @@ ref: documentation: "Name of the SQS queue for spot termination handler testing" from_image: namespace: ocp - name: cli - tag: latest + name: "4.22" + tag: upi-installer grace_period: 5m0s resources: requests: diff --git a/ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-ref.yaml b/ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-ref.yaml index 5703c1bba86a0..86a843b23cb75 100644 --- a/ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-ref.yaml +++ b/ci-operator/step-registry/hypershift/aws/spot/verify/hypershift-aws-spot-verify-ref.yaml @@ -1,6 +1,5 @@ ref: as: hypershift-aws-spot-verify - cli: latest commands: hypershift-aws-spot-verify-commands.sh credentials: - mount_path: /etc/hypershift-pool-aws-credentials From 3e76288720bd539b1c6cca38cf6828366844aaf5 Mon Sep 17 00:00:00 2001 From: Zheng Feng Date: Wed, 18 Mar 2026 21:29:58 +0800 Subject: [PATCH 3/3] Fix SQS set-queue-attributes JSON quoting in sqs-setup step --- .../spot/sqs-setup/hypershift-aws-spot-sqs-setup-commands.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-commands.sh b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-commands.sh index da827fd6b8200..6c61931dcd916 100755 --- a/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-commands.sh +++ b/ci-operator/step-registry/hypershift/aws/spot/sqs-setup/hypershift-aws-spot-sqs-setup-commands.sh @@ -49,9 +49,10 @@ POLICY=$(cat <