From f2bf419c080552189fd728f4bd62e87720d2e941 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 13 Apr 2026 10:06:50 -0400 Subject: [PATCH 01/12] feat(ci): add e2e-azure-v2-self-managed CI job for self-managed Azure Add CI workflow, step registry chains, and job config for running self-managed Azure e2e v2 tests using nested management cluster pattern. Co-Authored-By: Claude Opus 4.6 --- .../hypershift/openshift-hypershift-main.yaml | 10 ++ .../openshift-hypershift-main-presubmits.yaml | 83 +++++++++ .../azure/create-selfmanaged-guests/OWNERS | 11 ++ ...ate-selfmanaged-guests-chain.metadata.json | 17 ++ ...azure-create-selfmanaged-guests-chain.yaml | 168 ++++++++++++++++++ .../azure/destroy-selfmanaged-guests/OWNERS | 11 ++ ...roy-selfmanaged-guests-chain.metadata.json | 17 ++ ...zure-destroy-selfmanaged-guests-chain.yaml | 45 +++++ .../azure/e2e/v2-self-managed/OWNERS | 11 ++ ...e2e-v2-self-managed-workflow.metadata.json | 17 ++ ...ft-azure-e2e-v2-self-managed-workflow.yaml | 39 ++++ .../azure/run-e2e-v2-selfmanaged/OWNERS | 11 ++ ...run-e2e-v2-selfmanaged-chain.metadata.json | 17 ++ ...ft-azure-run-e2e-v2-selfmanaged-chain.yaml | 63 +++++++ 14 files changed, 520 insertions(+) create mode 100644 ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/OWNERS create mode 100644 ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.metadata.json create mode 100644 ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml create mode 100644 ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/OWNERS create mode 100644 ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.metadata.json create mode 100644 ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml create mode 100644 
ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/OWNERS create mode 100644 ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/hypershift-azure-e2e-v2-self-managed-workflow.metadata.json create mode 100644 ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/hypershift-azure-e2e-v2-self-managed-workflow.yaml create mode 100644 ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/OWNERS create mode 100644 ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.metadata.json create mode 100644 ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml diff --git a/ci-operator/config/openshift/hypershift/openshift-hypershift-main.yaml b/ci-operator/config/openshift/hypershift/openshift-hypershift-main.yaml index a1b4cdf514d85..af7d7a6bc1562 100644 --- a/ci-operator/config/openshift/hypershift/openshift-hypershift-main.yaml +++ b/ci-operator/config/openshift/hypershift/openshift-hypershift-main.yaml @@ -175,6 +175,16 @@ tests: HYPERSHIFT_AZURE_LOCATION: centralus TEST_CPO_OVERRIDE: "1" workflow: hypershift-azure-aks-e2e +- always_run: false + as: e2e-azure-v2-self-managed + optional: true + pipeline_run_if_changed: ^(test/e2e/v2/selfmanagedazure|test/e2e/util) + steps: + cluster_profile: hypershift-azure + env: + ENABLE_HYPERSHIFT_CERT_ROTATION_SCALE: "true" + HYPERSHIFT_AZURE_LOCATION: centralus + workflow: hypershift-azure-e2e-v2-self-managed - always_run: false as: e2e-aws capabilities: diff --git a/ci-operator/jobs/openshift/hypershift/openshift-hypershift-main-presubmits.yaml b/ci-operator/jobs/openshift/hypershift/openshift-hypershift-main-presubmits.yaml index 78aff6632a2bf..7ca1dd89726dc 100644 --- a/ci-operator/jobs/openshift/hypershift/openshift-hypershift-main-presubmits.yaml +++ b/ci-operator/jobs/openshift/hypershift/openshift-hypershift-main-presubmits.yaml @@ -1305,6 +1305,89 @@ presubmits: 
secret: secretName: result-aggregator trigger: (?m)^/test( | .* )e2e-azure-self-managed,?($|\s.*) + - agent: kubernetes + always_run: false + annotations: + pipeline_run_if_changed: ^(test/e2e/v2/selfmanagedazure|test/e2e/util) + branches: + - ^main$ + - ^main- + cluster: build01 + context: ci/prow/e2e-azure-v2-self-managed + decorate: true + labels: + ci-operator.openshift.io/cloud: hypershift-azure + ci-operator.openshift.io/cloud-cluster-profile: hypershift-azure + ci.openshift.io/generator: prowgen + pj-rehearse.openshift.io/can-be-rehearsed: "true" + name: pull-ci-openshift-hypershift-main-e2e-azure-v2-self-managed + optional: true + rerun_command: /test e2e-azure-v2-self-managed + spec: + containers: + - args: + - --gcs-upload-secret=/secrets/gcs/service-account.json + - --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson + - --lease-server-credentials-file=/etc/boskos/credentials + - --report-credentials-file=/etc/report/credentials + - --secret-dir=/secrets/ci-pull-credentials + - --target=e2e-azure-v2-self-managed + command: + - ci-operator + env: + - name: HTTP_SERVER_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay-proxy.ci.openshift.org/openshift/ci:ci_ci-operator_latest + imagePullPolicy: Always + name: "" + ports: + - containerPort: 8080 + name: http + resources: + requests: + cpu: 10m + volumeMounts: + - mountPath: /etc/boskos + name: boskos + readOnly: true + - mountPath: /secrets/ci-pull-credentials + name: ci-pull-credentials + readOnly: true + - mountPath: /secrets/gcs + name: gcs-credentials + readOnly: true + - mountPath: /secrets/manifest-tool + name: manifest-tool-local-pusher + readOnly: true + - mountPath: /etc/pull-secret + name: pull-secret + readOnly: true + - mountPath: /etc/report + name: result-aggregator + readOnly: true + serviceAccountName: ci-operator + volumes: + - name: boskos + secret: + items: + - key: credentials + path: credentials + secretName: boskos-credentials + - name: ci-pull-credentials + 
secret: + secretName: ci-pull-credentials + - name: manifest-tool-local-pusher + secret: + secretName: manifest-tool-local-pusher + - name: pull-secret + secret: + secretName: registry-pull-credentials + - name: result-aggregator + secret: + secretName: result-aggregator + trigger: (?m)^/test( | .* )e2e-azure-v2-self-managed,?($|\s.*) - agent: kubernetes always_run: false branches: diff --git a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/OWNERS b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/OWNERS new file mode 100644 index 0000000000000..6c30e7a30d980 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/OWNERS @@ -0,0 +1,11 @@ +approvers: +- csrwng +- enxebre +- sjenning +- bryan-cox +options: {} +reviewers: +- csrwng +- enxebre +- sjenning +- bryan-cox diff --git a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.metadata.json b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.metadata.json new file mode 100644 index 0000000000000..2ce0fb6bae6f2 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.metadata.json @@ -0,0 +1,17 @@ +{ + "path": "hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml", + "owners": { + "approvers": [ + "csrwng", + "enxebre", + "sjenning", + "bryan-cox" + ], + "reviewers": [ + "csrwng", + "enxebre", + "sjenning", + "bryan-cox" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml new file mode 100644 index 0000000000000..b135067740adf --- /dev/null +++ 
b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml @@ -0,0 +1,168 @@ +chain: + as: hypershift-azure-create-selfmanaged-guests + steps: + - as: create-guests + cli: latest + env: + - name: HYPERSHIFT_NODE_COUNT + default: "3" + documentation: "The number of nodes per guest cluster." + - name: HYPERSHIFT_BASE_DOMAIN + default: "hcp-sm-azure.azure.devcluster.openshift.com" + documentation: "The cluster's FQDN will be a subdomain of the base domain." + - name: HYPERSHIFT_AZURE_LOCATION + default: "centralus" + documentation: "Specifies the Azure location of the clusters." + - name: HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_PUBLISHER + default: "" + documentation: "The Azure Marketplace image publisher." + - name: HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_OFFER + default: "" + documentation: "The Azure Marketplace image offer." + - name: HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_SKU + default: "" + documentation: "The Azure Marketplace image SKU." + - name: HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_VERSION + default: "" + documentation: "The Azure Marketplace image version." 
+ commands: |- + set -exuo pipefail + + # Use the nested management cluster kubeconfig + export KUBECONFIG="${SHARED_DIR}/management_cluster_kubeconfig" + + # Generate unique cluster names from job ID + PUBLIC_NAME="$(echo -n "${PROW_JOB_ID}-pub"|sha256sum|cut -c-20)" + PRIVATE_NAME="$(echo -n "${PROW_JOB_ID}-prv"|sha256sum|cut -c-20)" + OAUTH_LB_NAME="$(echo -n "${PROW_JOB_ID}-oau"|sha256sum|cut -c-20)" + + # Self-managed Azure credentials + AZURE_CREDS="/etc/hypershift-ci-jobs-self-managed-azure/credentials.json" + AZURE_OIDC_ISSUER_URL="https://smazure.blob.core.windows.net/smazure" + AZURE_SA_TOKEN_ISSUER_KEY_PATH="/etc/hypershift-ci-jobs-self-managed-azure-e2e/serviceaccount-signer.private" + AZURE_WORKLOAD_IDENTITIES_FILE="/etc/hypershift-ci-jobs-self-managed-azure-e2e/workload-identities.json" + + PULL_SECRET_PATH="/etc/ci-pull-credentials/.dockerconfigjson" + + RELEASE_IMAGE="${RELEASE_IMAGE_LATEST}" + HC_LOCATION="${HYPERSHIFT_AZURE_LOCATION:-centralus}" + + # Read private NAT subnet ID from SHARED_DIR (written by setup-private-link step) + AZURE_PRIVATE_NAT_SUBNET_ID="" + if [[ -f "${SHARED_DIR}/azure_private_nat_subnet_id" ]]; then + AZURE_PRIVATE_NAT_SUBNET_ID="$(cat "${SHARED_DIR}/azure_private_nat_subnet_id")" + fi + + # Marketplace image flags + MARKETPLACE_ARGS="" + if [[ -n "${HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_PUBLISHER:-}" ]]; then + MARKETPLACE_ARGS="--marketplace-publisher=${HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_PUBLISHER} --marketplace-offer=${HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_OFFER}" + if [[ -n "${HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_SKU:-}" ]]; then + MARKETPLACE_ARGS="${MARKETPLACE_ARGS} --marketplace-sku=${HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_SKU}" + elif [[ -f "${SHARED_DIR}/azure-marketplace-image-sku" ]]; then + MARKETPLACE_ARGS="${MARKETPLACE_ARGS} --marketplace-sku=$(cat "${SHARED_DIR}/azure-marketplace-image-sku")" + fi + if [[ -n "${HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_VERSION:-}" ]]; then + MARKETPLACE_ARGS="${MARKETPLACE_ARGS} 
--marketplace-version=${HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_VERSION}" + elif [[ -f "${SHARED_DIR}/azure-marketplace-image-version" ]]; then + MARKETPLACE_ARGS="${MARKETPLACE_ARGS} --marketplace-version=$(cat "${SHARED_DIR}/azure-marketplace-image-version")" + fi + fi + + # Common flags for all self-managed clusters + COMMON_FLAGS="--node-pool-replicas=${HYPERSHIFT_NODE_COUNT} \ + --base-domain=${HYPERSHIFT_BASE_DOMAIN} \ + --pull-secret=${PULL_SECRET_PATH} \ + --azure-creds=${AZURE_CREDS} \ + --location=${HC_LOCATION} \ + --release-image=${RELEASE_IMAGE} \ + --oidc-issuer-url=${AZURE_OIDC_ISSUER_URL} \ + --sa-token-issuer-private-key-path=${AZURE_SA_TOKEN_ISSUER_KEY_PATH} \ + --workload-identities-file=${AZURE_WORKLOAD_IDENTITIES_FILE} \ + --generate-ssh \ + ${MARKETPLACE_ARGS}" + + # Create public cluster + echo "$(date) Creating public self-managed cluster: ${PUBLIC_NAME}" + /usr/bin/hypershift create cluster azure \ + --name="${PUBLIC_NAME}" \ + ${COMMON_FLAGS} & + PUBLIC_PID=$! + + # Create private cluster + PRIVATE_EXTRA="" + if [[ -n "${AZURE_PRIVATE_NAT_SUBNET_ID}" ]]; then + PRIVATE_EXTRA="${PRIVATE_EXTRA} --endpoint-access-private-nat-subnet-id=${AZURE_PRIVATE_NAT_SUBNET_ID}" + fi + echo "$(date) Creating private self-managed cluster: ${PRIVATE_NAME}" + /usr/bin/hypershift create cluster azure \ + --name="${PRIVATE_NAME}" \ + --endpoint-access=Private \ + ${COMMON_FLAGS} \ + ${PRIVATE_EXTRA} & + PRIVATE_PID=$! + + # Create OAuth LoadBalancer cluster + echo "$(date) Creating OAuth LB self-managed cluster: ${OAUTH_LB_NAME}" + /usr/bin/hypershift create cluster azure \ + --name="${OAUTH_LB_NAME}" \ + --oauth-publishing-strategy=LoadBalancer \ + ${COMMON_FLAGS} & + OAUTH_LB_PID=$! + + # Wait for create commands to complete + echo "$(date) Waiting for cluster create commands to finish..." 
+ wait ${PUBLIC_PID} + echo "$(date) Public cluster create command completed" + wait ${PRIVATE_PID} + echo "$(date) Private cluster create command completed" + wait ${OAUTH_LB_PID} + echo "$(date) OAuth LB cluster create command completed" + + # Wait for clusters to become available + echo "$(date) Waiting for public cluster to become available..." + oc wait --timeout=30m --for=condition=Available --namespace=clusters "hostedcluster/${PUBLIC_NAME}" + echo "$(date) Public cluster is available" + + echo "$(date) Waiting for private cluster to become available..." + oc wait --timeout=30m --for=condition=Available --namespace=clusters "hostedcluster/${PRIVATE_NAME}" + echo "$(date) Private cluster is available" + + echo "$(date) Waiting for OAuth LB cluster to become available..." + oc wait --timeout=30m --for=condition=Available --namespace=clusters "hostedcluster/${OAUTH_LB_NAME}" + echo "$(date) OAuth LB cluster is available" + + # Wait for clusteroperators on each cluster + for CLUSTER in "${PUBLIC_NAME}" "${PRIVATE_NAME}" "${OAUTH_LB_NAME}"; do + echo "$(date) Creating kubeconfig for ${CLUSTER}" + bin/hypershift create kubeconfig --namespace=clusters --name="${CLUSTER}" > "${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" + echo "$(date) Waiting for clusteroperators on ${CLUSTER}..." 
+ KUBECONFIG="${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" oc wait clusterversion/version --for='condition=Available=True' --timeout=30m || true + done + + # Write cluster names to shared dir + echo "${PUBLIC_NAME}" > "${SHARED_DIR}/cluster-name-public" + echo "${PRIVATE_NAME}" > "${SHARED_DIR}/cluster-name-private" + echo "${OAUTH_LB_NAME}" > "${SHARED_DIR}/cluster-name-oauth-lb" + + echo "$(date) All self-managed guest clusters are ready" + from: hypershift-operator + grace_period: 5m0s + resources: + requests: + cpu: 100m + memory: 100Mi + timeout: 60m0s + credentials: + - mount_path: /etc/ci-pull-credentials + name: ci-pull-credentials + namespace: test-credentials + - mount_path: /etc/hypershift-ci-jobs-self-managed-azure + name: hypershift-ci-jobs-self-managed-azure + namespace: test-credentials + - mount_path: /etc/hypershift-ci-jobs-self-managed-azure-e2e + name: hypershift-ci-jobs-self-managed-azure-e2e + namespace: test-credentials + dependencies: + - name: "release:latest" + env: RELEASE_IMAGE_LATEST diff --git a/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/OWNERS b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/OWNERS new file mode 100644 index 0000000000000..6c30e7a30d980 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/OWNERS @@ -0,0 +1,11 @@ +approvers: +- csrwng +- enxebre +- sjenning +- bryan-cox +options: {} +reviewers: +- csrwng +- enxebre +- sjenning +- bryan-cox diff --git a/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.metadata.json b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.metadata.json new file mode 100644 index 0000000000000..6e7260c7b30f9 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.metadata.json @@ -0,0 
+1,17 @@ +{ + "path": "hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml", + "owners": { + "approvers": [ + "csrwng", + "enxebre", + "sjenning", + "bryan-cox" + ], + "reviewers": [ + "csrwng", + "enxebre", + "sjenning", + "bryan-cox" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml new file mode 100644 index 0000000000000..b39188327a0c8 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml @@ -0,0 +1,45 @@ +chain: + as: hypershift-azure-destroy-selfmanaged-guests + steps: + - as: destroy-guests + best_effort: true + cli: latest + env: + - name: HYPERSHIFT_AZURE_LOCATION + default: "centralus" + documentation: "Specifies the Azure location of the clusters." 
+ commands: |- + set -xuo pipefail + + # Use the nested management cluster kubeconfig + export KUBECONFIG="${SHARED_DIR}/management_cluster_kubeconfig" + + AZURE_CREDS="/etc/hypershift-ci-jobs-self-managed-azure/credentials.json" + HC_LOCATION="${HYPERSHIFT_AZURE_LOCATION:-centralus}" + + # Re-derive cluster names from job ID (same logic as create step) + PUBLIC_NAME="$(echo -n "${PROW_JOB_ID}-pub"|sha256sum|cut -c-20)" + PRIVATE_NAME="$(echo -n "${PROW_JOB_ID}-prv"|sha256sum|cut -c-20)" + OAUTH_LB_NAME="$(echo -n "${PROW_JOB_ID}-oau"|sha256sum|cut -c-20)" + + # Destroy all clusters, continuing on errors + for CLUSTER in "${PUBLIC_NAME}" "${PRIVATE_NAME}" "${OAUTH_LB_NAME}"; do + echo "$(date) Destroying self-managed cluster: ${CLUSTER}" + bin/hypershift destroy cluster azure \ + --azure-creds="${AZURE_CREDS}" \ + --name="${CLUSTER}" \ + --location="${HC_LOCATION}" \ + --cluster-grace-period=40m || echo "$(date) WARNING: Failed to destroy cluster ${CLUSTER}" + echo "$(date) Finished destroying cluster ${CLUSTER}" + done + from: hypershift-operator + grace_period: 5m0s + resources: + requests: + cpu: 100m + memory: 100Mi + timeout: 130m0s # three sequential destroys, each allowed --cluster-grace-period=40m, plus margin + credentials: + - mount_path: /etc/hypershift-ci-jobs-self-managed-azure + name: hypershift-ci-jobs-self-managed-azure + namespace: test-credentials diff --git a/ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/OWNERS b/ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/OWNERS new file mode 100644 index 0000000000000..6c30e7a30d980 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/OWNERS @@ -0,0 +1,11 @@ +approvers: +- csrwng +- enxebre +- sjenning +- bryan-cox +options: {} +reviewers: +- csrwng +- enxebre +- sjenning +- bryan-cox diff --git a/ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/hypershift-azure-e2e-v2-self-managed-workflow.metadata.json 
b/ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/hypershift-azure-e2e-v2-self-managed-workflow.metadata.json new file mode 100644 index 0000000000000..6bbdd6096ef08 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/hypershift-azure-e2e-v2-self-managed-workflow.metadata.json @@ -0,0 +1,17 @@ +{ + "path": "hypershift/azure/e2e/v2-self-managed/hypershift-azure-e2e-v2-self-managed-workflow.yaml", + "owners": { + "approvers": [ + "csrwng", + "enxebre", + "sjenning", + "bryan-cox" + ], + "reviewers": [ + "csrwng", + "enxebre", + "sjenning", + "bryan-cox" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/hypershift-azure-e2e-v2-self-managed-workflow.yaml b/ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/hypershift-azure-e2e-v2-self-managed-workflow.yaml new file mode 100644 index 0000000000000..4362a59bcb253 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/e2e/v2-self-managed/hypershift-azure-e2e-v2-self-managed-workflow.yaml @@ -0,0 +1,39 @@ +workflow: + as: hypershift-azure-e2e-v2-self-managed + documentation: |- + The HyperShift Azure e2e v2 self-managed workflow provisions a nested management + cluster on self-managed Azure infrastructure, installs the HyperShift operator, creates + three self-managed Azure guest clusters (public, private, OAuth LoadBalancer), and runs + the v2 Ginkgo test suite against each. Each test spec produces a separate JUnit XML + entry for Sippy. + + The HyperShift launch capability is currently supported by the HyperShift + team. 
For now, please direct all questions and comments to: + + - Alberto Lamela (agarcial@redhat.com) + - Seth Jennings (sjenning@redhat.com) + - Cesar Wong (cewong@redhat.com) + - Bryan Cox (brcox@redhat.com) + + Learn more about HyperShift here: https://github.com/openshift/hypershift + + Track HyperShift's development here: https://issues.redhat.com/projects/CNTRLPLANE/summary + steps: + pre: + - ref: ipi-install-rbac + - chain: hypershift-setup-nested-management-cluster + - ref: hypershift-azure-setup-private-link + - ref: hypershift-install + - chain: hypershift-azure-create-selfmanaged-guests + test: + - chain: hypershift-azure-run-e2e-v2-selfmanaged + post: + - ref: hypershift-analyze-e2e-failure + - chain: hypershift-azure-destroy-selfmanaged-guests + - chain: hypershift-destroy-nested-management-cluster + env: + CLOUD_PROVIDER: "Azure" + HYPERSHIFT_NODE_COUNT: "3" + HYPERSHIFT_AZURE_LOCATION: "centralus" + AZURE_SELF_MANAGED: "true" + HYPERSHIFT_EXTERNAL_DNS_DOMAIN: "aks-e2e.hypershift.azure.devcluster.openshift.com" diff --git a/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/OWNERS b/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/OWNERS new file mode 100644 index 0000000000000..6c30e7a30d980 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/OWNERS @@ -0,0 +1,11 @@ +approvers: +- csrwng +- enxebre +- sjenning +- bryan-cox +options: {} +reviewers: +- csrwng +- enxebre +- sjenning +- bryan-cox diff --git a/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.metadata.json b/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.metadata.json new file mode 100644 index 0000000000000..e2104c39cc8da --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.metadata.json @@ -0,0 +1,17 @@ +{ + "path": 
"hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml", + "owners": { + "approvers": [ + "csrwng", + "enxebre", + "sjenning", + "bryan-cox" + ], + "reviewers": [ + "csrwng", + "enxebre", + "sjenning", + "bryan-cox" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml b/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml new file mode 100644 index 0000000000000..5da816b993776 --- /dev/null +++ b/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml @@ -0,0 +1,63 @@ +chain: + as: hypershift-azure-run-e2e-v2-selfmanaged + steps: + - as: tests + cli: latest + commands: |- + set -xuo pipefail + + # Use the nested management cluster kubeconfig + export KUBECONFIG="${SHARED_DIR}/management_cluster_kubeconfig" + export EVENTUALLY_VERBOSE="false" + + # Export private NAT subnet ID for private topology tests + AZURE_PRIVATE_NAT_SUBNET_ID="" + if [[ -f "${SHARED_DIR}/azure_private_nat_subnet_id" ]]; then + AZURE_PRIVATE_NAT_SUBNET_ID="$(cat "${SHARED_DIR}/azure_private_nat_subnet_id")" + fi + export AZURE_PRIVATE_NAT_SUBNET_ID + + PUBLIC_NAME="$(cat "${SHARED_DIR}/cluster-name-public")" + PRIVATE_NAME="$(cat "${SHARED_DIR}/cluster-name-private")" + OAUTH_LB_NAME="$(cat "${SHARED_DIR}/cluster-name-oauth-lb")" + + OVERALL_EXIT=0 + + # Run public cluster tests + echo "$(date) Running public cluster tests against ${PUBLIC_NAME}..." 
+ E2E_HOSTED_CLUSTER_NAME="${PUBLIC_NAME}" \ + E2E_HOSTED_CLUSTER_NAMESPACE=clusters \ + bin/test-e2e-self-managed-azure \ + --ginkgo.label-filter="self-managed-azure-public" \ + --ginkgo.junit-report="${ARTIFACT_DIR}/junit_self_managed_azure_public.xml" \ + --ginkgo.v || OVERALL_EXIT=1 + echo "$(date) Public cluster tests finished" + + # Run private topology tests + echo "$(date) Running private topology tests against ${PRIVATE_NAME}..." + E2E_HOSTED_CLUSTER_NAME="${PRIVATE_NAME}" \ + E2E_HOSTED_CLUSTER_NAMESPACE=clusters \ + bin/test-e2e-self-managed-azure \ + --ginkgo.label-filter="self-managed-azure-private" \ + --ginkgo.junit-report="${ARTIFACT_DIR}/junit_self_managed_azure_private.xml" \ + --ginkgo.v || OVERALL_EXIT=1 + echo "$(date) Private topology tests finished" + + # Run OAuth LoadBalancer tests + echo "$(date) Running OAuth LB tests against ${OAUTH_LB_NAME}..." + E2E_HOSTED_CLUSTER_NAME="${OAUTH_LB_NAME}" \ + E2E_HOSTED_CLUSTER_NAMESPACE=clusters \ + bin/test-e2e-self-managed-azure \ + --ginkgo.label-filter="self-managed-azure-oauth-lb" \ + --ginkgo.junit-report="${ARTIFACT_DIR}/junit_self_managed_azure_oauth_lb.xml" \ + --ginkgo.v || OVERALL_EXIT=1 + echo "$(date) OAuth LB tests finished" + + exit ${OVERALL_EXIT} + timeout: 45m + grace_period: 5m + from: hypershift-tests + resources: + requests: + cpu: "1" + memory: 200Mi From 0d74efd7ca24c1fdc99e0ead250241928372d946 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 13 Apr 2026 12:17:16 -0400 Subject: [PATCH 02/12] fix(ci): address CodeRabbit review findings - Use /usr/bin/hypershift instead of bin/hypershift for consistency - Log warning on clusterversion wait timeout instead of silent || true - Wait for all background cluster create PIDs before failing Co-Authored-By: Claude Opus 4.6 --- ...ift-azure-create-selfmanaged-guests-chain.yaml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git 
a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml index b135067740adf..a1553c9202e16 100644 --- a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml @@ -112,12 +112,17 @@ chain: # Wait for create commands to complete echo "$(date) Waiting for cluster create commands to finish..." - wait ${PUBLIC_PID} + FAILED=0 + wait ${PUBLIC_PID} || FAILED=1 echo "$(date) Public cluster create command completed" - wait ${PRIVATE_PID} + wait ${PRIVATE_PID} || FAILED=1 echo "$(date) Private cluster create command completed" - wait ${OAUTH_LB_PID} + wait ${OAUTH_LB_PID} || FAILED=1 echo "$(date) OAuth LB cluster create command completed" + if [[ ${FAILED} -ne 0 ]]; then + echo "$(date) ERROR: One or more cluster create commands failed" + exit 1 + fi # Wait for clusters to become available echo "$(date) Waiting for public cluster to become available..." @@ -135,9 +140,9 @@ chain: # Wait for clusteroperators on each cluster for CLUSTER in "${PUBLIC_NAME}" "${PRIVATE_NAME}" "${OAUTH_LB_NAME}"; do echo "$(date) Creating kubeconfig for ${CLUSTER}" - bin/hypershift create kubeconfig --namespace=clusters --name="${CLUSTER}" > "${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" + /usr/bin/hypershift create kubeconfig --namespace=clusters --name="${CLUSTER}" > "${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" echo "$(date) Waiting for clusteroperators on ${CLUSTER}..." 
- KUBECONFIG="${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" oc wait clusterversion/version --for='condition=Available=True' --timeout=30m || true + KUBECONFIG="${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" oc wait clusterversion/version --for='condition=Available=True' --timeout=30m || echo "$(date) WARNING: clusterversion wait timed out for ${CLUSTER}, continuing anyway" done # Write cluster names to shared dir From 8ac08aa063f33a42976511d1721959ebfc9789e1 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 13 Apr 2026 12:24:18 -0400 Subject: [PATCH 03/12] fix(ci): increase create-selfmanaged-guests step timeout to 120m The step runs sequential 30m waits for cluster availability and clusterversion across 3 clusters, so 60m was too tight. Co-Authored-By: Claude Opus 4.6 --- .../hypershift-azure-create-selfmanaged-guests-chain.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml index a1553c9202e16..5e6e71825e683 100644 --- a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml @@ -157,7 +157,7 @@ chain: requests: cpu: 100m memory: 100Mi - timeout: 60m0s + timeout: 120m0s credentials: - mount_path: /etc/ci-pull-credentials name: ci-pull-credentials From 1c45c36fdf6f64b7370d41d9af9295260dfa3d80 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 13 Apr 2026 12:41:58 -0400 Subject: [PATCH 04/12] fix(ci): address CodeRabbit review - fail fast on missing prereqs - Fail if azure_private_nat_subnet_id file is missing instead of silently omitting the private-link flag - Fail the step if any cluster doesn't reach 
clusterversion Available instead of warning and continuing to e2e Co-Authored-By: Claude Opus 4.6 --- ...azure-create-selfmanaged-guests-chain.yaml | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml index 5e6e71825e683..cf0e3ab9063e3 100644 --- a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml @@ -48,10 +48,11 @@ chain: HC_LOCATION="${HYPERSHIFT_AZURE_LOCATION:-centralus}" # Read private NAT subnet ID from SHARED_DIR (written by setup-private-link step) - AZURE_PRIVATE_NAT_SUBNET_ID="" - if [[ -f "${SHARED_DIR}/azure_private_nat_subnet_id" ]]; then - AZURE_PRIVATE_NAT_SUBNET_ID="$(cat "${SHARED_DIR}/azure_private_nat_subnet_id")" + if [[ ! -s "${SHARED_DIR}/azure_private_nat_subnet_id" ]]; then + echo "$(date) ERROR: azure_private_nat_subnet_id is required for the private guest cluster" + exit 1 fi + AZURE_PRIVATE_NAT_SUBNET_ID="$(cat "${SHARED_DIR}/azure_private_nat_subnet_id")" # Marketplace image flags MARKETPLACE_ARGS="" @@ -90,10 +91,7 @@ chain: PUBLIC_PID=$! 
# Create private cluster - PRIVATE_EXTRA="" - if [[ -n "${AZURE_PRIVATE_NAT_SUBNET_ID}" ]]; then - PRIVATE_EXTRA="${PRIVATE_EXTRA} --endpoint-access-private-nat-subnet-id=${AZURE_PRIVATE_NAT_SUBNET_ID}" - fi + PRIVATE_EXTRA="--endpoint-access-private-nat-subnet-id=${AZURE_PRIVATE_NAT_SUBNET_ID}" echo "$(date) Creating private self-managed cluster: ${PRIVATE_NAME}" /usr/bin/hypershift create cluster azure \ --name="${PRIVATE_NAME}" \ @@ -138,12 +136,19 @@ chain: echo "$(date) OAuth LB cluster is available" # Wait for clusteroperators on each cluster + FAILED_READY=0 for CLUSTER in "${PUBLIC_NAME}" "${PRIVATE_NAME}" "${OAUTH_LB_NAME}"; do echo "$(date) Creating kubeconfig for ${CLUSTER}" /usr/bin/hypershift create kubeconfig --namespace=clusters --name="${CLUSTER}" > "${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" echo "$(date) Waiting for clusteroperators on ${CLUSTER}..." - KUBECONFIG="${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" oc wait clusterversion/version --for='condition=Available=True' --timeout=30m || echo "$(date) WARNING: clusterversion wait timed out for ${CLUSTER}, continuing anyway" + if ! KUBECONFIG="${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" oc wait clusterversion/version --for='condition=Available=True' --timeout=30m; then + echo "$(date) ERROR: clusterversion wait timed out for ${CLUSTER}" + FAILED_READY=1 + fi done + if [[ ${FAILED_READY} -ne 0 ]]; then + exit 1 + fi # Write cluster names to shared dir echo "${PUBLIC_NAME}" > "${SHARED_DIR}/cluster-name-public" From 2703af268211c3523a215a626b49ede505d55b03 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 13 Apr 2026 13:38:35 -0400 Subject: [PATCH 05/12] fix(ci): use shared e2e v2 binary for self-managed Azure tests Update the run chain to use bin/test-e2e-v2 (shared binary) instead of bin/test-e2e-self-managed-azure (separate binary). Azure tests now live in the shared test/e2e/v2/tests/ package and self-select via Ginkgo label filters and capability-based Skip() logic. 
Co-Authored-By: Claude Opus 4.6 --- .../hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml b/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml index 5da816b993776..a86a749313096 100644 --- a/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/run-e2e-v2-selfmanaged/hypershift-azure-run-e2e-v2-selfmanaged-chain.yaml @@ -27,7 +27,7 @@ chain: echo "$(date) Running public cluster tests against ${PUBLIC_NAME}..." E2E_HOSTED_CLUSTER_NAME="${PUBLIC_NAME}" \ E2E_HOSTED_CLUSTER_NAMESPACE=clusters \ - bin/test-e2e-self-managed-azure \ + bin/test-e2e-v2 \ --ginkgo.label-filter="self-managed-azure-public" \ --ginkgo.junit-report="${ARTIFACT_DIR}/junit_self_managed_azure_public.xml" \ --ginkgo.v || OVERALL_EXIT=1 @@ -37,7 +37,7 @@ chain: echo "$(date) Running private topology tests against ${PRIVATE_NAME}..." E2E_HOSTED_CLUSTER_NAME="${PRIVATE_NAME}" \ E2E_HOSTED_CLUSTER_NAMESPACE=clusters \ - bin/test-e2e-self-managed-azure \ + bin/test-e2e-v2 \ --ginkgo.label-filter="self-managed-azure-private" \ --ginkgo.junit-report="${ARTIFACT_DIR}/junit_self_managed_azure_private.xml" \ --ginkgo.v || OVERALL_EXIT=1 @@ -47,7 +47,7 @@ chain: echo "$(date) Running OAuth LB tests against ${OAUTH_LB_NAME}..." 
E2E_HOSTED_CLUSTER_NAME="${OAUTH_LB_NAME}" \ E2E_HOSTED_CLUSTER_NAMESPACE=clusters \ - bin/test-e2e-self-managed-azure \ + bin/test-e2e-v2 \ --ginkgo.label-filter="self-managed-azure-oauth-lb" \ --ginkgo.junit-report="${ARTIFACT_DIR}/junit_self_managed_azure_oauth_lb.xml" \ --ginkgo.v || OVERALL_EXIT=1 From bcb8b366fbd0073b22b3a4d41742d1c544d85178 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 13 Apr 2026 13:48:22 -0400 Subject: [PATCH 06/12] fix(ci): preserve destroy failure signal and fix binary path Address CodeRabbit review on destroy chain: - Replace || echo pattern with failed flag + exit non-zero to surface leaked resources when cluster destroy fails - Use /usr/bin/hypershift for path consistency with create chain Co-Authored-By: Claude Opus 4.6 --- ...rshift-azure-destroy-selfmanaged-guests-chain.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml index b39188327a0c8..5c45c3a62e36c 100644 --- a/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml @@ -22,16 +22,21 @@ chain: PRIVATE_NAME="$(echo -n "${PROW_JOB_ID}-prv"|sha256sum|cut -c-20)" OAUTH_LB_NAME="$(echo -n "${PROW_JOB_ID}-oau"|sha256sum|cut -c-20)" - # Destroy all clusters, continuing on errors + # Destroy all clusters, continue per-cluster, but preserve failure signal + FAILED=0 for CLUSTER in "${PUBLIC_NAME}" "${PRIVATE_NAME}" "${OAUTH_LB_NAME}"; do echo "$(date) Destroying self-managed cluster: ${CLUSTER}" - bin/hypershift destroy cluster azure \ + if ! 
/usr/bin/hypershift destroy cluster azure \ --azure-creds="${AZURE_CREDS}" \ --name="${CLUSTER}" \ --location="${HC_LOCATION}" \ - --cluster-grace-period=40m || echo "$(date) WARNING: Failed to destroy cluster ${CLUSTER}" + --cluster-grace-period=40m; then + echo "$(date) WARNING: Failed to destroy cluster ${CLUSTER}" >&2 + FAILED=1 + fi echo "$(date) Finished destroying cluster ${CLUSTER}" done + exit "${FAILED}" from: hypershift-operator grace_period: 5m0s resources: From 4c143dee84bc8f9f21ef0d23814665dbc34f7d12 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 13 Apr 2026 13:54:09 -0400 Subject: [PATCH 07/12] fix: update pipeline_run_if_changed path for v2 self-managed Azure job The selfmanagedazure package was removed and tests now live in the shared v2 binary at test/e2e/v2/tests/hosted_cluster_azure_test.go. Update the path filter to match the new location. Co-Authored-By: Claude Opus 4.6 --- .../config/openshift/hypershift/openshift-hypershift-main.yaml | 2 +- .../hypershift/openshift-hypershift-main-presubmits.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci-operator/config/openshift/hypershift/openshift-hypershift-main.yaml b/ci-operator/config/openshift/hypershift/openshift-hypershift-main.yaml index af7d7a6bc1562..5b19e166d6850 100644 --- a/ci-operator/config/openshift/hypershift/openshift-hypershift-main.yaml +++ b/ci-operator/config/openshift/hypershift/openshift-hypershift-main.yaml @@ -178,7 +178,7 @@ tests: - always_run: false as: e2e-azure-v2-self-managed optional: true - pipeline_run_if_changed: ^(test/e2e/v2/selfmanagedazure|test/e2e/util) + pipeline_run_if_changed: ^(test/e2e/v2/tests/hosted_cluster_azure|test/e2e/util) steps: cluster_profile: hypershift-azure env: diff --git a/ci-operator/jobs/openshift/hypershift/openshift-hypershift-main-presubmits.yaml b/ci-operator/jobs/openshift/hypershift/openshift-hypershift-main-presubmits.yaml index 7ca1dd89726dc..0c274d6d0dc0c 100644 --- 
a/ci-operator/jobs/openshift/hypershift/openshift-hypershift-main-presubmits.yaml +++ b/ci-operator/jobs/openshift/hypershift/openshift-hypershift-main-presubmits.yaml @@ -1308,7 +1308,7 @@ presubmits: - agent: kubernetes always_run: false annotations: - pipeline_run_if_changed: ^(test/e2e/v2/selfmanagedazure|test/e2e/util) + pipeline_run_if_changed: ^(test/e2e/v2/tests/hosted_cluster_azure|test/e2e/util) branches: - ^main$ - ^main- From 06805ce6b2de78cd08798e895bb0287b3eda119b Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 13 Apr 2026 17:11:31 -0400 Subject: [PATCH 08/12] fix: increase destroy-guests timeout to 2h for sequential Azure cluster teardown The 30m timeout is insufficient for destroying 3 Azure clusters sequentially, especially when each cluster has a 40m grace period. The first destroy alone can consume the full 30m budget, leaving the remaining clusters orphaned. Co-Authored-By: Claude Opus 4.6 --- .../hypershift-azure-destroy-selfmanaged-guests-chain.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml index 5c45c3a62e36c..bb2a630462520 100644 --- a/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/destroy-selfmanaged-guests/hypershift-azure-destroy-selfmanaged-guests-chain.yaml @@ -43,7 +43,7 @@ chain: requests: cpu: 100m memory: 100Mi - timeout: 30m0s + timeout: 2h0m0s credentials: - mount_path: /etc/hypershift-ci-jobs-self-managed-azure name: hypershift-ci-jobs-self-managed-azure From 988b0a8683dcc0e2ce259fa126cb3ca160f8f519 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 13 Apr 2026 17:42:23 -0400 Subject: [PATCH 09/12] fix: use management-side 
version rollout check instead of guest clusterversion Mirror the AWS/GCP v2 create pattern: poll hostedcluster.status.version.history[].state via the management API instead of connecting to the guest API with oc wait clusterversion. The previous approach created a guest kubeconfig and ran `oc wait clusterversion/version` from the CI pod, which: - Cannot resolve .hypershift.local DNS for private clusters - Requires guest API network access the CI pod doesn't have The new approach polls the HC resource on the management cluster, which works for all endpoint access modes, and emits per-cluster JUnit XML artifacts with diagnostic info on failure. Co-Authored-By: Claude Opus 4.6 --- ...azure-create-selfmanaged-guests-chain.yaml | 46 ++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml index cf0e3ab9063e3..afd9a65cca460 100644 --- a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml @@ -135,15 +135,49 @@ chain: oc wait --timeout=30m --for=condition=Available --namespace=clusters "hostedcluster/${OAUTH_LB_NAME}" echo "$(date) OAuth LB cluster is available" - # Wait for clusteroperators on each cluster + # Wait for version rollout to complete on each cluster (via management API, same as AWS/GCP v2) FAILED_READY=0 for CLUSTER in "${PUBLIC_NAME}" "${PRIVATE_NAME}" "${OAUTH_LB_NAME}"; do - echo "$(date) Creating kubeconfig for ${CLUSTER}" - /usr/bin/hypershift create kubeconfig --namespace=clusters --name="${CLUSTER}" > "${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" - echo "$(date) Waiting for 
clusteroperators on ${CLUSTER}..." - if ! KUBECONFIG="${SHARED_DIR}/nested_kubeconfig_${CLUSTER}" oc wait clusterversion/version --for='condition=Available=True' --timeout=30m; then - echo "$(date) ERROR: clusterversion wait timed out for ${CLUSTER}" + echo "$(date) Waiting for version rollout on ${CLUSTER}..." + set +e + CLUSTER_CHECK="${CLUSTER}" timeout 30m bash -c ' + until [[ "$(oc get -n clusters hostedcluster/${CLUSTER_CHECK} -o jsonpath='"'"'{.status.version.history[?(@.state!="")].state}'"'"')" = "Completed" ]]; do + sleep 15 + done + ' + ROLLOUT_RC=$? + set -e + if [[ ${ROLLOUT_RC} -ne 0 ]]; then + echo "$(date) ERROR: version rollout timed out for ${CLUSTER}" + cat << EOF > "${ARTIFACT_DIR}/junit_hosted_cluster_${CLUSTER}.xml" + + + + + + + + + EOF FAILED_READY=1 + else + echo "$(date) Version rollout completed for ${CLUSTER}" + cat << EOF > "${ARTIFACT_DIR}/junit_hosted_cluster_${CLUSTER}.xml" + + + + + + + + + EOF fi done if [[ ${FAILED_READY} -ne 0 ]]; then From 4a53e9d28a2e9e3e6a15a43f3c2b46ca6f37c157 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Tue, 14 Apr 2026 06:34:00 -0400 Subject: [PATCH 10/12] fix(ci): parallelize version rollout checks and increase timeout to 45m All 3 cluster version rollout checks were running sequentially with 30m timeouts each. Azure self-managed 3-node clusters need more than 30m for all cluster operators to become available after the HC reaches Available state. The sequential approach also consumed 90m of the 120m step timeout budget just for version rollout polling. 
Changes: - Increase per-cluster version rollout timeout from 30m to 45m - Run all 3 version rollout checks in parallel (background subshells) - Wait for each individually and generate per-cluster JUnit XML artifacts Co-Authored-By: Claude Opus 4.6 --- ...azure-create-selfmanaged-guests-chain.yaml | 44 ++++++++++++++----- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml index afd9a65cca460..d6cf45a5c56ed 100644 --- a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml @@ -135,18 +135,41 @@ chain: oc wait --timeout=30m --for=condition=Available --namespace=clusters "hostedcluster/${OAUTH_LB_NAME}" echo "$(date) OAuth LB cluster is available" - # Wait for version rollout to complete on each cluster (via management API, same as AWS/GCP v2) + # Wait for version rollout to complete on all clusters in parallel (via management API, same as AWS/GCP v2) + echo "$(date) Starting parallel version rollout checks..." + set +e + + echo "$(date) Waiting for version rollout on ${PUBLIC_NAME}..." + CLUSTER_CHECK="${PUBLIC_NAME}" timeout 45m bash -c ' + until [[ "$(oc get -n clusters hostedcluster/${CLUSTER_CHECK} -o jsonpath='"'"'{.status.version.history[?(@.state!="")].state}'"'"')" = "Completed" ]]; do + sleep 15 + done + ' & + ROLLOUT_PID_PUB=$! + + echo "$(date) Waiting for version rollout on ${PRIVATE_NAME}..." 
+ CLUSTER_CHECK="${PRIVATE_NAME}" timeout 45m bash -c ' + until [[ "$(oc get -n clusters hostedcluster/${CLUSTER_CHECK} -o jsonpath='"'"'{.status.version.history[?(@.state!="")].state}'"'"')" = "Completed" ]]; do + sleep 15 + done + ' & + ROLLOUT_PID_PRV=$! + + echo "$(date) Waiting for version rollout on ${OAUTH_LB_NAME}..." + CLUSTER_CHECK="${OAUTH_LB_NAME}" timeout 45m bash -c ' + until [[ "$(oc get -n clusters hostedcluster/${CLUSTER_CHECK} -o jsonpath='"'"'{.status.version.history[?(@.state!="")].state}'"'"')" = "Completed" ]]; do + sleep 15 + done + ' & + ROLLOUT_PID_OAU=$! + + echo "$(date) Waiting for all version rollout checks to complete..." FAILED_READY=0 - for CLUSTER in "${PUBLIC_NAME}" "${PRIVATE_NAME}" "${OAUTH_LB_NAME}"; do - echo "$(date) Waiting for version rollout on ${CLUSTER}..." - set +e - CLUSTER_CHECK="${CLUSTER}" timeout 30m bash -c ' - until [[ "$(oc get -n clusters hostedcluster/${CLUSTER_CHECK} -o jsonpath='"'"'{.status.version.history[?(@.state!="")].state}'"'"')" = "Completed" ]]; do - sleep 15 - done - ' + for CLUSTER_PID in "${PUBLIC_NAME}:${ROLLOUT_PID_PUB}" "${PRIVATE_NAME}:${ROLLOUT_PID_PRV}" "${OAUTH_LB_NAME}:${ROLLOUT_PID_OAU}"; do + CLUSTER="${CLUSTER_PID%%:*}" + PID="${CLUSTER_PID##*:}" + wait ${PID} ROLLOUT_RC=$? - set -e if [[ ${ROLLOUT_RC} -ne 0 ]]; then echo "$(date) ERROR: version rollout timed out for ${CLUSTER}" cat << EOF > "${ARTIFACT_DIR}/junit_hosted_cluster_${CLUSTER}.xml" @@ -180,6 +203,7 @@ chain: EOF fi done + set -e if [[ ${FAILED_READY} -ne 0 ]]; then exit 1 fi From 31f16c3908f2ddbef8950a269bca92c897304952 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Tue, 14 Apr 2026 06:54:55 -0400 Subject: [PATCH 11/12] fix(ci): add --assign-service-principal-roles and --dns-zone-rg-name Root cause: the create-guests chain was passing --workload-identities-file but NOT --assign-service-principal-roles. Without this flag, no Azure RBAC role assignments are created for the workload identities on the managed resource group. 
This caused all guest cluster operators (ingress, dns, storage, etc.) to fail with 403 AuthorizationFailed when accessing Azure APIs, preventing version rollout from ever completing. The v1 e2e flow (test/e2e/util/options.go) sets both AssignServicePrincipalRoles=true and DNSZoneRGName="os4-common". Evidence from CVO logs: - ingress: "The client 'd33639a4-...' does not have authorization to perform action 'Microsoft.Network/loadBalancers/read'" - storage: "Waiting for the DaemonSet to deploy the CSI Node Service" - 11 cluster operators stuck not-available, version.history.state=Partial Co-Authored-By: Claude Opus 4.6 --- .../hypershift-azure-create-selfmanaged-guests-chain.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml index d6cf45a5c56ed..13299dc6cb804 100644 --- a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml @@ -80,6 +80,8 @@ chain: --oidc-issuer-url=${AZURE_OIDC_ISSUER_URL} \ --sa-token-issuer-private-key-path=${AZURE_SA_TOKEN_ISSUER_KEY_PATH} \ --workload-identities-file=${AZURE_WORKLOAD_IDENTITIES_FILE} \ + --assign-service-principal-roles \ + --dns-zone-rg-name=os4-common \ --generate-ssh \ ${MARKETPLACE_ARGS}" From fb808fe4ed380fdfe539d8c9cbe1875f62931d00 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Tue, 14 Apr 2026 12:04:40 -0400 Subject: [PATCH 12/12] fix(ci): increase private cluster version rollout timeout to 60m Private topology clusters require additional time for data-plane operator convergence because PrivateLink infrastructure (PLS creation, Private Endpoint, DNS zone provisioning, and DNS 
propagation) must complete before operators like ingress, dns, console, and storage can reach the management cluster APIs through the PrivateLink tunnel. Evidence from rehearsal 2044008339955781632: - Public cluster: version rollout completed in ~6 min - OAuth LB cluster: version rollout completed - Private cluster: timed out at 45m with 11 operators not-available (console, dns, image-registry, ingress, insights, kube-storage-version-migrator, monitoring, node-tuning, openshift-samples, service-ca, storage) - Cluster was NOT degraded (RBAC fix working), confirming this is a PrivateLink networking convergence timing issue Co-Authored-By: Claude Opus 4.6 --- .../hypershift-azure-create-selfmanaged-guests-chain.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml index 13299dc6cb804..4e85a4cb7ccbe 100644 --- a/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml +++ b/ci-operator/step-registry/hypershift/azure/create-selfmanaged-guests/hypershift-azure-create-selfmanaged-guests-chain.yaml @@ -149,8 +149,11 @@ chain: ' & ROLLOUT_PID_PUB=$! - echo "$(date) Waiting for version rollout on ${PRIVATE_NAME}..." - CLUSTER_CHECK="${PRIVATE_NAME}" timeout 45m bash -c ' + # Private clusters need extra time: PrivateLink setup (PLS creation, Private Endpoint, + # DNS zone provisioning, DNS propagation) adds significant latency before data-plane + # operators can converge through the PrivateLink tunnel. + echo "$(date) Waiting for version rollout on ${PRIVATE_NAME} (60m timeout for PrivateLink)..." 
+ CLUSTER_CHECK="${PRIVATE_NAME}" timeout 60m bash -c ' until [[ "$(oc get -n clusters hostedcluster/${CLUSTER_CHECK} -o jsonpath='"'"'{.status.version.history[?(@.state!="")].state}'"'"')" = "Completed" ]]; do sleep 15 done