From 4c7dbff5f751490fbfe5cc89168050763c39922a Mon Sep 17 00:00:00 2001 From: Simran Date: Mon, 6 Apr 2026 17:55:56 -0700 Subject: [PATCH] Create bare-metal BMC install steps, chain, and sample health workflow --- ci-operator/step-registry/abi/OWNERS | 4 + ci-operator/step-registry/abi/README.md | 124 ++++ ci-operator/step-registry/abi/chains/OWNERS | 1 + .../step-registry/abi/chains/bm--bmc/OWNERS | 1 + .../abi-chains-bm--bmc-chain.metadata.json | 13 + .../bm--bmc/abi-chains-bm--bmc-chain.yaml | 14 + ci-operator/step-registry/abi/conf/OWNERS | 1 + ci-operator/step-registry/abi/conf/bm/OWNERS | 1 + .../abi/conf/bm/abi-conf-bm-commands.sh | 220 +++++++ .../abi/conf/bm/abi-conf-bm-ref.metadata.json | 13 + .../abi/conf/bm/abi-conf-bm-ref.yaml | 70 ++ ci-operator/step-registry/abi/install/OWNERS | 1 + .../step-registry/abi/install/bmc/OWNERS | 1 + .../install/bmc/abi-install-bmc-commands.sh | 603 ++++++++++++++++++ .../bmc/abi-install-bmc-ref.metadata.json | 13 + .../abi/install/bmc/abi-install-bmc-ref.yaml | 113 ++++ .../step-registry/abi/workflows/OWNERS | 1 + .../workflows/bm--bmc--cluster-health/OWNERS | 1 + ...bmc--cluster-health-workflow.metadata.json | 13 + ...lows-bm--bmc--cluster-health-workflow.yaml | 12 + 20 files changed, 1220 insertions(+) create mode 100644 ci-operator/step-registry/abi/OWNERS create mode 100644 ci-operator/step-registry/abi/README.md create mode 120000 ci-operator/step-registry/abi/chains/OWNERS create mode 120000 ci-operator/step-registry/abi/chains/bm--bmc/OWNERS create mode 100644 ci-operator/step-registry/abi/chains/bm--bmc/abi-chains-bm--bmc-chain.metadata.json create mode 100644 ci-operator/step-registry/abi/chains/bm--bmc/abi-chains-bm--bmc-chain.yaml create mode 120000 ci-operator/step-registry/abi/conf/OWNERS create mode 120000 ci-operator/step-registry/abi/conf/bm/OWNERS create mode 100644 ci-operator/step-registry/abi/conf/bm/abi-conf-bm-commands.sh create mode 100644 
ci-operator/step-registry/abi/conf/bm/abi-conf-bm-ref.metadata.json create mode 100644 ci-operator/step-registry/abi/conf/bm/abi-conf-bm-ref.yaml create mode 120000 ci-operator/step-registry/abi/install/OWNERS create mode 120000 ci-operator/step-registry/abi/install/bmc/OWNERS create mode 100644 ci-operator/step-registry/abi/install/bmc/abi-install-bmc-commands.sh create mode 100644 ci-operator/step-registry/abi/install/bmc/abi-install-bmc-ref.metadata.json create mode 100644 ci-operator/step-registry/abi/install/bmc/abi-install-bmc-ref.yaml create mode 120000 ci-operator/step-registry/abi/workflows/OWNERS create mode 120000 ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/OWNERS create mode 100644 ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/abi-workflows-bm--bmc--cluster-health-workflow.metadata.json create mode 100644 ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/abi-workflows-bm--bmc--cluster-health-workflow.yaml diff --git a/ci-operator/step-registry/abi/OWNERS b/ci-operator/step-registry/abi/OWNERS new file mode 100644 index 0000000000000..507d3283afe31 --- /dev/null +++ b/ci-operator/step-registry/abi/OWNERS @@ -0,0 +1,4 @@ +approvers: &owners +- cspi-qe-ocp-lp +- ieng-chaos +reviewers: *owners diff --git a/ci-operator/step-registry/abi/README.md b/ci-operator/step-registry/abi/README.md new file mode 100644 index 0000000000000..eeb812b709418 --- /dev/null +++ b/ci-operator/step-registry/abi/README.md @@ -0,0 +1,124 @@ +# Agent-based Installer (ABI) + +**Layout (step-registry paths):** `conf//` holds manifest / image-input work; `install//` holds boot and cluster deployment (e.g. **BMC** +virtual media today; **PXE** or other targets can be added alongside without colliding with bare-metal **conf**). See `conf/bm` and `install/bmc` below. 
+ +**Step Input Parameters (names, defaults, semantics):** + | Step | Reference (source of truth) | Registry Documentation | + |---------------------|-----------------------------------------------------------------------|-------------------------------------------------------------------------------| + | **abi-conf-bm** | [`abi-conf-bm-ref.yaml`](conf/bm/abi-conf-bm-ref.yaml) | [`abi-conf-bm`](https://steps.ci.openshift.org/reference/abi-conf-bm) | + | **abi-install-bmc** | [`abi-install-bmc-ref.yaml`](install/bmc/abi-install-bmc-ref.yaml) | [`abi-install-bmc`](https://steps.ci.openshift.org/reference/abi-install-bmc) | + +**Step Execution Order:** [`abi-conf-bm-commands.sh`](conf/bm/abi-conf-bm-commands.sh) → [`abi-install-bmc-commands.sh`](install/bmc/abi-install-bmc-commands.sh) + +**Official Documentation:** [Preparing to install with the Agent-based Installer](https://docs.redhat.com/en/documentation/openshift_container_platform/latest/html/installing_an_on-premise_cluster_with_the_agent-based_installer/preparing-to-install-with-the-agent-based-installer). + +## Installation Phases + +| Phase | Comments | +|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Day-0** | Cluster Configuration.
Creates a bare-minimum `install-config.yaml` and generates an `agent-config.yaml` template. Then `UpdateCfg Day0` applies overrides from `OCP__ABI__CFG_FN`, followed by `OCP__ABI__DAY0_SCRIPTS_YAML`. Both configuration files must be complete before proceeding to Day-1. | +| **Day-1** | Manifest Customization.
Generates the full manifest tree under `openshift/` (`agent create cluster-manifests`). Then `UpdateCfg Day1` applies overrides from `OCP__ABI__CFG_FN`, followed by `OCP__ABI__DAY1_SCRIPTS_YAML`, before the ISO is built. | +| **Day-1.5** | Post-Bootstrap Operations.
Runs after `agent wait-for bootstrap-complete`. Applies custom actions as configured in `OCP__ABI__CFG_FN` (e.g. scale Worker MachineSets to 0 when workers are provisioned directly by ABI). Runs concurrently with `wait-for install-complete`. | +| **Day-2** | Post-Deployment Customization.
Runs after `agent wait-for install-complete` and `KUBECONFIG` is set. Custom post-deployment actions via `OCP__ABI__DAY2_SCRIPTS_YAML` (e.g. install operators, apply policies). | + +`SHARED_DIR` holds inter-step artifacts (tarball, kubeconfig, `kubeconfig-minimal`). Logs and `ocp.tgz` → `ARTIFACT_DIR`. + +## OCP__ABI__CFG_FN + +Pre-populate `OCP__ABI__CFG_FN` (e.g., `${CLUSTER_PROFILE_DIR}/ocp--abi--cfg.yaml`) with the full `agent-config.yaml`, e.g. Host definitions (NMState network +config, BMC addresses), and any extra configuration needed: +```yaml +Day0: + config: {} + configFileOverride: + yaml+: + - ...yamlCfg...: + ...yamlCfgContentToDeepMergeAppendArray... + yaml-: + - ...yamlCfg...: + ...yamlCfgContentToDeepMergeReplaceArray... + yaml=: + - ...yamlCfg...: + ...yamlCfgContentToReplace... + json+: + - ...jsonCfg...: | + ...jsonCfgContentToDeepMergeAppendArray... + json-: + - ...jsonCfg...: | + ...jsonCfgContentToDeepMergeReplaceArray... + json=: + - ...jsonCfg...: | + ...jsonCfgContentToReplace... +Day1: # Same schema as `Day0` + ... +Day1.5: + config: + - NodeProv: ...booleanNodeProvisioningStatus... +Day2: # Same schema as `Day1.5` + ... +``` + +Example: +```yaml +Day0: + configFileOverride: + yaml-: + - install-config.yaml: + networking: + machineNetwork: + - cidr: 10.6.158.0/24 + platform: + baremetal: + apiVIPs: + - 10.6.158.26 + ingressVIPs: + - 10.6.158.27 + provisioningNetwork: Disabled + - agent-config.yaml: # Full agent-config.yaml: Host definitions (NMState network config, BMC addresses, roles, rootDeviceHints, etc.) + apiVersion: v1beta1 + kind: AgentConfig + metadata: + name: integrity-config + rendezvousIP: 10.6.158.11 + additionalNTPSources: + - clock.corp.redhat.com + hosts: + - ... 
# Per-host: hostname, role, rootDeviceHints, interfaces, networkConfig, bmc +Day1.5: + config: + - NodeProv: false +``` + +## Tunneling / Chisel + +Refer to [WebApp Services — Chisel Tunneling Service](https://redhat.atlassian.net/wiki/display/MPEXIENG/WebApp+Services#Chisel-Tunneling-Service) +for the reference setup (which uses **NGINX** as a reverse proxy in front of **Chisel** to achieve configurable data-plane port forwarding). + +Operational layout and port table (if the above reference setup is used): +[Chisel Tunneling Service](https://redhat.atlassian.net/wiki/display/MPEXIENG/WebApp+Services#Step2.1.2.2.3--Chisel_OperationalTasks). + +Step Input Parameters: `OCP__ABI__TUN_SVC__*` / `OCP__ABI__TEAM_NAME` + +## BMC / Redfish + +**abi-conf-bm** emits `ocp--bmc--info.json`; **abi-install-bmc** drives virtual media and power via Redfish. Details live in `abi-install-bmc-commands.sh` +(maintainer-oriented). + +## Phase Customization Scripts + +The `OCP__ABI__DAY0_SCRIPTS_YAML`, `OCP__ABI__DAY1_SCRIPTS_YAML`, and `OCP__ABI__DAY2_SCRIPTS_YAML` allow injecting arbitrary shell scripts into the +corresponding installation phase, executed in the order listed within the step's shell environment. See [Installation Phases](#installation-phases) for when +each script runs relative to the phase operations. + +Example (`OCP__ABI__DAY0_SCRIPTS_YAML`): +```yaml +OCP__ABI__DAY0_SCRIPTS_YAML: | + Scripts: + - | # Complete override of configuration files instead of using `OCP__ABI__CFG_FN` mechanism (not recommended, just serves as an example). + mkdir -p "${OCP__ABI__CLUSTER_DIR}/openshift" + cp -f "${CLUSTER_PROFILE_DIR}/install-config.yaml" "${OCP__ABI__CLUSTER_DIR}/install-config.yaml" + cp -f "${CLUSTER_PROFILE_DIR}/agent-config.yaml" "${OCP__ABI__CLUSTER_DIR}/agent-config.yaml" +``` + +Schema: [BuildCustomScriptsFromYAML.sh](https://github.com/RedHatQE/OpenShift-LP-QE--Tools/blob/main/libs/bash/common/BuildCustomScriptsFromYAML.sh). 
diff --git a/ci-operator/step-registry/abi/chains/OWNERS b/ci-operator/step-registry/abi/chains/OWNERS new file mode 120000 index 0000000000000..ec405d65a79df --- /dev/null +++ b/ci-operator/step-registry/abi/chains/OWNERS @@ -0,0 +1 @@ +../OWNERS \ No newline at end of file diff --git a/ci-operator/step-registry/abi/chains/bm--bmc/OWNERS b/ci-operator/step-registry/abi/chains/bm--bmc/OWNERS new file mode 120000 index 0000000000000..ec405d65a79df --- /dev/null +++ b/ci-operator/step-registry/abi/chains/bm--bmc/OWNERS @@ -0,0 +1 @@ +../OWNERS \ No newline at end of file diff --git a/ci-operator/step-registry/abi/chains/bm--bmc/abi-chains-bm--bmc-chain.metadata.json b/ci-operator/step-registry/abi/chains/bm--bmc/abi-chains-bm--bmc-chain.metadata.json new file mode 100644 index 0000000000000..1d012d62e3397 --- /dev/null +++ b/ci-operator/step-registry/abi/chains/bm--bmc/abi-chains-bm--bmc-chain.metadata.json @@ -0,0 +1,13 @@ +{ + "path": "abi/chains/bm--bmc/abi-chains-bm--bmc-chain.yaml", + "owners": { + "approvers": [ + "cspi-qe-ocp-lp", + "ieng-chaos" + ], + "reviewers": [ + "cspi-qe-ocp-lp", + "ieng-chaos" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/abi/chains/bm--bmc/abi-chains-bm--bmc-chain.yaml b/ci-operator/step-registry/abi/chains/bm--bmc/abi-chains-bm--bmc-chain.yaml new file mode 100644 index 0000000000000..433f8f978c3f7 --- /dev/null +++ b/ci-operator/step-registry/abi/chains/bm--bmc/abi-chains-bm--bmc-chain.yaml @@ -0,0 +1,14 @@ +chain: + as: abi-chains-bm--bmc + env: + - name: OCP__ABI__CLUSTER_DIR + default: /tmp/ocpClusterDir + documentation: |- + The Steps use a Container Image where the `CWD` is R/O, so this default overrides the workspace path with a writable location. + steps: + - ref: abi-conf-bm + - ref: abi-install-bmc + documentation: |- + This Chain deploys OpenShift Container Platform (OCP) on Bare Metal with BMC. + + See [ABI overview](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md) for details.
diff --git a/ci-operator/step-registry/abi/conf/OWNERS b/ci-operator/step-registry/abi/conf/OWNERS new file mode 120000 index 0000000000000..ec405d65a79df --- /dev/null +++ b/ci-operator/step-registry/abi/conf/OWNERS @@ -0,0 +1 @@ +../OWNERS \ No newline at end of file diff --git a/ci-operator/step-registry/abi/conf/bm/OWNERS b/ci-operator/step-registry/abi/conf/bm/OWNERS new file mode 120000 index 0000000000000..ec405d65a79df --- /dev/null +++ b/ci-operator/step-registry/abi/conf/bm/OWNERS @@ -0,0 +1 @@ +../OWNERS \ No newline at end of file diff --git a/ci-operator/step-registry/abi/conf/bm/abi-conf-bm-commands.sh b/ci-operator/step-registry/abi/conf/bm/abi-conf-bm-commands.sh new file mode 100644 index 0000000000000..ef2d3914fdae6 --- /dev/null +++ b/ci-operator/step-registry/abi/conf/bm/abi-conf-bm-commands.sh @@ -0,0 +1,220 @@ +#!/bin/bash +# abi-conf-bm — Agent-based installer configuration (bare metal; **conf** phase). +# +# Logic in this Step: +# - Bare-minimum `install-config.yaml` scaffold -> OCP-version-aware defaults -> `baremetal` platform -> `agent-config.yaml` template. +# - `UpdateCfg Day0` merges, updates, or replaces config entries; `OCP__ABI__DAY0_SCRIPTS_YAML` scripts further customize `install-config.yaml` / `agent-config.yaml`. +# - Extracts BMC info to `ocp--bmc--info.json`; strips BMC credentials from `agent-config.yaml`. +# - Generates Cluster manifests. +# - `UpdateCfg Day1` + `OCP__ABI__DAY1_SCRIPTS_YAML` scripts customize manifests. 
#
set -euxo pipefail
shopt -s inherit_errexit

mkdir -p "${OCP__ABI__CLUSTER_DIR}"

# Load the shared helper libraries.
# The download is captured in a variable first: with `eval "$(curl ...)"` a
# failed download is NOT fatal (the command substitution's status is discarded
# and `eval ''` returns 0), so a missing helper would only surface later, or
# not at all when the helper is itself invoked inside `eval "$(...)"`.
# NOTE(review): the URLs track the mutable `main` branch and the fetched code
# is executed verbatim; consider pinning to a commit SHA.
typeset libSrc=''
libSrc="$(
    curl -fsSL "https://raw.githubusercontent.com/RedHatQE/OpenShift-LP-QE--Tools/main/libs/bash/common/BuildCustomScriptsFromYAML.sh"
)"
eval "${libSrc}"
libSrc="$(
    curl -fsSL "https://raw.githubusercontent.com/RedHatQE/OpenShift-LP-QE--Tools/main/libs/bash/common/EnsureReqs.sh"
)"
eval "${libSrc}"; EnsureReqs yq
unset libSrc

typeset ocpABIcfg="${CLUSTER_PROFILE_DIR}/${OCP__ABI__CFG_FN}"; [ -r "${ocpABIcfg}" ]

# Extract `openshift-install` from the release image.
# The `RELEASE_IMAGE_LATEST` is set by CI Operator based on `.releases.latest` in CI Conf.
oc adm release extract \
    -a /var/run/secrets/registry-pull--build-farms/.dockerconfigjson \
    "${RELEASE_IMAGE_LATEST}" \
    --command=openshift-install \
    --to="/tmp"
export PATH="/tmp:${PATH}"


# openshift-install ARGS...
#   Wrapper around the extracted `openshift-install` binary. Always passes
#   `--dir "${OCP__ABI__CLUSTER_DIR}/"` and `--log-level`, merges stderr into
#   stdout, and appends the output (framed by a timestamped banner) to
#   `${ARTIFACT_DIR}/ocp--installer--cluster.log` while still printing it.
#   Returns the installer's exit status (taken from the pipeline's first stage).
function openshift-install () {
    typeset -i es=0
    {
        echo \
"$(date -Iseconds)|${FUNCNAME[0]@Q} ${*@Q}"$'\n'"$(printf '%.0s-' {1..80})"
        command openshift-install \
            --dir "${OCP__ABI__CLUSTER_DIR}/" \
            --log-level "${OCP__ABI__INSTLR_LOG_LEVEL}" \
            "$@" 2>&1 || es=$?
        echo "$(printf '%.0s=' {1..80})"
        exit ${es}
    } | tee -a "${ARTIFACT_DIR}/ocp--installer--cluster.log"
    return ${PIPESTATUS[0]}
}

# UpdateCfg TOP_KEY
#   Applies the `.<TOP_KEY>.configFileOverride` entries of `${ocpABIcfg}` to
#   files under `${OCP__ABI__CLUSTER_DIR}/` (created, with parent directories,
#   when missing). The override type selects the operation:
#     yaml+ / json+   deep merge with `*+`
#     yaml- / json-   deep merge with `*`
#     yaml= / json=   replace the file content with the override
#   Any other type aborts the script.
#
#   The merged result is written to a temporary file and then copied over the
#   target (keeping the target's inode and permissions). The previous
#   `exec 3< <(cat file); wait $!` snapshot blocked forever once the file
#   exceeded the pipe buffer, because nothing reads fd 3 until `wait` returns.
#
#   `json*` values are parsed and re-emitted as compact single-line JSON: a
#   raw YAML block scalar carries (at least a trailing) newline, which would
#   break the one-record-per-line, tab-separated protocol of the `read` loop.
function UpdateCfg () {
    typeset topKey="${1:?}"; (($#)) && shift
    typeset cfgType='' cfgFile='' cfgCont='' updateOp='' cfgPath='' tmpOut=''
    while IFS=$'\t' read -r cfgType cfgFile cfgCont; do
        cfgPath="${OCP__ABI__CLUSTER_DIR}/${cfgFile}"
        [[ "${cfgFile}" == */* ]] &&
            mkdir -p "${cfgPath%/*}"
        # Make sure the target exists without truncating it.
        true 1>> "${cfgPath}"
        case ${cfgType} in
        (*+)    updateOp='select(fileIndex==0) *+ ' ;;
        (*-)    updateOp='select(fileIndex==0) * ' ;;
        (*=)    updateOp='' ;;
        esac
        updateOp+='select(fileIndex==1)'
        tmpOut="$(mktemp)"
        # The override content is fed through process substitutions with
        # `xtrace` off so that it never shows up in the Step log. Stdin is
        # detached so `yq` cannot consume the loop's record stream.
        case ${cfgType} in
        (yaml+|yaml-|yaml=)
            yq eval-all "${updateOp}" \
                "${cfgPath}" \
                <(set +x; yq -p json -o yaml eval . 0<<<"${cfgCont}") \
                0< /dev/null 1> "${tmpOut}"
            ;;
        (json+|json-|json=)
            yq -p json -o json eval-all "${updateOp}" \
                "${cfgPath}" \
                <(set +x; echo "${cfgCont}") \
                0< /dev/null 1> "${tmpOut}"
            ;;
        (*) : "Invalid Type: ${cfgType}"; false;;
        esac
        cat "${tmpOut}" 1> "${cfgPath}"
        rm -f "${tmpOut}"
    done 0< <(
        yq -o json eval . "${ocpABIcfg}" |
        jq -r --arg k "${topKey}" '
            (.[$k].configFileOverride // empty) | to_entries[] |
            .key as $type | .value[]? | to_entries[] |
            [$type, .key, (
                if ($type | startswith("json")) and ((.value | type) == "string")
                then (.value | fromjson | tojson)
                else (.value | tojson)
                end
            )] | join("\t")
        '
    )
    true
}


# Create bare-minimum `install-config.yaml`.
# The pull secret and the SSH public key are injected through process
# substitutions with `xtrace` off so they are not echoed to the Step log.
{
    yq -p yaml -o json eval . |
    jq -c \
        --arg clsName "${OCP__ABI__BM__CLS_NAME}" \
        --arg baseDom "${OCP__ABI__BM__BASE_DOM}" \
        --rawfile pullCrd <(set +x; cat "${CLUSTER_PROFILE_DIR}/pull-secret") \
        --rawfile sshKey <(set +x; cat "${CLUSTER_PROFILE_DIR}/ssh-publickey") \
        '
            .baseDomain=$baseDom |
            .metadata.name=$clsName |
            .pullSecret=($pullCrd | rtrimstr("\n")) |
            .sshKey=$sshKey
        ' |
    yq -p json -o yaml eval .
} 0<<'fileEOF' 1> "${OCP__ABI__CLUSTER_DIR}/install-config.yaml"
apiVersion: v1
baseDomain: ''
metadata:
  name: ''
platform: {none: {}}
pullSecret: ''
sshKey: ''
fileEOF

# Enrich with OCP-version-aware defaults.
openshift-install create install-config
# Update for Bare Metal target.
yq -i eval \
    '.platform={"baremetal": {}}' \
    "${OCP__ABI__CLUSTER_DIR}/install-config.yaml"

# Create `agent-config.yaml` template.
openshift-install agent create agent-config-template
# Being idempotent on re-run: when the template file is missing or empty,
# recover it from the installer state file.
[ -s "${OCP__ABI__CLUSTER_DIR}/agent-config.yaml" ] || {
    jq -r \
        '."*agentconfig.AgentConfig".File.Data' \
        "${OCP__ABI__CLUSTER_DIR}/.openshift_install_state.json" |
    base64 -d 1> "${OCP__ABI__CLUSTER_DIR}/agent-config.yaml"
}

# Customize `install-config.yaml` and complete `agent-config.yaml`.
UpdateCfg Day0
eval "$(BuildCustomScriptsFromYAML OCP__ABI__DAY0_SCRIPTS_YAML)"

# Retrieve BMC Information from `agent-config.yaml`.
# Currently, if all Master Nodes are ready to be installed, but
# not all Worker Nodes are registering, the
# `wait-for bootstrap-complete` will exit out with error.
# As workaround, we boot the Worker Nodes first, and the
# Rendezvous Host last.
# The rendezvous IP is `.rendezvousIP` when set; otherwise the first enabled
# IPv4 address of the first master / arbiter / role-less host.
# Per-host `.bmc.username` / `.bmc.password` take precedence over the shared
# `cred--bmc--usr` / `cred--bmc--pwd` files of the Cluster Profile.
{
    yq -p yaml -o json eval . |
    jq \
        --rawfile usr <(set +x; cat "${CLUSTER_PROFILE_DIR}/cred--bmc--usr") \
        --rawfile pwd <(set +x; cat "${CLUSTER_PROFILE_DIR}/cred--bmc--pwd") \
        --argjson rIP "$(yq -o json '(select(
                (.rendezvousIP | length) > 0) | .rendezvousIP
            ) // ([
                (.hosts[] | select(.role == "master")),
                (.hosts[] | select(.role == "arbiter")),
                (.hosts[] | select((.role == "") or (.role == null)))
            ] | .[0] | [.networkConfig.interfaces[] |
                select(.ipv4.enabled == true) |
                .ipv4.address[0].ip
            ] | .[0]) // error(
                "rendezvousIP could not be determined"
            ) ' "${OCP__ABI__CLUSTER_DIR}/agent-config.yaml")" \
        '[(
            (.hosts[] | select(.role == "worker")),
            ((
                (.hosts[] | select((.role == "") or (.role == null))),
                (.hosts[] | select(.role == "auto-assign")),
                (.hosts[] | select(.role == "arbiter")),
                (.hosts[] | select(.role == "master"))
            ) | select(any((
                .networkConfig.interfaces[] |
                select(.ipv4.enabled == true) |
                .ipv4.address[]?.ip
            ); . == $rIP) | not)),
            (.hosts[] | select(any((
                .networkConfig.interfaces[] |
                select(.ipv4.enabled == true) |
                .ipv4.address[]?.ip
            ); . == $rIP)))
        ) | {
            url: ("https://" + (.bmc.address | split("://")[-1])),
            usr: (.bmc.username // ($usr | rtrimstr("\n"))),
            pwd: (.bmc.password // ($pwd | rtrimstr("\n"))),
            hostIPv4: ([
                .networkConfig.interfaces[] |
                select(.ipv4.enabled == true) |
                .ipv4.address[0]?.ip
            ][0] // null)
        }]'
} 0< "${OCP__ABI__CLUSTER_DIR}/agent-config.yaml" 1> "${SHARED_DIR}/ocp--bmc--info.json"

# Strip BMC Credentials from `agent-config.yaml`.
+exec 3< <(cat "${OCP__ABI__CLUSTER_DIR}/agent-config.yaml"); wait $! +{ + yq -p yaml -o json eval . | + jq '.hosts[].bmc |= del(.username, .password)' | + yq -p json -o yaml eval . +} 0<&3 1> "${OCP__ABI__CLUSTER_DIR}/agent-config.yaml" +exec 3<&- + +# Set ISO Mode. +((OCP__ABI__MIN_ISO)) && ( + export __IMG__ROOT_FS="${OCP__ABI__TUN_SVC__DP_BASE_URL%%/}/${OCP__ABI__TUN_SVC__DP_PORT}/boot-artifacts" + yq -i eval ' + .minimalISO=true | + .bootArtifactsBaseURL=strenv(__IMG__ROOT_FS) + ' "${OCP__ABI__CLUSTER_DIR}/agent-config.yaml" +) + +# Generate full manifest tree. +openshift-install agent create cluster-manifests + +# Manifest Customization. +UpdateCfg Day1 +eval "$(BuildCustomScriptsFromYAML OCP__ABI__DAY1_SCRIPTS_YAML)" + +# Save OCP Installation information for next Step. +tar zcf "${SHARED_DIR}/ocpClusterInf.tgz" -C "${OCP__ABI__CLUSTER_DIR}/" . diff --git a/ci-operator/step-registry/abi/conf/bm/abi-conf-bm-ref.metadata.json b/ci-operator/step-registry/abi/conf/bm/abi-conf-bm-ref.metadata.json new file mode 100644 index 0000000000000..91321b5d76147 --- /dev/null +++ b/ci-operator/step-registry/abi/conf/bm/abi-conf-bm-ref.metadata.json @@ -0,0 +1,13 @@ +{ + "path": "abi/conf/bm/abi-conf-bm-ref.yaml", + "owners": { + "approvers": [ + "cspi-qe-ocp-lp", + "ieng-chaos" + ], + "reviewers": [ + "cspi-qe-ocp-lp", + "ieng-chaos" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/abi/conf/bm/abi-conf-bm-ref.yaml b/ci-operator/step-registry/abi/conf/bm/abi-conf-bm-ref.yaml new file mode 100644 index 0000000000000..0907ec82fc8ed --- /dev/null +++ b/ci-operator/step-registry/abi/conf/bm/abi-conf-bm-ref.yaml @@ -0,0 +1,70 @@ +ref: + as: abi-conf-bm + from_image: + namespace: ci + name: baremetal-qe-base + tag: latest + cli: latest + commands: abi-conf-bm-commands.sh + resources: + requests: + cpu: '1' + memory: 256Mi + grace_period: 5m + credentials: + - namespace: test-credentials + name: registry-pull-credentials + mount_path: 
/var/run/secrets/registry-pull--build-farms + env: + - name: OCP__ABI__CLUSTER_DIR + default: ocpClusterDir + documentation: |- + The `openshift-install` workspace path (preferred: relative to the Step's CWD); must match the corresponding install Step, e.g., **abi-install-bmc**, + when overriding. + - name: OCP__ABI__CFG_FN + default: ocp--abi--cfg.yaml + documentation: |- + The ABI configuration file Base Name under `${CLUSTER_PROFILE_DIR}/`; must match the corresponding install Step, e.g., **abi-install-bmc**, when + overriding. See + [OCP__ABI__CFG_FN](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md#ocp__abi__cfg_fn) for details. + - name: OCP__ABI__BM__CLS_NAME + documentation: |- + Required (no default). The Cluster name to be deployed. + - name: OCP__ABI__BM__BASE_DOM + documentation: |- + Required (no default). The Cluster DNS Base Domain. + - name: OCP__ABI__INSTLR_LOG_LEVEL + default: 'debug' + documentation: |- + Log level for `openshift-install` (`--log-level`). + - name: OCP__ABI__DAY0_SCRIPTS_YAML + default: 'Scripts: []' + documentation: |- + Phase Customization Script for Day-0 (see + [Phase Customization Scripts](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md#phase-customization-scripts)). + - name: OCP__ABI__DAY1_SCRIPTS_YAML + default: 'Scripts: []' + documentation: |- + Phase Customization Script for Day-1 (see + [Phase Customization Scripts](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md#phase-customization-scripts)). + - name: OCP__ABI__MIN_ISO + default: '1' + documentation: |- + When non-zero, patches `agent-config.yaml` with `.minimalISO=true` and `.bootArtifactsBaseURL` (boot artifacts served via Chisel tunnel) before + `agent create image`. Set to `0` to embed all artifacts in the ISO. + - name: OCP__ABI__TUN_SVC__DP_BASE_URL + documentation: |- + Required (no default). 
BMC-facing HTTPS prefix for the install ISO on the data plane (no trailing slash), e.g. `https://<host>/dp`. + - name: OCP__ABI__TUN_SVC__DP_PORT + documentation: |- + Required (no default). Server-side data-plane port in the Chisel reverse tunnel and in the ISO URL path. Each Cluster Under Test must reserve a + distinct port when several jobs share one Chisel server (see + [tunneling information](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md#tunneling--chisel) for details). The + served ISO URL is `${OCP__ABI__TUN_SVC__DP_BASE_URL}/${OCP__ABI__TUN_SVC__DP_PORT}/<ISO file name>`. + documentation: |- + The `${CLUSTER_PROFILE_DIR}/` MUST contain files: `pull-secret`, `ssh-publickey`, `cred--bmc--usr`, `cred--bmc--pwd`, and `${OCP__ABI__CFG_FN}`. + + See [ABI overview](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md) for details. + + The Container Image [ci/baremetal-qe-base:latest](https://github.com/openshift-eng/baremetal-qe-infra/blob/master/images/prow-image/Dockerfile) is used + because it provides `nmstatectl`.
diff --git a/ci-operator/step-registry/abi/install/OWNERS b/ci-operator/step-registry/abi/install/OWNERS new file mode 120000 index 0000000000000..ec405d65a79df --- /dev/null +++ b/ci-operator/step-registry/abi/install/OWNERS @@ -0,0 +1 @@ +../OWNERS \ No newline at end of file diff --git a/ci-operator/step-registry/abi/install/bmc/OWNERS b/ci-operator/step-registry/abi/install/bmc/OWNERS new file mode 120000 index 0000000000000..ec405d65a79df --- /dev/null +++ b/ci-operator/step-registry/abi/install/bmc/OWNERS @@ -0,0 +1 @@ +../OWNERS \ No newline at end of file diff --git a/ci-operator/step-registry/abi/install/bmc/abi-install-bmc-commands.sh b/ci-operator/step-registry/abi/install/bmc/abi-install-bmc-commands.sh new file mode 100644 index 0000000000000..669d2abbce65c --- /dev/null +++ b/ci-operator/step-registry/abi/install/bmc/abi-install-bmc-commands.sh @@ -0,0 +1,603 @@ +#!/bin/bash +# abi-install-bmc — Agent-based installer installation (BMC / virtual media; **install** phase). +# +# **Chisel:** OpenShift Secret `test-credentials/chisel-creds` is read from `/var/run/secrets/chisel`. +# Basic-auth filenames use the `chisel-usr--…` / `chisel-pwd--…` pattern (suffix from `OCP__ABI__TEAM_NAME`). +# +# Logic in this Step: +# - `agent create image` -> ISO is served via HTTP Server `8080` (Range-aware) + **Chisel** reverse tunnel. +# - Redfish boot loop: mount ISO + boot nodes (order from `ocp--bmc--info.json`) + conditional disk wipe (BMC: pre-boot, OS: post-boot via SSH). +# - `wait-for bootstrap-complete` -> copy minimal `KUBECONFIG` -> **Day-1.5** (runs concurrently with `wait-for install-complete`). +# - `wait-for install-complete` -> eject virtual media. +# - **Day-2**: Nodes Ready -> `OCP__ABI__DAY2_SCRIPTS_YAML` scripts for custom post-deployment actions. +# - HTTP Server + **Chisel** torn down by `EXIT` trap.
#
set -euxo pipefail
shopt -s inherit_errexit

# Set writable directories (Container Image `baremetal-qe-base` sets `HOME=/output` which may not be writable).
export XDG_CONFIG_HOME=/tmp/abi/.config # Used by `bw`.
export XDG_CACHE_HOME=/tmp/abi/.cache   # Used by `openshift-install agent create image`.

# Load the shared helper libraries.
# The download is captured in a variable first: with `eval "$(curl ...)"` a
# failed download is NOT fatal (the command substitution's status is discarded
# and `eval ''` returns 0), so a missing helper would only surface later.
# NOTE(review): the URLs track the mutable `main` branch and the fetched code
# is executed verbatim; consider pinning to a commit SHA.
typeset libSrc=''
libSrc="$(
    curl -fsSL "https://raw.githubusercontent.com/RedHatQE/OpenShift-LP-QE--Tools/main/libs/bash/common/BuildCustomScriptsFromYAML.sh"
)"
eval "${libSrc}"
libSrc="$(
    curl -fsSL "https://raw.githubusercontent.com/RedHatQE/OpenShift-LP-QE--Tools/main/libs/bash/common/Vault--BitWarden--UploadAttachment.sh"
)"
eval "${libSrc}"
libSrc="$(
    curl -fsSL "https://raw.githubusercontent.com/RedHatQE/OpenShift-LP-QE--Tools/main/libs/bash/common/EnsureReqs.sh"
)"
eval "${libSrc}"; EnsureReqs yq chisel bw
unset libSrc

typeset ocpABIcfg="${CLUSTER_PROFILE_DIR}/${OCP__ABI__CFG_FN}"; [ -r "${ocpABIcfg}" ]
typeset bmcInfo="${SHARED_DIR}/ocp--bmc--info.json"; [ -f "${bmcInfo}" ]
typeset isoFile='' isoURL='' chiselCrdUsr='' chiselCrdPwd=''
typeset -i httpSvcPort=8080
typeset -ai taskPIDs=()

# Extract `openshift-install` from the release image.
# The `RELEASE_IMAGE_LATEST` is set by CI Operator based on `.releases.latest` in CI Conf.
oc adm release extract \
    -a /var/run/secrets/registry-pull--build-farms/.dockerconfigjson \
    "${RELEASE_IMAGE_LATEST}" \
    --command=openshift-install \
    --to="/tmp"
export PATH="/tmp:${PATH}"


# openshift-install ARGS...
#   Wrapper around the extracted `openshift-install` binary. Always passes
#   `--dir "${OCP__ABI__CLUSTER_DIR}/"` and `--log-level`, merges stderr into
#   stdout, and appends the output (framed by a timestamped banner) to
#   `${ARTIFACT_DIR}/ocp--installer--cluster.log` while still printing it.
#   Returns the installer's exit status (taken from the pipeline's first stage).
function openshift-install () {
    typeset -i es=0
    {
        echo \
"$(date -Iseconds)|${FUNCNAME[0]@Q} ${*@Q}"$'\n'"$(printf '%.0s-' {1..80})"
        command openshift-install \
            --dir "${OCP__ABI__CLUSTER_DIR}/" \
            --log-level "${OCP__ABI__INSTLR_LOG_LEVEL}" \
            "$@" 2>&1 || es=$?
        echo "$(printf '%.0s=' {1..80})"
        exit ${es}
    } | tee -a "${ARTIFACT_DIR}/ocp--installer--cluster.log"
    return ${PIPESTATUS[0]}
}

# HandleSIGCHLD
#   Drops every entry of the global `taskPIDs` array whose process no longer
#   exists (probed with `kill -0`). Always succeeds.
function HandleSIGCHLD () {
    typeset -i i=0
    for i in "${!taskPIDs[@]}"; do
        kill -0 "${taskPIDs[i]}" 2>/dev/null ||
            unset "taskPIDs[i]"
    done
    true
}

# RedfishAPIcall BMC_INFO_JSON BMC_URL HTTP_METHOD ENDPOINT [CURL_ARGS...]
#   Calls `${BMC_URL}/redfish/v1/${ENDPOINT}` with the credentials recorded for
#   BMC_URL in BMC_INFO_JSON (passed to `curl` via a `-K` config stream with
#   `xtrace` off). Extra CURL_ARGS are forwarded as-is. TLS verification is
#   disabled (`-k`).
#   Retries up to 6 attempts, 10s apart, while the response code is 500 or 503.
#   Prints the response (without the status-code trailer) and returns the exit
#   status of the last `curl` run.
function RedfishAPIcall () {
    typeset bmcInfo="${1:?}"; (($#)) && shift
    typeset bmcURL="${1:?}"; (($#)) && shift
    typeset apiMethod="${1:?}"; (($#)) && shift
    typeset apiEP="${1?}"; (($#)) && shift
    typeset -i es=0 tryLeft=6
    # Kept as a string: an integer variable would normalize e.g. `000` (no
    # HTTP response) to `0`, and the trailer would then not be stripped below.
    typeset httpResp='' httpCode=''
    while ((tryLeft)); do
        es=0
        httpResp=$(curl -sSLk -X "${apiMethod}" \
            --fail-with-body \
            -w '\n%{response_code}' \
            -K <(
                set +x
                jq -r \
                    --arg url "${bmcURL}" \
                    '
                        .[] |
                        select(.url == $url) |
                        "-u \("\(.usr):\(.pwd)" | @json)"
                    ' \
                    0< "${bmcInfo}"
            ) \
            -H 'Content-Type: application/json' \
            -H 'Accept: application/json' \
            "$@" \
            "${bmcURL}/redfish/v1/${apiEP#/}") || es=$?
        httpCode="${httpResp##*$'\n'}"
        # Retry on 500,503: Transient server failure.
        case ${httpCode} in
        (500|503)   ;;
        (*)         break;;
        esac
        # `((--tryLeft))` yields status 1 when the counter reaches 0; without
        # the `|| break` that would trip `errexit` on the last attempt, before
        # the response is printed and `curl`'s status is returned. Breaking
        # here also skips the pointless sleep after the final attempt.
        ((--tryLeft)) || break
        sleep 10
    done
    printf '%s' "${httpResp%$'\n'${httpCode}}"
    return ${es}
}

# VCD-Eject BMC_INFO_JSON BMC_URL MANAGER_ID
#   Best-effort eject of the virtual CD of the given Manager; failures are
#   ignored and the function always succeeds.
function VCD-Eject () {
    typeset bmcInfo="${1:?}"; (($#)) && shift
    typeset bmcURL="${1:?}"; (($#)) && shift
    typeset bmcMgrId="${1:?}"; (($#)) && shift
    RedfishAPIcall "${bmcInfo}" "${bmcURL}" POST \
        "Managers/${bmcMgrId}/VirtualMedia/CD/Actions/VirtualMedia.EjectMedia" \
        -d '{}' || true
    true
}

# Host-PowerControl BMC_INFO_JSON BMC_URL SYSTEM_ID RESET_TYPE
#   Issues `ComputerSystem.Reset` with the given `ResetType`.
#   Returns the status of the Redfish call.
function Host-PowerControl () {
    typeset bmcInfo="${1:?}"; (($#)) && shift
    typeset bmcURL="${1:?}"; (($#)) && shift
    typeset bmcSysId="${1:?}"; (($#)) && shift
    typeset resetType="${1:?}"; (($#)) && shift
    typeset -i es=0
    RedfishAPIcall "${bmcInfo}" "${bmcURL}" POST \
        "Systems/${bmcSysId}/Actions/ComputerSystem.Reset" \
        -d "{\"ResetType\": \"${resetType}\"}" || es=$?
    return ${es}
}

# WipeDisks TASK_PID BMC_INFO_JSON BMC_URL SYSTEM_ID MANAGER_ID WIPE_METHOD
#   TASK_PID    PID probed with `kill -0`; the wait loops stop as soon as that
#               process is gone.
#   WIPE_METHOD ''    No wipe: over SSH, only checks that the host kernel
#                     command line carries a `coreos.live` marker (remote exit
#                     code 193 otherwise).
#               'OS'  Same check, then zaps / wipes / discards every `disk`
#                     block device over SSH.
#               'BMC' Requests `Volume.Initialize` (falling back to
#                     `Drive.SecureErase` on the Volume's Drives) for every
#                     Volume of every Storage controller, force-restarts the
#                     host, then polls the resulting Jobs until all report
#                     `Completed`, deleting each completed Job.
#   Returns 0 on success, the failing sub-shell's status otherwise, and 1 for
#   an unknown WIPE_METHOD.
function WipeDisks () {
    typeset -i tPID="${1:?}"; (($#)) && shift
    typeset bmcInfo="${1:?}"; (($#)) && shift
    typeset bmcURL="${1:?}"; (($#)) && shift
    typeset bmcSysId="${1:?}"; (($#)) && shift
    typeset bmcMgrId="${1:?}"; (($#)) && shift
    typeset wipeMethod="${1?}"; (($#)) && shift
    typeset rmtScript=''
    typeset -i es=0
    # `;;&` keeps testing the following patterns and `;&` falls through, so the
    # '' and 'OS' methods each select their remote script first and then share
    # the SSH retry loop below.
    case ${wipeMethod} in
    ('')    rmtScript="$(cat - 0<<'sshEOF'
sudo bash -o pipefail -O inherit_errexit -euxc "$(cat - 0<<'shEOF'
    grep -qE '\bcoreos\.live(\.|iso=)' /proc/cmdline || exit 193
    true
shEOF
)"
sshEOF
        )";;&
    (OS)    rmtScript="$(cat - 0<<'sshEOF'
sudo bash -o pipefail -O inherit_errexit -euxc "$(cat - 0<<'shEOF'
    typeset dev=
    grep -qE '\bcoreos\.live(\.|iso=)' /proc/cmdline || exit 193
    udevadm settle
    while IFS= read -r dev; do
        sgdisk --zap-all "${dev}"
        wipefs -a "${dev}"
        blkdiscard "${dev}" 2> /dev/null || true
    done 0< <(
        lsblk -dpno NAME,TYPE | awk '($2 == "disk"){print $1}'
    )
    true
shEOF
)"
sshEOF
        )";;&
    ('')    ;&
    (OS)    (
            typeset stdErr='' rgx=''
            typeset hostIPv4=''
            hostIPv4="$(jq -r \
                --arg url "${bmcURL}" \
                '.[] | select(.url == $url).hostIPv4' \
                0< "${bmcInfo}")"
            typeset -i tryLeft=$((2 * OCP__ABI__WAIT__NODE_READY__M)) es=0
            while ((tryLeft)); do
                kill -0 "${tPID}" 2>/dev/null || break
                sleep 30
                es=0
                # Remote stdout goes to this Step's stderr (fd 3); remote
                # stderr is shown there too and captured into `stdErr`.
                stdErr="$({
                    ssh -n \
                        -o UserKnownHostsFile=/dev/null \
                        -o StrictHostKeyChecking=no \
                        -o ConnectTimeout=5 \
                        -i "${CLUSTER_PROFILE_DIR}/ssh-privatekey" \
                        "core@${hostIPv4}" \
                        "${rmtScript}" \
                        2> >(tee /dev/fd/3) 1>&3 ||
                            es=$?
                } 3>&2; exit ${es})" || es=$?
                case ${es} in
                (0)     break ;;
                (193)   exit ${es} ;;
                (255)
                    # Give up at once when the SSH key is rejected; any other
                    # status-255 failure is retried.
                    rgx='\bPermission denied \(.*\bpublickey\b.*\)'
                    [[ "${stdErr}" =~ ${rgx} ]] && exit 255
                    ;;
                esac
                ((--tryLeft))
            done
        ) || es=$?;;
    (BMC)   (
            typeset ctrlId='' volEP='' driveEP='' jobId=''
            typeset -a jobIds=()
            while IFS= read -r ctrlId; do
                while IFS= read -r volEP; do
                    # Try `Volume.Initialize`.
                    jobId="$(
                        RedfishAPIcall "${bmcInfo}" "${bmcURL}" POST \
                            "${volEP#/redfish/v1/}/Actions/Volume.Initialize" \
                            -d '{
                                "InitializeType": "Slow",
                                "@Redfish.OperationApplyTime": "OnReset"
                            }' -o /dev/null -D - |
                        sed -nE 's/^[Ll]ocation: ([^\r]*)\r?$/\1/p;T;q'
                    )" || true
                    jobId="${jobId##*/}"
                    [ -n "${jobId}" ] && jobIds+=("${jobId}") && continue
                    # Fallback to `SecureErase` the Volume's Physical Drives.
                    while IFS= read -r driveEP; do
                        jobId="$(
                            RedfishAPIcall "${bmcInfo}" "${bmcURL}" POST \
                                "${driveEP#/redfish/v1/}/Actions/Drive.SecureErase" \
                                -d '{}' -o /dev/null -D - |
                            sed -nE 's/^[Ll]ocation: ([^\r]*)\r?$/\1/p;T;q'
                        )" || true
                        jobId="${jobId##*/}"
                        [ -n "${jobId}" ] && jobIds+=("${jobId}")
                    done 0< <(
                        RedfishAPIcall "${bmcInfo}" "${bmcURL}" GET \
                            "${volEP#/redfish/v1/}" |
                        jq -r '.Links.Drives[]?."@odata.id" // empty'
                    )
                done 0< <(
                    RedfishAPIcall "${bmcInfo}" "${bmcURL}" GET \
                        "Systems/${bmcSysId}/Storage/${ctrlId}/Volumes" |
                    jq -r '.Members[]?."@odata.id" // empty'
                )
            done 0< <(
                RedfishAPIcall "${bmcInfo}" "${bmcURL}" GET \
                    "Systems/${bmcSysId}/Storage" |
                jq -r '.Members[]."@odata.id" | split("/")[-1]'
            )
            # Restart Host.
            Host-PowerControl "${bmcInfo}" "${bmcURL}" "${bmcSysId}" ForceRestart
            # Wait for all wipe Jobs to complete.
            while true; do
                kill -0 "${tPID}" 2>/dev/null || break
                sleep 60
                for jobId in "${jobIds[@]}"; do
                    # A Job that is not (yet) `Completed` restarts the outer
                    # polling loop (`continue 2`).
                    {
                        RedfishAPIcall "${bmcInfo}" "${bmcURL}" GET \
                            "Managers/${bmcMgrId}/Jobs/${jobId}" |
                        jq -e '
                            .JobState | test("^Completed"; "i")
                        ' 1> /dev/null
                    } && {
                        RedfishAPIcall "${bmcInfo}" "${bmcURL}" DELETE \
                            "Managers/${bmcMgrId}/Jobs/${jobId}" ||
                            true
                    } || continue 2
                done
                break
            done
        ) || es=$?;;
    (*) : "Unknown method: ${wipeMethod}"; es=1;;
    esac
    return ${es}
}


# Chisel basic auth (disable `xtrace` while reading secrets).
+set +x
+chiselCrdUsr="$(cat "/var/run/secrets/chisel/chisel-usr--${OCP__ABI__TEAM_NAME}")"
+chiselCrdPwd="$(cat "/var/run/secrets/chisel/chisel-pwd--${OCP__ABI__TEAM_NAME}")"
+set -x
+
+# Background tasks are tracked in `taskPIDs`; on exit, terminate and reap
+# whatever is still running.
+trap 'HandleSIGCHLD' CHLD
+trap '
+    ((${#taskPIDs[@]})) && {
+        kill "${taskPIDs[@]}" 2>/dev/null || true
+        wait "${taskPIDs[@]}" 2>/dev/null || true
+    }
+' EXIT
+
+# Restore OCP Installation information from previous Step.
+mkdir -p "${OCP__ABI__CLUSTER_DIR}"
+tar zxf "${SHARED_DIR}/ocpClusterInf.tgz" -C "${OCP__ABI__CLUSTER_DIR}/"
+rm -f "${SHARED_DIR}/ocpClusterInf.tgz"
+
+# Chisel / ISO URL (Job Conf. YAML key `.tests[*].steps.env`); fail before `agent create image`.
+# Note: One check per variable. In a single `[ … ] && [ … ] && [ … ]` list,
+#       `errexit` only reacts to the LAST test, so an empty first or second
+#       variable would go unnoticed.
+: "${OCP__ABI__TUN_SVC__DP_BASE_URL:?must be set and not empty}"
+: "${OCP__ABI__TUN_SVC__DP_PORT:?must be set and not empty}"
+: "${OCP__ABI__TUN_SVC__CP_URL:?must be set and not empty}"
+
+# ISO Creation Phase.
+unset KUBECONFIG # If set, the certificate inside it will be embedded to ISO, causing mismatch during `wait-for ...`.
+openshift-install agent create image
+# With `nullglob`, no match expands to nothing, so the `-f` test below fails
+# instead of testing the literal pattern.
+isoFile="$(
+    shopt -s nullglob
+    echo "${OCP__ABI__CLUSTER_DIR}"/agent.*.iso
+)"
+[ -f "${isoFile}" ]
+isoURL="${OCP__ABI__TUN_SVC__DP_BASE_URL%%/}/${OCP__ABI__TUN_SVC__DP_PORT}/${isoFile##*/}"
+
+# Local HTTP serves the ISO (`HTTP Range` required by some BMC / virtual-media stacks); `ARTIFACT_DIR` holds logs.
+{
+    python3 - "${httpSvcPort}" "${OCP__ABI__CLUSTER_DIR}" 0<<'pyEOF'
+import functools
+import http.server
+import os
+import shutil
+import sys
+from datetime import datetime, timezone
+
+
+class RangeHandler(http.server.SimpleHTTPRequestHandler):
+    """Static file handler that additionally honours single `Range: bytes=` requests."""
+
+    def send_head(self):
+        """Send the response headers; return the open file positioned at the first byte to send."""
+        path = self.translate_path(self.path)
+        if os.path.isdir(path):
+            return super().send_head()
+        try:
+            f = open(path, 'rb')
+        except OSError:
+            self.send_error(404)
+            return None
+        fs = os.fstat(f.fileno())
+        size = fs.st_size
+        ctype = self.guess_type(path)
+        rng = self.headers.get('Range', '')
+        start, end = 0, size - 1
+        if rng.startswith('bytes='):
+            try:
+                s, e = rng[6:].split('-', 1)
+                if s:
+                    # `bytes=A-` or `bytes=A-B`.
+                    start = int(s)
+                    end = int(e) if e else size - 1
+                else:
+                    # Suffix form `bytes=-N`: the LAST N bytes of the file,
+                    # not the range `0-N`.
+                    start = max(size - int(e), 0)
+                    end = size - 1
+            except ValueError:
+                f.close()
+                self.send_error(400)
+                return None
+            end = min(end, size - 1)
+            if start > end:
+                f.close()
+                self.send_error(416)
+                return None
+            f.seek(start)
+            # Tell `copyfile()` how many bytes belong to this range.
+            self._copy_length = end - start + 1
+            self.send_response(206)
+            self.send_header('Content-Range', f'bytes {start}-{end}/{size}')
+        else:
+            self._copy_length = None
+            self.send_response(200)
+        length = end - start + 1
+        self.send_header('Content-type', ctype)
+        self.send_header('Content-Length', str(length))
+        self.send_header('Accept-Ranges', 'bytes')
+        self.send_header('Last-Modified', self.date_time_string(fs.st_mtime))
+        self.end_headers()
+        return f
+
+    def copyfile(self, source, outputfile):
+        """Copy the whole file, or only the pending range; ignore a client that hangs up."""
+        try:
+            remaining = getattr(self, '_copy_length', None)
+            if remaining is not None:
+                self._copy_length = None
+                buf = shutil.COPY_BUFSIZE
+                while remaining > 0:
+                    data = source.read(min(buf, remaining))
+                    if not data:
+                        break
+                    outputfile.write(data)
+                    remaining -= len(data)
+            else:
+                super().copyfile(source, outputfile)
+        except (BrokenPipeError, ConnectionResetError):
+            pass
+
+    def log_message(self, fmt, *args):
+        """Log the message with a UTC timestamp, followed by the request headers."""
+        # `headers` does not exist yet when an error is logged before the
+        # request headers were parsed (e.g. a malformed request line).
+        headers = getattr(self, 'headers', None)
+        hdrs = ''.join(f' {k}: {v}\n' for k, v in (headers.items() if headers else ()))
+        sys.stderr.write(
+            f'[{datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")}] {fmt % args}\n'
+            f'{hdrs}'
+        )
+        sys.stderr.flush()
+
+
+http.server.test(
+    HandlerClass=functools.partial(
+        RangeHandler,
+        directory=(sys.argv[2] if len(sys.argv) > 2 else '.'),
+    ),
+    port=int(sys.argv[1]) if len(sys.argv) > 1 else 8080,
+    bind='0.0.0.0',
+)
+pyEOF
+} 1> "${ARTIFACT_DIR}/ocp--installer--httpd.log" 2>&1 & taskPIDs+=($!)
+
+# Chisel reverse tunnel (CI has no ingress to the test pod).
+set +x
+chisel client \
+    --auth "${chiselCrdUsr}:${chiselCrdPwd}" \
+    "${OCP__ABI__TUN_SVC__CP_URL%%/}/" \
+    "R:0.0.0.0:${OCP__ABI__TUN_SVC__DP_PORT}:localhost:${httpSvcPort}" \
+    1> "${ARTIFACT_DIR}/ocp--installer--chisel.log" 2>&1 & taskPIDs+=($!)
+set -x
+
+# Probe BMC-facing ISO URL over the tunnel (HTTP `HEAD`). `sleep` before `curl` so the Chisel reverse tunnel can finish coming up.
+(
+    typeset -i tryLeft=5
+    while ((tryLeft)); do
+        sleep 5
+        curl -fsSL -I -o /dev/null \
+            --connect-timeout 2 --max-time 5 \
+            "${isoURL}" && break
+        ((--tryLeft))
+    done
+)
+
+# Reboot Nodes into OCP Agent Installation ISO.
+({
+    typeset bmcURL='' bmcVend='' bmcSysId='' bmcMgrId=''
+    typeset diskWipeMethod=''
+    typeset -i tryLeft=0 didBMCwipe=0
+    typeset -i myPID="${BASHPID}"
+    # Parent PID of this Sub-Shell; the loops below and `WipeDisks()` stop
+    # once that process is gone.
+    typeset -i tPID
+    tPID="$(ps -o ppid= -p "${myPID}")"
+    while IFS= read -r bmcURL; do
+        # Auto-discover BMC Vendor and Identifiers.
+        bmcVend=$(
+            RedfishAPIcall "${bmcInfo}" "${bmcURL}" GET '' |
+            jq -r '.Vendor // "Unknown"'
+        )
+        bmcSysId=$(
+            RedfishAPIcall "${bmcInfo}" "${bmcURL}" GET 'Systems' |
+            jq -r '.Members[0]["@odata.id"] | split("/")[-1]'
+        )
+        bmcMgrId=$(
+            RedfishAPIcall "${bmcInfo}" "${bmcURL}" GET 'Managers' |
+            jq -r '.Members[0]["@odata.id"] | split("/")[-1]'
+        )
+
+        # Vendor-specific preparation.
+        case ${bmcVend} in
+            (Dell)
+                # Ignore Cert. on `.RFS.1` (VirtualMedia/CD).
+                RedfishAPIcall "${bmcInfo}" "${bmcURL}" PATCH \
+                    "Managers/${bmcMgrId}/Attributes" \
+                    -d '{"Attributes": {"RFS.1.IgnoreCertWarning": "Yes"}}'
+                ;;
+            # Any other Vendor is treated as a failure.
+            (*) false;;
+        esac
+
+        # Ensure booting to ISO.
+        # Note: The BMC does not always guarantee booting to ISO. It has been
+        #       observed the Node can boot to the old OS, despite
+        #       `BootSourceOverrideEnabled=Continuous`. The `WipeDisks()`
+        #       implements ISO Boot detection and fails if it detects the Node
+        #       booted to the old OS, so the entire boot attempt can be retried.
+        tryLeft=5 didBMCwipe=0
+        while ((tryLeft)); do
+            kill -0 "${tPID}" 2>/dev/null || break
+            diskWipeMethod=''
+
+            # Eject previously mounted media.
+            VCD-Eject "${bmcInfo}" "${bmcURL}" "${bmcMgrId}"
+            # Set Boot Order.
+            {
+                # Try to set to VCD for wiping Disks via Host OS.
+                RedfishAPIcall "${bmcInfo}" "${bmcURL}" PATCH \
+                    "Systems/${bmcSysId}" \
+                    -d '{"Boot": {
+                        "BootSourceOverrideEnabled": "Continuous",
+                        "BootSourceOverrideTarget": "Cd"
+                    }}' &&
+                diskWipeMethod=OS
+            } || ((didBMCwipe)) || {
+                # Fallback to wiping Disks via BMC.
+                # Note: Currently, we do not have a solution to perform BMC
+                #       wipe on Dell BOSS Disk, if it is set as RAID. The BOSS RAID
+                #       Disk is most likely used as the Boot Disk, hence the above
+                #       ISO Booting instability issue may still cause the Node to
+                #       boot to the old OS.
+                WipeDisks "${tPID}" "${bmcInfo}" \
+                    "${bmcURL}" "${bmcSysId}" "${bmcMgrId}" \
+                    BMC
+                didBMCwipe=1
+            }
+            # Mount ISO.
+            RedfishAPIcall "${bmcInfo}" "${bmcURL}" POST \
+                "Managers/${bmcMgrId}/VirtualMedia/CD/Actions/VirtualMedia.InsertMedia" \
+                -d "$(
+                    jq -cnr \
+                        --arg img "${isoURL}" \
+                        '{
+                            "Image": $img,
+                            "TransferProtocolType": "HTTPS",
+                            "TransferMethod": "Stream"
+                        }'
+                )"
+            # Set boot `Once` if BMC Wipe (`Continuous` not supported).
+            [ -n "${diskWipeMethod}" ] || {
+                RedfishAPIcall "${bmcInfo}" "${bmcURL}" PATCH \
+                    "Systems/${bmcSysId}" \
+                    -d '{"Boot": {
+                        "BootSourceOverrideEnabled": "Once",
+                        "BootSourceOverrideTarget": "Cd"
+                    }}'
+            }
+            # Restart Host.
+            Host-PowerControl "${bmcInfo}" "${bmcURL}" "${bmcSysId}" ForceRestart
+            # Wipe Disks via Host OS (only detect ISO Boot for BMC Wipe).
+            WipeDisks "${tPID}" "${bmcInfo}" \
+                "${bmcURL}" "${bmcSysId}" "${bmcMgrId}" \
+                "${diskWipeMethod}" && break
+            ((--tryLeft))
+        done
+        # Restore Boot Order.
+        [ -z "${diskWipeMethod}" ] || {
+            RedfishAPIcall "${bmcInfo}" "${bmcURL}" PATCH \
+                "Systems/${bmcSysId}" \
+                -d '{"Boot": {
+                    "BootSourceOverrideEnabled": "Disabled",
+                    "BootSourceOverrideTarget": "None"
+                }}'
+        }
+    done < <(jq -r '
+        .[] | .url
+    ' 0< "${bmcInfo}")
+} |& tee "${ARTIFACT_DIR}/ocp--installer--bmc.log") & taskPIDs+=($!)
+# Wait for BootStrap Node to finish.
+(
+    typeset -i tryLeft="${OCP__ABI__WAIT__BOOTSTRAP__TRY}"
+    while ((tryLeft)); do
+        openshift-install agent wait-for bootstrap-complete && break
+        ((--tryLeft))
+    done
+)
+cp -f "${OCP__ABI__CLUSTER_DIR}/auth/kubeconfig" "${SHARED_DIR}/kubeconfig-minimal"
+
+# Day-1.5 Phase.
+(
+    typeset cfgKey='' cfgVal=''
+    export KUBECONFIG="${OCP__ABI__CLUSTER_DIR}/auth/kubeconfig"
+    # One `key<TAB>value` line per entry of `."Day1.5".config`.
+    while IFS=$'\t' read -r cfgKey cfgVal; do
+        case ${cfgKey} in
+            (NodeProv)
+                [ "${cfgVal}" = false ] && {
+                    # Workers are provisioned by ABI. No
+                    # BareMetalHost CRDs or Ironic
+                    # provisioning network.
+                    while true; do
+                        oc -n openshift-machine-api \
+                            scale MachineSets \
+                            --replicas 0 --all \
+                            && break || sleep 60
+                    done
+                }
+                ;;
+        esac
+    done 0< <(
+        yq -o json eval '
+            ."Day1.5".config // []
+        ' "${ocpABIcfg}" |
+        jq -r '
+            .[] | to_entries[] |
+            [.key, (.value | tostring)] | join("\t")
+        '
+    )
+    true
+) & taskPIDs+=($!)
+# Wait for OCP installation to complete (`install-complete` can be slow with many workers).
+(
+    typeset -i tryLeft="${OCP__ABI__WAIT__CLUSTER__TRY}"
+    while ((tryLeft)); do
+        openshift-install agent wait-for install-complete && break
+        ((--tryLeft))
+    done
+)
+
+# Eject virtual media on all nodes (ISO no longer needed after install).
+# Note: The Manager Id is looked up again for every BMC listed in `${bmcInfo}`.
+while IFS= read -r bmcURL; do
+    VCD-Eject "${bmcInfo}" "${bmcURL}" "$(
+        RedfishAPIcall "${bmcInfo}" "${bmcURL}" GET 'Managers' |
+        jq -r '.Members[0]["@odata.id"] | split("/")[-1]'
+    )"
+done < <(jq -r '.[] | .url' 0< "${bmcInfo}")
+
+# Collect cluster authentication artifacts.
+tar zcf "${ARTIFACT_DIR}/ocp.tgz" -C "${OCP__ABI__CLUSTER_DIR}/" auth/
+# The Brace Expansion yields `kubeconfig` and `kubeadmin-password`.
+cp -f "${OCP__ABI__CLUSTER_DIR}/auth/kube"{config,admin-password} "${SHARED_DIR}/"
+
+# From here on, use the copy under `${SHARED_DIR}/`.
+export KUBECONFIG="${SHARED_DIR}/kubeconfig"
+[ -f "${KUBECONFIG}" ]
+
+# Upload `KUBECONFIG` to BitWarden.
+# Skipped when `BW__OBJ_NAME` is empty.
+[ -z "${BW__OBJ_NAME}" ] || Vault--BitWarden--UploadAttachment \
+    "${BW__OBJ_NAME}" /var/run/secrets/vault--bit-warden/SvcAcc-RW "${KUBECONFIG}"
+
+# Ensure Nodes readiness before Day-2 customization.
+oc wait node --all --for=condition=Ready --timeout=300s
+
+# Post-Deployment Customization.
+# The helper receives the NAME of the Env. Var., and its output is evaluated as Shell code.
+eval "$(BuildCustomScriptsFromYAML OCP__ABI__DAY2_SCRIPTS_YAML)"
diff --git a/ci-operator/step-registry/abi/install/bmc/abi-install-bmc-ref.metadata.json b/ci-operator/step-registry/abi/install/bmc/abi-install-bmc-ref.metadata.json new file mode 100644 index 0000000000000..84905fc1dcf90 --- /dev/null +++ b/ci-operator/step-registry/abi/install/bmc/abi-install-bmc-ref.metadata.json @@ -0,0 +1,13 @@ +{ + "path": "abi/install/bmc/abi-install-bmc-ref.yaml", + "owners": { + "approvers": [ + "cspi-qe-ocp-lp", + "ieng-chaos" + ], + "reviewers": [ + "cspi-qe-ocp-lp", + "ieng-chaos" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/abi/install/bmc/abi-install-bmc-ref.yaml b/ci-operator/step-registry/abi/install/bmc/abi-install-bmc-ref.yaml new file mode 100644 index 0000000000000..bb18b01831622 --- /dev/null +++ b/ci-operator/step-registry/abi/install/bmc/abi-install-bmc-ref.yaml @@ -0,0 +1,113 @@ +ref: + as: abi-install-bmc + from_image: + namespace: ci + name: baremetal-qe-base + tag: latest + cli: latest + commands: abi-install-bmc-commands.sh + resources: + requests: + cpu: '1' + memory: 512Mi + grace_period: 5m +
credentials: + - namespace: test-credentials + name: chisel-creds + mount_path: /var/run/secrets/chisel + - namespace: test-credentials + name: ieng--vault--bit-warden + mount_path: /var/run/secrets/vault--bit-warden + - namespace: test-credentials + name: registry-pull-credentials + mount_path: /var/run/secrets/registry-pull--build-farms + env: + - name: OCP__ABI__CLUSTER_DIR + default: ocpClusterDir + documentation: |- + The `openshift-install` workspace path (preferred: relative to the Step's CWD); must match the corresponding conf. Step, e.g., **abi-conf-bm**, + when overriding. + - name: OCP__ABI__CFG_FN + default: ocp--abi--cfg.yaml + documentation: |- + The ABI configuration file Base Name under `${CLUSTER_PROFILE_DIR}/`; must match the corresponding conf. Step, e.g., **abi-conf-bm**, when + overriding. See + [OCP__ABI__CFG_FN](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md#ocp__abi__cfg_fn) for details. + - name: OCP__ABI__INSTLR_LOG_LEVEL + default: 'debug' + documentation: |- + Log level for `openshift-install` (`--log-level`). + - name: OCP__ABI__TEAM_NAME + documentation: |- + Required (no default). Vault-synced Chisel basic-auth files under `/var/run/secrets/chisel` use this value as the filename suffix (`chisel-usr--…`, + `chisel-pwd--…`). + - name: OCP__ABI__TUN_SVC__DP_BASE_URL + documentation: |- + Required (no default). BMC-facing HTTPS prefix for the install ISO on the data plane (no trailing slash), e.g. `https://<host>/dp`. + - name: OCP__ABI__TUN_SVC__DP_PORT + documentation: |- + Required (no default). Server-side data-plane port in the Chisel reverse tunnel and in the ISO URL path. Each Cluster Under Test must reserve a + distinct port when several jobs share one Chisel server (see + [tunneling information](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md#tunneling--chisel) for details). The + served ISO URL is `${OCP__ABI__TUN_SVC__DP_BASE_URL}/${OCP__ABI__TUN_SVC__DP_PORT}/<ISO file name>`.
+ - name: OCP__ABI__TUN_SVC__CP_URL + documentation: |- + Required (no default). Control-plane URL for `chisel client` (e.g. `https://<host>/cp`). + - name: OCP__ABI__WAIT__NODE_READY__M + default: '20' + documentation: |- + Maximum wait time (in min.) for the Node to be ready (reachable by SSH). When this time has elapsed, it is assumed that the Node is stuck at booting + phase. + Recommended value: + - 2 * AverageNodeBootTimeInMin + - AverageNodeBootTimeInMin=10 -> 20 + - name: OCP__ABI__WAIT__BOOTSTRAP__TRY + default: '3' + documentation: |- + Maximum attempts for `openshift-install agent wait-for bootstrap-complete` (each attempt is subject to the installer's internal timeout, currently + about 60 min.). + + Recommended value: + - RoundUp((1.5 * AverageNodeBootTimeInMin) * NumOfNodes / InstallerTimeOutInMin) + 1 + - AverageNodeBootTimeInMin=10, NumOfNodes=6, InstallerTimeOutInMin=60 -> 3 + - name: OCP__ABI__WAIT__CLUSTER__TRY + default: '3' + documentation: |- + Maximum attempts for `openshift-install agent wait-for install-complete`. + + Currently `agent wait-for install-complete` does the following: + - Wait for `bootstrap` to complete (max. 60 min.). + - Cluster initialization to complete (max. 40 min.). + - Cluster Operators to be completely deployed (max. 30 min.). + + Since the Step Script calls this AFTER `wait-for bootstrap-complete`, this means the first loop max. may wait up to 40 min. (the Cluster did not get + initialized in this attempt) or up to 70 min. However, most likely the Cluster initialization is completed in the 1st attempt. The maximum wait time + for all Cluster Operators to stabilize depends on the Node hardware.
+ + Expected completion time, based on Node hardware: + - Bare Metals: + - Cluster Operator `baremetal`: + - Dell PowerEdge R6615 | AMD EPYC 9634 84-Core Processor: < 1 h (6 Nodes) + + Recommended value: + - RoundUp(ExpectedCOstableTimeInMin / 30) + 1 + + - name: OCP__ABI__DAY2_SCRIPTS_YAML + default: 'Scripts: []' + documentation: |- + Phase Customization Script for Day-2 + (see [Phase Customization Scripts](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md#phase-customization-scripts)). + - name: BW__OBJ_NAME + default: '' + documentation: |- + Name of the BitWarden Object (Login, Note, etc.) to attach the `KUBECONFIG`. If set to empty string, the `KUBECONFIG` is not uploaded and will be lost + after the test is completed and the CUT (Cluster Under Test) will not be reachable at all. + documentation: |- + The `${CLUSTER_PROFILE_DIR}/` MUST contain files: `ssh-privatekey` and `${OCP__ABI__CFG_FN}`. + + See [ABI overview](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md) for details. + + The **Chisel** tunnel service must be exclusively used by one test at a time, hence it must support configurable data-plane port forwarding via URL path + (see `OCP__ABI__TUN_SVC__DP_PORT`); refer to + [tunneling information](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md#tunneling--chisel) for details. The + required tunnel-related parameters are documented on their respective Env. Var. entries. 
diff --git a/ci-operator/step-registry/abi/workflows/OWNERS b/ci-operator/step-registry/abi/workflows/OWNERS new file mode 120000 index 0000000000000..ec405d65a79df --- /dev/null +++ b/ci-operator/step-registry/abi/workflows/OWNERS @@ -0,0 +1 @@ +../OWNERS \ No newline at end of file diff --git a/ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/OWNERS b/ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/OWNERS new file mode 120000 index 0000000000000..ec405d65a79df --- /dev/null +++ b/ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/OWNERS @@ -0,0 +1 @@ +../OWNERS \ No newline at end of file diff --git a/ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/abi-workflows-bm--bmc--cluster-health-workflow.metadata.json b/ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/abi-workflows-bm--bmc--cluster-health-workflow.metadata.json new file mode 100644 index 0000000000000..3db2961962d4d --- /dev/null +++ b/ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/abi-workflows-bm--bmc--cluster-health-workflow.metadata.json @@ -0,0 +1,13 @@ +{ + "path": "abi/workflows/bm--bmc--cluster-health/abi-workflows-bm--bmc--cluster-health-workflow.yaml", + "owners": { + "approvers": [ + "cspi-qe-ocp-lp", + "ieng-chaos" + ], + "reviewers": [ + "cspi-qe-ocp-lp", + "ieng-chaos" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/abi-workflows-bm--bmc--cluster-health-workflow.yaml b/ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/abi-workflows-bm--bmc--cluster-health-workflow.yaml new file mode 100644 index 0000000000000..cb05cde169b0f --- /dev/null +++ b/ci-operator/step-registry/abi/workflows/bm--bmc--cluster-health/abi-workflows-bm--bmc--cluster-health-workflow.yaml @@ -0,0 +1,12 @@ +workflow: + as: abi-workflows-bm--bmc--cluster-health + steps: + allow_best_effort_post_steps: true + pre: + - chain: abi-chains-bm--bmc + - chain: 
cucushift-installer-check-cluster-health + post: [] + documentation: |- + This Workflow deploys OpenShift Container Platform (OCP) on Bare Metal with BMC, then performs a Cluster Health Check. + + See [ABI overview](https://github.com/openshift/release/blob/main/ci-operator/step-registry/abi/README.md) for details.