Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions ci-operator/step-registry/rosa/pool/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
approvers:
- dustman9000
- joshbranham
- bmeng
- ravitri
reviewers:
- dustman9000
- joshbranham
- bmeng
- ravitri
10 changes: 10 additions & 0 deletions ci-operator/step-registry/rosa/pool/checkin/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
approvers:
- dustman9000
- joshbranham
- bmeng
- ravitri
reviewers:
- dustman9000
- joshbranham
- bmeng
- ravitri
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash

set -o nounset
set -o errexit
set -o pipefail

# Print a bold, timestamped message on stderr.
# Arguments: any number of words; they are joined via "$*".
# Outputs:   one line on stderr wrapped in the ANSI bold / reset sequences.
log() {
  local stamp
  # "|| true" keeps a failing date from aborting the caller under errexit.
  stamp="$(date "+%d-%m-%YT%H:%M:%S")" || true
  echo -e "\033[1m${stamp} " "${*}\033[0m" >&2
}

# Settings for the check-in step.
#   POOL_NAMESPACE        namespace holding the pool ConfigMaps (overridable)
#   POOL_HOST_KUBECONFIG  kubeconfig used to reach the pool host cluster
#   CLAIM_FILE            file holding the name of the claimed ConfigMap
POOL_NAMESPACE="${POOL_NAMESPACE:-rosa-pool}"
POOL_HOST_KUBECONFIG="/etc/rosa-pool-manager/kubeconfig"
CLAIM_FILE="${SHARED_DIR}/pool-claim"

# Every guard below exits 0: when there is nothing to release, or no way to
# release it, the step ends quietly instead of failing.
[[ -f "${CLAIM_FILE}" ]] || {
  log "No pool claim found (${CLAIM_FILE} does not exist). Nothing to check in."
  exit 0
}

CM_NAME="$(cat "${CLAIM_FILE}")"
[[ -n "${CM_NAME}" ]] || {
  log "Pool claim file is empty. Nothing to check in."
  exit 0
}

[[ -f "${POOL_HOST_KUBECONFIG}" ]] || {
  log "WARNING: Pool host kubeconfig not found. Cannot check in ${CM_NAME}."
  log "The health check job will recover this cluster via stale lease detection."
  exit 0
}

# Run oc against the pool host cluster.
# Globals:   POOL_HOST_KUBECONFIG (read)
# Arguments: passed through to oc unchanged
# Returns:   the exit status of oc
pool_oc() {
  local -a kubeconfig_flag=("--kubeconfig=${POOL_HOST_KUBECONFIG}")
  oc "${kubeconfig_flag[@]}" "$@"
}

RELEASED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

log "Checking in pool cluster: ${CM_NAME}"

# Merge patch that flips the status label back to "available", blanks the
# holder and build-id annotations, and records the release time.
RELEASE_PATCH="$(cat <<EOF
{
  "metadata": {
    "labels": {
      "rosa-pool/status": "available"
    },
    "annotations": {
      "rosa-pool/holder": "",
      "rosa-pool/build-id": "",
      "rosa-pool/released-at": "${RELEASED_AT}"
    }
  }
}
EOF
)"

# A failed patch is only logged; the script still finishes with status 0.
if ! pool_oc patch configmap "${CM_NAME}" -n "${POOL_NAMESPACE}" --type merge -p "${RELEASE_PATCH}"; then
  log "WARNING: Failed to check in ${CM_NAME}. Health check will recover it."
else
  log "Cluster ${CM_NAME} returned to pool"
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"path": "rosa/pool/checkin/rosa-pool-checkin-ref.yaml",
"owners": {
"approvers": [
"dustman9000",
"joshbranham",
"bmeng",
"ravitri"
],
"reviewers": [
"dustman9000",
"joshbranham",
"bmeng",
"ravitri"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
ref:
as: rosa-pool-checkin
from: cli
best_effort: true
grace_period: 1m
commands: rosa-pool-checkin-commands.sh
resources:
requests:
cpu: 100m
memory: 128Mi
timeout: 5m0s
credentials:
- namespace: test-credentials
name: rosa-pool-manager
mount_path: /etc/rosa-pool-manager
env:
- name: POOL_NAMESPACE
default: "rosa-pool"
documentation: Namespace on the pool host cluster where pool ConfigMaps live.
documentation: |-
Returns a ROSA cluster to the standby pool after testing.

Reads the pool claim from SHARED_DIR/pool-claim (written by
rosa-pool-checkout) and patches the ConfigMap back to available
status, clearing the holder and build-id annotations.

Runs as best_effort so job cleanup always proceeds even if
checkin fails. A periodic health check job recovers stale leases.
10 changes: 10 additions & 0 deletions ci-operator/step-registry/rosa/pool/checkout/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
approvers:
- dustman9000
- joshbranham
- bmeng
- ravitri
reviewers:
- dustman9000
- joshbranham
- bmeng
- ravitri
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#!/bin/bash

set -o nounset
set -o errexit
set -o pipefail

# On SIGTERM, kill every background job of this shell (as listed by `jobs -p`)
# and wait for them. ${CHILDREN} is deliberately unquoted so that each PID is
# passed to kill as a separate argument.
trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM

# Print a bold, timestamped message on stderr.
# Arguments: any number of words; they are joined via "$*".
# Outputs:   one line on stderr wrapped in the ANSI bold / reset sequences.
log() {
  local stamp
  # "|| true" keeps a failing date from aborting the caller under errexit.
  stamp="$(date "+%d-%m-%YT%H:%M:%S")" || true
  echo -e "\033[1m${stamp} " "${*}\033[0m" >&2
}

# Checkout configuration, each value overridable through the environment.
#   POOL_NAMESPACE         namespace holding the pool ConfigMaps
#   POOL_TYPE              value matched against the rosa-pool/type label
#   POOL_REGION            optional rosa-pool/region filter (empty = any)
#   POOL_VERSION           optional rosa-pool/version filter (empty = any)
#   POOL_CHECKOUT_TIMEOUT  minutes to keep retrying, from
#                          POOL_CHECKOUT_TIMEOUT_MINUTES
#   POOL_HOST_KUBECONFIG   kubeconfig used to reach the pool host cluster
#   OCM_LOGIN_ENV          value passed to `ocm login --url`
POOL_NAMESPACE="${POOL_NAMESPACE:-rosa-pool}"
POOL_TYPE="${POOL_TYPE:-classic-sts}"
POOL_REGION="${POOL_REGION:-}"
POOL_VERSION="${POOL_VERSION:-}"
POOL_CHECKOUT_TIMEOUT="${POOL_CHECKOUT_TIMEOUT_MINUTES:-30}"
POOL_HOST_KUBECONFIG="/etc/rosa-pool-manager/kubeconfig"
OCM_LOGIN_ENV="${OCM_LOGIN_ENV:-staging}"

# Succeeds only when $1 consists solely of decimal digits.
is_non_negative_int() {
  [[ "${1:-}" =~ ^[0-9]+$ ]]
}

# The timeout is used inside $(( ... )) later on. Anything that is not a plain
# number would either abort with a cryptic arithmetic error or be evaluated as
# an arithmetic expression, so reject it here with a clear message.
if ! is_non_negative_int "${POOL_CHECKOUT_TIMEOUT}"; then
  log "ERROR: POOL_CHECKOUT_TIMEOUT_MINUTES must be a non-negative integer, got '${POOL_CHECKOUT_TIMEOUT}'"
  exit 1
fi
# Force base 10 so a value with leading zeros (e.g. "08") is not read as octal.
POOL_CHECKOUT_TIMEOUT=$((10#${POOL_CHECKOUT_TIMEOUT}))

# Without the pool host kubeconfig no cluster can be claimed, so fail fast.
[[ -f "${POOL_HOST_KUBECONFIG}" ]] || {
  log "ERROR: Pool host kubeconfig not found at ${POOL_HOST_KUBECONFIG}"
  exit 1
}

# Run oc against the pool host cluster.
# Globals:   POOL_HOST_KUBECONFIG (read)
# Arguments: passed through to oc unchanged
# Returns:   the exit status of oc
pool_oc() {
  local -a kubeconfig_flag=("--kubeconfig=${POOL_HOST_KUBECONFIG}")
  oc "${kubeconfig_flag[@]}" "$@"
}

# Assemble the label selector: three fixed terms, plus a region and/or version
# term when the corresponding filter is non-empty. Terms are joined with commas.
selector_terms=(
  "rosa-pool/managed=true"
  "rosa-pool/status=available"
  "rosa-pool/type=${POOL_TYPE}"
)
[[ -z "${POOL_REGION}" ]] || selector_terms+=("rosa-pool/region=${POOL_REGION}")
[[ -z "${POOL_VERSION}" ]] || selector_terms+=("rosa-pool/version=${POOL_VERSION}")
SELECTOR="$(IFS=,; printf '%s' "${selector_terms[*]}")"

# Startup banner describing what this checkout is looking for.
log "Pool checkout starting"
log " Type: ${POOL_TYPE}"
log " Region: ${POOL_REGION:-any}"
log " Version: ${POOL_VERSION:-any}"
log " Timeout: ${POOL_CHECKOUT_TIMEOUT} minutes"
log " Selector: ${SELECTOR}"

# Holder identity recorded on a claimed ConfigMap, with placeholders when the
# environment does not provide them.
JOB_NAME="${JOB_NAME:-unknown-job}"
BUILD_ID="${BUILD_ID:-unknown-build}"

# DEADLINE is an absolute epoch-seconds timestamp; ATTEMPT counts loop passes.
checkout_started_at="$(date +%s)"
DEADLINE=$(( checkout_started_at + POOL_CHECKOUT_TIMEOUT * 60 ))
ATTEMPT=0

# Claim loop: list the available pool ConfigMaps and try to claim one, repeating
# until a claim succeeds or DEADLINE (epoch seconds) has passed.
# Reads:  POOL_NAMESPACE, SELECTOR, DEADLINE, POOL_CHECKOUT_TIMEOUT, JOB_NAME,
#         BUILD_ID, SHARED_DIR
# Writes: ATTEMPT, CLAIMED, CM_NAME, CLUSTER_ID, CLUSTER_NAME and, on success,
#         the pool-claim / cluster-* / ocm-env / api-url files in SHARED_DIR
# Exits:  1 when the deadline passes without a successful claim
while true; do
  NOW=$(date +%s)
  if (( NOW >= DEADLINE )); then
    log "ERROR: Pool checkout timed out after ${POOL_CHECKOUT_TIMEOUT} minutes"
    log "No available clusters matching selector: ${SELECTOR}"
    exit 1
  fi

  REMAINING=$(( (DEADLINE - NOW) / 60 ))
  ATTEMPT=$((ATTEMPT + 1))
  log "Attempt ${ATTEMPT} (${REMAINING}m remaining)"

  # List available clusters. A failed listing is still treated as an empty
  # pool so the loop keeps retrying, but oc's stderr stays visible and a
  # warning is logged so auth or network problems are not mistaken for an
  # exhausted pool.
  if ! CLUSTERS_JSON=$(pool_oc get configmap -n "${POOL_NAMESPACE}" -l "${SELECTOR}" -o json); then
    log "WARNING: Failed to list pool ConfigMaps; treating the pool as empty for this attempt"
    CLUSTERS_JSON='{"items":[]}'
  fi
  COUNT=$(jq '.items | length' <<<"${CLUSTERS_JSON}")

  if (( COUNT == 0 )); then
    log "No available clusters in pool. Waiting 30s..."
    sleep 30
    continue
  fi

  log "Found ${COUNT} available cluster(s), attempting claim..."

  # Try to claim each available cluster
  CLAIMED=false
  for (( i = 0; i < COUNT; i++ )); do
    CM=$(jq ".items[${i}]" <<<"${CLUSTERS_JSON}")
    CM_NAME=$(jq -r '.metadata.name' <<<"${CM}")
    # "// empty" turns a missing key into an empty string instead of "null".
    CLUSTER_ID=$(jq -r '.data["cluster-id"] // empty' <<<"${CM}")
    CLUSTER_NAME=$(jq -r '.data["cluster-name"] // empty' <<<"${CM}")

    # Claiming an entry without a cluster ID would only fail later, after the
    # lease has already been taken, so skip it up front.
    if [[ -z "${CLUSTER_ID}" ]]; then
      log "WARNING: ${CM_NAME} has no cluster-id in its data. Skipping..."
      continue
    fi

    log "Trying to claim ${CM_NAME} (cluster: ${CLUSTER_ID})..."

    # Mutate the ConfigMap in memory: set status to in-use with holder info.
    # Values are handed to jq with --arg so that quotes or backslashes in them
    # cannot break or alter the jq program.
    ACQUIRED_AT=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    MODIFIED=$(jq \
      --arg holder "${JOB_NAME}" \
      --arg build_id "${BUILD_ID}" \
      --arg acquired_at "${ACQUIRED_AT}" \
      '.metadata.labels["rosa-pool/status"] = "in-use"
       | .metadata.annotations["rosa-pool/holder"] = $holder
       | .metadata.annotations["rosa-pool/build-id"] = $build_id
       | .metadata.annotations["rosa-pool/acquired-at"] = $acquired_at' \
      <<<"${CM}")

    # CAS: oc replace will fail with 409 if resourceVersion changed.
    # stderr is captured (stdout discarded) so a genuine conflict can be told
    # apart from any other failure.
    if REPLACE_ERR=$(pool_oc replace -n "${POOL_NAMESPACE}" -f - <<<"${MODIFIED}" 2>&1 >/dev/null); then
      log "Claimed cluster ${CLUSTER_ID} (${CM_NAME})"
      CLAIMED=true

      # Write claim metadata to SHARED_DIR for checkin and downstream steps.
      # pool-claim goes first so the lease can still be released if a later
      # write fails.
      printf '%s\n' "${CM_NAME}" > "${SHARED_DIR}/pool-claim"
      printf '%s\n' "${CLUSTER_ID}" > "${SHARED_DIR}/cluster-id"
      printf '%s\n' "${CLUSTER_NAME}" > "${SHARED_DIR}/cluster-name"

      # Write cluster metadata for downstream steps
      jq -r '.data.region // empty' <<<"${CM}" > "${SHARED_DIR}/cluster-region"
      jq -r '.data["ocm-env"] // empty' <<<"${CM}" > "${SHARED_DIR}/ocm-env"
      jq -r '.data["api-url"] // empty' <<<"${CM}" > "${SHARED_DIR}/api-url"

      break
    elif [[ "${REPLACE_ERR}" == *"(Conflict)"* || "${REPLACE_ERR}" == *"object has been modified"* ]]; then
      log "Conflict on ${CM_NAME} (another job claimed it). Trying next..."
    else
      log "WARNING: Failed to claim ${CM_NAME}: ${REPLACE_ERR}"
      log "Trying next..."
    fi
  done

  if [[ "${CLAIMED}" == "true" ]]; then
    break
  fi

  log "No cluster could be claimed on this attempt. Waiting 15s..."
  sleep 15
done

# Re-read the claimed cluster ID from the file written at claim time.
CLUSTER_ID="$(cat "${SHARED_DIR}/cluster-id")"
log "Cluster claimed: ${CLUSTER_ID}"

# Log in to OCM for backplane access

# Print the named file from the cluster profile; print nothing (and still
# succeed) when it cannot be read.
read_profile_credential() {
  cat "${CLUSTER_PROFILE_DIR}/${1}" 2>/dev/null || true
}

SSO_CLIENT_ID="$(read_profile_credential sso-client-id)"
SSO_CLIENT_SECRET="$(read_profile_credential sso-client-secret)"
OCM_TOKEN="$(read_profile_credential ocm-token)"

# SSO client credentials take precedence over the offline token.
ocm_login_args=(--url "${OCM_LOGIN_ENV}")
if [[ -n "${SSO_CLIENT_ID}" && -n "${SSO_CLIENT_SECRET}" ]]; then
  log "Logging into OCM ${OCM_LOGIN_ENV} with SSO credentials"
  ocm_login_args+=(--client-id "${SSO_CLIENT_ID}" --client-secret "${SSO_CLIENT_SECRET}")
elif [[ -n "${OCM_TOKEN}" ]]; then
  log "Logging into OCM ${OCM_LOGIN_ENV} with offline token"
  ocm_login_args+=(--token "${OCM_TOKEN}")
else
  log "ERROR: No OCM credentials found in cluster profile"
  exit 1
fi
ocm login "${ocm_login_args[@]}"

# Get cluster access via backplane
log "Getting kubeconfig for ${CLUSTER_ID} via backplane"
ocm backplane login "${CLUSTER_ID}" --multi

# Copy the backplane-generated kubeconfig to SHARED_DIR.
# The per-cluster file is preferred; ${HOME}/.kube/config is the fallback.
BACKPLANE_KUBECONFIG="${HOME}/.kube/backplane/${CLUSTER_ID}/config"
kubeconfig_source=""
for candidate in "${BACKPLANE_KUBECONFIG}" "${HOME}/.kube/config"; do
  if [[ -f "${candidate}" ]]; then
    kubeconfig_source="${candidate}"
    break
  fi
done

if [[ -z "${kubeconfig_source}" ]]; then
  log "ERROR: No kubeconfig produced by backplane login"
  exit 1
fi
cp "${kubeconfig_source}" "${SHARED_DIR}/kubeconfig"

# Verify cluster access
export KUBECONFIG="${SHARED_DIR}/kubeconfig"
if oc whoami &>/dev/null; then
log "Verified cluster access: $(oc whoami --show-server)"
log "Nodes: $(oc get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')"
else
log "WARNING: Could not verify cluster access (oc whoami failed)"
Comment on lines +149 to +163

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Validate kubeconfig target matches the claimed cluster before continuing.

Line 150-Line 163 can succeed with a kubeconfig that authenticates but points at the wrong cluster (fallback path). That can run E2E actions against an unintended target.

Suggested fix
 # Verify cluster access
 export KUBECONFIG="${SHARED_DIR}/kubeconfig"
 if oc whoami &>/dev/null; then
-    log "Verified cluster access: $(oc whoami --show-server)"
+    CURRENT_SERVER="$(oc whoami --show-server)"
+    EXPECTED_SERVER="$(cat "${SHARED_DIR}/api-url" 2>/dev/null || true)"
+    if [[ -n "${EXPECTED_SERVER}" && "${CURRENT_SERVER}" != "${EXPECTED_SERVER}" ]]; then
+        log "ERROR: Kubeconfig server mismatch. Expected ${EXPECTED_SERVER}, got ${CURRENT_SERVER}"
+        exit 1
+    fi
+    log "Verified cluster access: ${CURRENT_SERVER}"
     log "Nodes: $(oc get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')"
 else
     log "WARNING: Could not verify cluster access (oc whoami failed)"
 fi
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ci-operator/step-registry/rosa/pool/checkout/rosa-pool-checkout-commands.sh`
around lines 149 - 163, The kubeconfig fallback can authenticate to a different
cluster; after copying BACKPLANE_KUBECONFIG (or the fallback
${HOME}/.kube/config) to "${SHARED_DIR}/kubeconfig", explicitly verify the
target matches the expected cluster before proceeding: set
KUBECONFIG="${SHARED_DIR}/kubeconfig", extract the server/cluster identifier
from the produced kubeconfig (e.g. via oc whoami --show-server or oc config
view) and compare it to the expected server/cluster derived from
BACKPLANE_KUBECONFIG (or an explicit expected variable); if they differ, log an
error and exit instead of continuing to run E2E commands. Ensure this check is
performed prior to the existing oc whoami/oc get nodes verification.

fi

# Closing summary of what was claimed and where the kubeconfig was written.
log "Pool checkout complete"
log " Cluster ID: ${CLUSTER_ID}"
# "|| true": an unreadable claim file only leaves the value empty.
claimed_configmap="$(cat "${SHARED_DIR}/pool-claim")" || true
log " Pool claim: ${claimed_configmap}"
log " Kubeconfig: ${SHARED_DIR}/kubeconfig"
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"path": "rosa/pool/checkout/rosa-pool-checkout-ref.yaml",
"owners": {
"approvers": [
"dustman9000",
"joshbranham",
"bmeng",
"ravitri"
],
"reviewers": [
"dustman9000",
"joshbranham",
"bmeng",
"ravitri"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
ref:
as: rosa-pool-checkout
from: rosa-aws-cli
grace_period: 2m
commands: rosa-pool-checkout-commands.sh
resources:
requests:
cpu: 100m
memory: 300Mi
timeout: 35m0s
credentials:
- namespace: test-credentials
name: rosa-pool-manager
mount_path: /etc/rosa-pool-manager
env:
- name: POOL_TYPE
default: "classic-sts"
documentation: Label selector value for rosa-pool/type. Filters which pool clusters are eligible.
- name: POOL_REGION
default: ""
documentation: Optional region filter. When set, only clusters with matching rosa-pool/region label are eligible.
- name: POOL_VERSION
default: ""
documentation: Optional version filter. When set, only clusters with matching rosa-pool/version label are eligible.
- name: POOL_CHECKOUT_TIMEOUT_MINUTES
default: "30"
documentation: Maximum time to wait for an available cluster before failing.
- name: POOL_NAMESPACE
default: "rosa-pool"
documentation: Namespace on the pool host cluster where pool ConfigMaps live.
- name: OCM_LOGIN_ENV
default: "staging"
documentation: OCM environment for backplane login.
documentation: |-
Checks out a pre-provisioned ROSA cluster from the standby pool.

Uses ConfigMap-based lease tracking with optimistic concurrency (CAS)
to safely claim a cluster. If no clusters are available, waits and
retries until POOL_CHECKOUT_TIMEOUT_MINUTES. On successful checkout,
logs in via ocm backplane and writes cluster-id, cluster-name, and
kubeconfig to SHARED_DIR.

Pool ConfigMaps live in POOL_NAMESPACE on the pool host cluster.
Each ConfigMap represents one pool cluster with labels for type,
region, version, and availability status.

Requires rosa-pool-manager credentials (kubeconfig for pool host)
and OCM credentials (from cluster profile) for backplane login.
10 changes: 10 additions & 0 deletions ci-operator/step-registry/rosa/pool/e2e-workflow/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
approvers:
- dustman9000
- joshbranham
- bmeng
- ravitri
reviewers:
- dustman9000
- joshbranham
- bmeng
- ravitri
Loading