diff --git a/ci-operator/step-registry/openshift/claude/payload/agent/openshift-claude-payload-agent-commands.sh b/ci-operator/step-registry/openshift/claude/payload/agent/openshift-claude-payload-agent-commands.sh index 8cddaa8a4675b..f6694a58d9b01 100644 --- a/ci-operator/step-registry/openshift/claude/payload/agent/openshift-claude-payload-agent-commands.sh +++ b/ci-operator/step-registry/openshift/claude/payload/agent/openshift-claude-payload-agent-commands.sh @@ -65,19 +65,30 @@ echo "Version: ${VERSION}, Stream: ${STREAM}" echo "Release API: ${API_URL}" echo "Automatic reverts enabled: ${ENABLE_PAYLOAD_REVERT}" -# Poll until all blocking jobs have finished (no Pending jobs remain). -# We can't wait for the payload to reach terminal state because this -# analysis job is itself part of the payload's verification jobs. +# Poll until blocking jobs finish OR the payload reaches a terminal state. +# The release controller can report jobs as Pending even after they complete +# in Prow, so also check the payload phase as a fallback. The phase can be +# briefly wrong at creation time (RC bug), so we require PHASE_GRACE_PERIOD +# to elapse before trusting a terminal phase. MAX_WAIT=36000 # 10 hours in seconds POLL_INTERVAL=300 # 5 minutes +PHASE_GRACE_PERIOD=3600 # 1 hour ELAPSED=0 POLL_COUNT=0 +# Extract payload creation time from the tag (e.g. 4.22.0-0.nightly-2026-02-25-152806) +PAYLOAD_TIMESTAMP=$(echo "${PAYLOAD_TAG}" | grep -oP '\d{4}-\d{2}-\d{2}-\d{6}$') +PAYLOAD_CREATED=$(date -u -d "${PAYLOAD_TIMESTAMP:0:10} ${PAYLOAD_TIMESTAMP:11:2}:${PAYLOAD_TIMESTAMP:13:2}:${PAYLOAD_TIMESTAMP:15:2}" +%s 2>/dev/null || echo "0") + PHASE_WAIT_START=$(date +%s) echo "" -echo "=== Waiting for all blocking jobs to complete before analysis ===" +echo "=== Waiting for blocking jobs to complete (or payload to reach terminal state) ===" echo "Polling every $((POLL_INTERVAL / 60)) minutes (timeout: $((MAX_WAIT / 3600)) hours)" +if [[ "${PAYLOAD_CREATED}" -gt 0 ]]; then + echo "Payload created at: $(date -u -d "@${PAYLOAD_CREATED}" '+%Y-%m-%d %H:%M:%S UTC' 2>/dev/null || echo "${PAYLOAD_TIMESTAMP}")" + echo "Terminal phase trusted after: $((PHASE_GRACE_PERIOD / 60)) minutes from creation" +fi echo "" while true; do @@ -87,11 +98,12 @@ while true; do FAILED=$(echo "${RELEASE_JSON}" | jq '[.results.blockingJobs // {} | to_entries[] | select(.value.state == "Failed")] | length') SUCCEEDED=$(echo "${RELEASE_JSON}" | jq '[.results.blockingJobs // {} | to_entries[] | select(.value.state == "Succeeded")] | length') TOTAL=$(echo "${RELEASE_JSON}" | jq '[.results.blockingJobs // {} | to_entries[]] | length') + PHASE=$(echo "${RELEASE_JSON}" | jq -r '.phase // "Unknown"') # Guard against release controller race: jobs may briefly report as Succeeded before being dispatched NOT_STARTED=$(echo "${RELEASE_JSON}" | jq '[.results.blockingJobs // {} | to_entries[] | select(.value.url == null or .value.url == "")] | length') ELAPSED_MIN=$((ELAPSED / 60)) - echo "[Poll #${POLL_COUNT} | ${ELAPSED_MIN}m elapsed] Blocking jobs: ${SUCCEEDED}/${TOTAL} succeeded, ${PENDING} pending, ${FAILED} failed" + echo "[Poll #${POLL_COUNT} | ${ELAPSED_MIN}m elapsed] Phase: ${PHASE} | Blocking jobs: ${SUCCEEDED}/${TOTAL} succeeded, ${PENDING} pending, ${FAILED} failed" if [[ "${NOT_STARTED}" -gt 0 ]]; then echo " ${NOT_STARTED} job(s) have no prow URL yet, waiting for them to be dispatched..." @@ -125,13 +137,30 @@ _Agent: ${CLAUDE_MODEL}_" fi echo "All blocking jobs have completed. ${FAILED}/${TOTAL} failed. Starting analysis..." break + elif [[ "${PHASE}" == "Accepted" || "${PHASE}" == "Rejected" ]]; then + NOW=$(date +%s) + PAYLOAD_AGE=$(( NOW - PAYLOAD_CREATED )) + if [[ "${PAYLOAD_CREATED}" -gt 0 && "${PAYLOAD_AGE}" -lt "${PHASE_GRACE_PERIOD}" ]]; then + REMAINING=$(( (PHASE_GRACE_PERIOD - PAYLOAD_AGE) / 60 )) + echo " Payload phase is ${PHASE} but payload is only $((PAYLOAD_AGE / 60))m old (grace period: $((PHASE_GRACE_PERIOD / 60))m). Waiting ${REMAINING}m more before trusting phase..." + else + echo "" + echo "Payload reached terminal state (${PHASE}) with ${PENDING} job(s) still pending." + if [[ "${FAILED}" -gt 0 ]]; then + echo "${FAILED}/${TOTAL} blocking jobs failed. Starting analysis..." + break + elif [[ "${PHASE}" == "Accepted" ]]; then + echo "Payload was accepted. No analysis needed." + exit 0 + fi + fi fi ELAPSED=$((ELAPSED + POLL_INTERVAL)) if [[ ${ELAPSED} -ge ${MAX_WAIT} ]]; then echo "" - echo "Timed out after $((MAX_WAIT / 3600)) hours waiting for blocking jobs to complete (${PENDING} still pending)." - exit 1 + echo "ERROR: Timed out after $((MAX_WAIT / 3600)) hours waiting for blocking jobs to complete (${PENDING} still pending). Proceeding to analyze what we have..." + break fi echo " Next check in $((POLL_INTERVAL / 60)) minutes..."