diff --git a/ci-operator/step-registry/openshift-qe/hypershift-infra/openshift-qe-hypershift-infra-commands.sh b/ci-operator/step-registry/openshift-qe/hypershift-infra/openshift-qe-hypershift-infra-commands.sh index cdc3a5386db7c..a9c2f5f3fd0ac 100644 --- a/ci-operator/step-registry/openshift-qe/hypershift-infra/openshift-qe-hypershift-infra-commands.sh +++ b/ci-operator/step-registry/openshift-qe/hypershift-infra/openshift-qe-hypershift-infra-commands.sh @@ -46,17 +46,92 @@ function checkForInfraReady() { function rebalanceInfra() { if [[ $1 == "prometheus-k8s" ]] ; then - log "$(date) - Initiate migration of prometheus componenets to infra nodepools" + log "$(date) - Initiate migration of prometheus to infra nodepools" oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s oc get sts prometheus-k8s -n openshift-monitoring - + + log "$(date) - Apply cluster-monitoring-config to move prometheus to infra nodes" + # Note: Fresh ROSA HCP clusters don't have cluster-monitoring-config by default. + # Safe to create/replace for single-use CI clusters. If running against reused + # clusters with existing monitoring config, this would overwrite retention/resources. + # Only moving prometheusK8s as it's the resource-intensive component; other monitoring + # components consume minimal resources and can remain on worker nodes. + cat << 'EOF' | oc apply -f - +apiVersion: v1 +kind: ConfigMap +metadata: + name: cluster-monitoring-config + namespace: openshift-monitoring +data: + config.yaml: |+ + prometheusK8s: + nodeSelector: + node-role.kubernetes.io/infra: "" + tolerations: + - effect: "NoSchedule" + key: "node-role.kubernetes.io/infra" + operator: "Exists" +EOF + + log "$(date) - Wait for cluster-monitoring-operator to reconcile the configuration" + RECONCILED=false + for i in {1..30}; do + if oc get sts prometheus-k8s -n openshift-monitoring -o json | \ + jq -e '.spec.template.spec.nodeSelector["node-role.kubernetes.io/infra"] == "" and any(.spec.template.spec.tolerations[]?; .key == "node-role.kubernetes.io/infra" and .operator == "Exists" and .effect == "NoSchedule")' >/dev/null; then + RECONCILED=true + log "$(date) - StatefulSet reconciled with infra nodeSelector and tolerations" + break + fi + [[ $((i % 6)) -eq 0 ]] && log "$(date) - Still waiting for reconciliation... ($i/30)" + sleep 10 + done + if [[ "${RECONCILED}" != "true" ]]; then + log "$(date) - ERROR: cluster-monitoring-operator did not update prometheus-k8s placement" + log "Current StatefulSet spec:" + oc get sts prometheus-k8s -n openshift-monitoring -o json | jq '.spec.template.spec | {nodeSelector, tolerations}' + exit 1 + fi + log "$(date) - Restart stateful set pods" echo "rollout restart -n openshift-monitoring statefulset/prometheus-k8s" - oc rollout restart -n openshift-monitoring statefulset/prometheus-k8s - + oc rollout restart -n openshift-monitoring statefulset/prometheus-k8s + log "$(date) - Wait till they are completely restarted" oc rollout status -n openshift-monitoring statefulset/prometheus-k8s - + + log "$(date) - Verify prometheus pods are running on infra nodes" + # Wait up to 2 minutes for pods to be scheduled on infra nodes + RETRY=0 + MAX_RETRIES=12 + VERIFY_SUCCESS=false + while [ $RETRY -lt $MAX_RETRIES ]; do + ALL_ON_INFRA=true + for node in $(oc get pods -n openshift-monitoring -o wide | grep -i "prometheus-k8s-" | grep -i running | awk '{print$7}'); do + if [[ $(oc get nodes --no-headers -l node-role.kubernetes.io/infra | awk '{print$1}' | grep -w "$node") != "" ]]; then + log "$(date) - prometheus pod on $node (infra node) ✓" + else + log "$(date) - WARNING: prometheus pod on $node is NOT an infra node" + ALL_ON_INFRA=false + fi + done + + if [ "$ALL_ON_INFRA" = true ]; then + log "$(date) - All prometheus-k8s pods are on infra nodes ✓" + VERIFY_SUCCESS=true + break + else + RETRY=$((RETRY+1)) + log "$(date) - Retry $RETRY/$MAX_RETRIES: Waiting for prometheus pods to move to infra nodes..." + sleep 10 + fi + done + + if [ "$VERIFY_SUCCESS" = false ]; then + log "$(date) - ERROR: Prometheus pods failed to move to infra nodes after $MAX_RETRIES attempts" + oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s + exit 1 + fi + log "$(date) - Check pods status again and the hosting nodes" oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s else