Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,92 @@ function checkForInfraReady() {

function rebalanceInfra() {
if [[ $1 == "prometheus-k8s" ]] ; then
log "$(date) - Initiate migration of prometheus componenets to infra nodepools"
log "$(date) - Initiate migration of prometheus to infra nodepools"
oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s
oc get sts prometheus-k8s -n openshift-monitoring


log "$(date) - Apply cluster-monitoring-config to move prometheus to infra nodes"
# Note: Fresh ROSA HCP clusters don't have cluster-monitoring-config by default.
# Safe to create/replace for single-use CI clusters. If running against reused
# clusters with existing monitoring config, this would overwrite retention/resources.
# Only moving prometheusK8s as it's the resource-intensive component; other monitoring
# components consume minimal resources and can remain on worker nodes.
cat << 'EOF' | oc apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: cluster-monitoring-config
namespace: openshift-monitoring
data:
config.yaml: |+
prometheusK8s:
nodeSelector:
node-role.kubernetes.io/infra: ""
tolerations:
- effect: "NoSchedule"
key: "node-role.kubernetes.io/infra"
operator: "Exists"
EOF
Comment thread
Sandeepyadav93 marked this conversation as resolved.

log "$(date) - Wait for cluster-monitoring-operator to reconcile the configuration"
RECONCILED=false
for i in {1..30}; do
if oc get sts prometheus-k8s -n openshift-monitoring -o json | \
jq -e '.spec.template.spec.nodeSelector["node-role.kubernetes.io/infra"] == "" and any(.spec.template.spec.tolerations[]?; .key == "node-role.kubernetes.io/infra" and .operator == "Exists" and .effect == "NoSchedule")' >/dev/null; then
RECONCILED=true
log "$(date) - StatefulSet reconciled with infra nodeSelector and tolerations"
break
fi
[[ $((i % 6)) -eq 0 ]] && log "$(date) - Still waiting for reconciliation... ($i/30)"
sleep 10
done
if [[ "${RECONCILED}" != "true" ]]; then
log "$(date) - ERROR: cluster-monitoring-operator did not update prometheus-k8s placement"
log "Current StatefulSet spec:"
oc get sts prometheus-k8s -n openshift-monitoring -o json | jq '.spec.template.spec | {nodeSelector, tolerations}'
exit 1
fi

log "$(date) - Restart stateful set pods"
echo "rollout restart -n openshift-monitoring statefulset/prometheus-k8s"
oc rollout restart -n openshift-monitoring statefulset/prometheus-k8s
oc rollout restart -n openshift-monitoring statefulset/prometheus-k8s

log "$(date) - Wait till they are completely restarted"
oc rollout status -n openshift-monitoring statefulset/prometheus-k8s


log "$(date) - Verify prometheus pods are running on infra nodes"
# Wait up to 2 minutes for pods to be scheduled on infra nodes
RETRY=0
MAX_RETRIES=12
VERIFY_SUCCESS=false
while [ $RETRY -lt $MAX_RETRIES ]; do
ALL_ON_INFRA=true
for node in $(oc get pods -n openshift-monitoring -o wide | grep -i "prometheus-k8s-" | grep -i running | awk '{print$7}'); do
if [[ $(oc get nodes --no-headers -l node-role.kubernetes.io/infra | awk '{print$1}' | grep -w "$node") != "" ]]; then
log "$(date) - prometheus pod on $node (infra node) ✓"
else
log "$(date) - WARNING: prometheus pod on $node is NOT an infra node"
ALL_ON_INFRA=false
fi
done

if [ "$ALL_ON_INFRA" = true ]; then
log "$(date) - All prometheus-k8s pods are on infra nodes ✓"
VERIFY_SUCCESS=true
break
else
RETRY=$((RETRY+1))
log "$(date) - Retry $RETRY/$MAX_RETRIES: Waiting for prometheus pods to move to infra nodes..."
sleep 10
fi
done

if [ "$VERIFY_SUCCESS" = false ]; then
log "$(date) - ERROR: Prometheus pods failed to move to infra nodes after $MAX_RETRIES attempts"
oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s
exit 1
fi

log "$(date) - Check pods status again and the hosting nodes"
oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s
else
Expand Down