From d237fe79aac555d44a2ba627a202a3f2c168789d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Sat, 11 Oct 2025 11:04:46 +0200 Subject: [PATCH 1/2] Fixed OpenShift cluster cleaner --- docker/cluster-cleaner/Dockerfile | 2 +- docker/cluster-cleaner/Makefile | 4 +- .../scripts/clean-failed-namespaces.sh | 23 +++++---- .../scripts/clean-ops-manager.sh | 11 ----- .../scripts/construction-site.sh | 9 ---- .../scripts/delete-old-builder-pods.sh | 23 --------- docker/cluster-cleaner/templates/job.yaml | 28 ----------- .../templates/ops_manager_cleaner_job.yaml | 48 ------------------- 8 files changed, 18 insertions(+), 130 deletions(-) delete mode 100755 docker/cluster-cleaner/scripts/clean-ops-manager.sh delete mode 100755 docker/cluster-cleaner/scripts/construction-site.sh delete mode 100755 docker/cluster-cleaner/scripts/delete-old-builder-pods.sh delete mode 100644 docker/cluster-cleaner/templates/ops_manager_cleaner_job.yaml diff --git a/docker/cluster-cleaner/Dockerfile b/docker/cluster-cleaner/Dockerfile index 0fcb72fe2..35c9d0c27 100644 --- a/docker/cluster-cleaner/Dockerfile +++ b/docker/cluster-cleaner/Dockerfile @@ -1,6 +1,6 @@ FROM python:3-slim-buster -ADD https://storage.googleapis.com/kubernetes-release/release/v1.13.3/bin/linux/amd64/kubectl /usr/bin +ADD https://dl.k8s.io/release/v1.34.0/bin/linux/amd64/kubectl /usr/bin RUN chmod +x /usr/bin/kubectl COPY scripts/* / diff --git a/docker/cluster-cleaner/Makefile b/docker/cluster-cleaner/Makefile index 630a4688c..d4bfeceb5 100644 --- a/docker/cluster-cleaner/Makefile +++ b/docker/cluster-cleaner/Makefile @@ -1,4 +1,4 @@ -IMAGE_VERSION=0.15 +IMAGE_VERSION=0.18 .PHONY: all all: build push install @@ -18,7 +18,7 @@ install: build push kubectl create namespace cluster-cleaner || true helm template . \ --set cleanerVersion=$(IMAGE_VERSION) \ - --set namespace=cluster-cleaner\ + --set namespace=cluster-cleaner \ --set cleanerNamespace=cluster-cleaner > cluster-cleaner.yaml kubectl apply -f cluster-cleaner.yaml rm cluster-cleaner.yaml diff --git a/docker/cluster-cleaner/scripts/clean-failed-namespaces.sh b/docker/cluster-cleaner/scripts/clean-failed-namespaces.sh index 6b3f3b0c8..6fc607021 100755 --- a/docker/cluster-cleaner/scripts/clean-failed-namespaces.sh +++ b/docker/cluster-cleaner/scripts/clean-failed-namespaces.sh @@ -1,21 +1,24 @@ #!/usr/bin/env sh +touch error.log +tail -F error.log & + delete_resources_safely() { resource_type="$1" namespace="$2" echo "Attempting normal deletion of $resource_type in $namespace..." - kubectl delete "${resource_type}" --all -n "${namespace}" --wait=true --timeout=10s || true + kubectl delete "${resource_type}" --all -n "${namespace}" --wait=true --timeout=10s 2>error.log || true # Check if any resources are still stuck # Let's not fail here and continue deletion - resources=$(kubectl get "$resource_type" -n "${namespace}" --no-headers -o custom-columns=":metadata.name" 2>/dev/null || true) + resources=$(kubectl get "$resource_type" -n "${namespace}" --no-headers -o custom-columns=":metadata.name" 2>error.log || true) for resource in ${resources}; do echo "${resource_type}/${resource} is still present, force deleting..."
- kubectl patch "${resource_type}" "${resource}" -n "${namespace}" -p '{"metadata":{"finalizers":null}}' --type=merge || true - kubectl delete "${resource_type}" "${resource}" -n "${namespace}" --force --grace-period=0 || true + kubectl patch "${resource_type}" "${resource}" -n "${namespace}" -p '{"metadata":{"finalizers":null}}' --type=merge 2>error.log || true + kubectl delete "${resource_type}" "${resource}" -n "${namespace}" --force --grace-period=0 2>error.log || true done } @@ -29,11 +32,12 @@ if [ -z ${LABELS+x} ]; then exit 1 fi + echo "Deleting namespaces for evg tasks that are older than ${DELETE_OLDER_THAN_AMOUNT} ${DELETE_OLDER_THAN_UNIT} with label ${LABELS}" echo "Which are:" kubectl get namespace -l "${LABELS}" -o name -for namespace in $(kubectl get namespace -l "${LABELS}" -o name); do - creation_time=$(kubectl get "${namespace}" -o jsonpath='{.metadata.creationTimestamp}' 2>/dev/null || echo "") +for namespace in $(kubectl get namespace -l "${LABELS}" -o name 2>error.log); do + creation_time=$(kubectl get "${namespace}" -o jsonpath='{.metadata.creationTimestamp}' 2>error.log || echo "") if [ -z "$creation_time" ]; then echo "Namespace ${namespace} does not exist or has no creation timestamp, skipping." @@ -49,14 +53,17 @@ for namespace in $(kubectl get namespace -l "${LABELS}" -o name); do echo "Deleting ${namespace_name}" - csrs_in_namespace=$(kubectl get csr -o name | grep "${namespace_name}" || true) + csrs_in_namespace=$(kubectl get csr -o name 2>error.log | grep "${namespace_name}" 2>/dev/null || true) if [ -n "${csrs_in_namespace}" ]; then - kubectl delete "${csrs_in_namespace}" + kubectl delete "${csrs_in_namespace}" 2>error.log fi delete_resources_safely "mdb" "${namespace_name}" delete_resources_safely "mdbu" "${namespace_name}" + delete_resources_safely "mdbc" "${namespace_name}" + delete_resources_safely "mdbmc" "${namespace_name}" delete_resources_safely "om" "${namespace_name}" + delete_resources_safely "clustermongodbroles" "${namespace_name}" echo "Attempting to delete namespace: ${namespace_name}" diff --git a/docker/cluster-cleaner/scripts/clean-ops-manager.sh b/docker/cluster-cleaner/scripts/clean-ops-manager.sh deleted file mode 100755 index 674700c4c..000000000 --- a/docker/cluster-cleaner/scripts/clean-ops-manager.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env sh - -if [ -z "${OM_NAMESPACE}" ]; then - echo "OM_NAMESPACE env variable is not specified"; - exit 1 -fi - -echo "Removing Ops Manager in ${OM_NAMESPACE}" - -kubectl --namespace "${OM_NAMESPACE}" delete om ops-manager - diff --git a/docker/cluster-cleaner/scripts/construction-site.sh b/docker/cluster-cleaner/scripts/construction-site.sh deleted file mode 100755 index 96aacd21a..000000000 --- a/docker/cluster-cleaner/scripts/construction-site.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -# -# Builds the `construction-site` namespace and docker config configmap -# - -kubectl create namespace construction-site || true - -kubectl -n construction-site create configmap docker-config --from-literal=config.json='{"credHelpers":{"268558157000.dkr.ecr.us-east-1.amazonaws.com":"ecr-login"}}' diff --git a/docker/cluster-cleaner/scripts/delete-old-builder-pods.sh b/docker/cluster-cleaner/scripts/delete-old-builder-pods.sh deleted file mode 100755 index ed1f39763..000000000 --- a/docker/cluster-cleaner/scripts/delete-old-builder-pods.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash - -NAMESPACE=construction-site - -if [ -z ${DELETE_OLDER_THAN_AMOUNT+x} ] || [ -z 
${DELETE_OLDER_THAN_UNIT+x} ]; then - echo "Need to set both 'DELETE_OLDER_THAN_AMOUNT' and 'DELETE_OLDER_THAN_UNIT' environment variables." - exit 1 -fi - -for pod in $(kubectl -n ${NAMESPACE} get pods -o name); do - creation_time=$(kubectl -n ${NAMESPACE} get "${pod}" -o jsonpath='{.metadata.creationTimestamp}') - status=$(kubectl get "${pod}" -o jsonpath='{.status.phase}' -n "${NAMESPACE}") - - if [[ "${status}" != "Succeeded" ]] && [[ "${status}" != "Failed" ]]; then - # we don't remove pending tasks - continue - fi - - if ! ./is_older_than.py "${creation_time}" "${DELETE_OLDER_THAN_AMOUNT}" "${DELETE_OLDER_THAN_UNIT}"; then - continue - fi - kubectl -n ${NAMESPACE} delete "${pod}" -done diff --git a/docker/cluster-cleaner/templates/job.yaml b/docker/cluster-cleaner/templates/job.yaml index b131e75fb..d75e6d80a 100644 --- a/docker/cluster-cleaner/templates/job.yaml +++ b/docker/cluster-cleaner/templates/job.yaml @@ -84,34 +84,6 @@ spec: - name: LABELS value: "evg=task" -# Clean old builder pods ---- -apiVersion: batch/v1 -kind: CronJob -metadata: - name: cluster-cleaner-delete-builder-pods - namespace: {{ .Values.cleanerNamespace }} -spec: - # Runs every hour - schedule: "0 * * * *" - jobTemplate: - spec: - template: - spec: - serviceAccountName: cluster-cleaner - restartPolicy: Never - - containers: - - name: cluster-cleaner - image: 268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/cluster-cleaner:{{ .Values.cleanerVersion }} - imagePullPolicy: Always - command: ["./delete-old-builder-pods.sh"] - env: - - name: DELETE_OLDER_THAN_UNIT - value: "minutes" - - name: DELETE_OLDER_THAN_AMOUNT - value: "20" - # Clean old certificates --- apiVersion: batch/v1 diff --git a/docker/cluster-cleaner/templates/ops_manager_cleaner_job.yaml b/docker/cluster-cleaner/templates/ops_manager_cleaner_job.yaml deleted file mode 100644 index 05509344a..000000000 --- a/docker/cluster-cleaner/templates/ops_manager_cleaner_job.yaml +++ /dev/null @@ -1,48 +0,0 @@ -{{ if .Values.namespace }} ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: ops-manager-cleaner - namespace: {{ .Values.namespace }} - ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: ops-manager-cleaner - namespace: {{ .Values.namespace }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: cluster-admin -subjects: -- kind: ServiceAccount - name: ops-manager-cleaner - namespace: {{ .Values.namespace }} - ---- -apiVersion: batch/v1 -kind: CronJob -metadata: - name: cluster-cleaner-ops-manager - namespace: {{ .Values.namespace }} -spec: - # Run at 3:00 am every day. 
- schedule: "0 3 * * *" - jobTemplate: - spec: - template: - spec: - serviceAccountName: ops-manager-cleaner - restartPolicy: Never - - containers: - - name: cluster-cleaner - image: 268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/cluster-cleaner:{{ .Values.cleanerVersion }} - imagePullPolicy: Always - command: ["./clean-ops-manager.sh"] - env: - - name: OM_NAMESPACE - value: {{ .Values.namespace }} -{{ end }} From 63d76690a3ea3acd899c0c09f4f78b96b791703c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Sat, 11 Oct 2025 11:19:17 +0200 Subject: [PATCH 2/2] Fixed cleaning up namespaces in openshift --- scripts/evergreen/e2e/e2e.sh | 2 +- scripts/funcs/kubernetes | 46 +++++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/scripts/evergreen/e2e/e2e.sh b/scripts/evergreen/e2e/e2e.sh index e1e6ed556..81b4e7d0b 100755 --- a/scripts/evergreen/e2e/e2e.sh +++ b/scripts/evergreen/e2e/e2e.sh @@ -123,7 +123,7 @@ echo "TEST_NAME is set to: ${TEST_NAME}" delete_operator "${NAMESPACE}" -# We'll have the task running for the alloca ted time, minus the time it took us +# We'll have the task running for the allocated time, minus the time it took us # to get all the way here, assuming configuring and deploying the operator can # take a bit of time. This is needed because Evergreen kills the process *AND* # Docker containers running on the host when it hits a timeout. Under these diff --git a/scripts/funcs/kubernetes b/scripts/funcs/kubernetes index 11250422d..567c43deb 100644 --- a/scripts/funcs/kubernetes +++ b/scripts/funcs/kubernetes @@ -98,7 +98,7 @@ create_image_registries_secret() { context=$1 namespace=$2 secret_name=$3 - + # Detect the correct config file path based on container runtime local config_file local temp_config_file="" @@ -106,7 +106,7 @@ create_image_registries_secret() { # For Podman, use root's auth.json since minikube uses sudo podman config_file="/root/.config/containers/auth.json" echo "Using Podman config: ${config_file}" - + # Create a temporary copy that the current user can read temp_config_file=$(mktemp) sudo cp "${config_file}" "${temp_config_file}" @@ -117,7 +117,7 @@ create_image_registries_secret() { config_file="${HOME}/.docker/config.json" echo "Using Docker config: ${config_file}" fi - + # shellcheck disable=SC2154 if kubectl --context "${context}" get namespace "${namespace}"; then kubectl --context "${context}" -n "${namespace}" delete secret "${secret_name}" --ignore-not-found @@ -127,7 +127,7 @@ create_image_registries_secret() { else echo "Skipping creating pull secret in ${context}/${namespace}. The namespace doesn't exist yet." fi - + # Clean up temporary file if [[ -n "${temp_config_file}" ]] && [[ -f "${temp_config_file}" ]]; then rm -f "${temp_config_file}" @@ -156,6 +156,26 @@ create_image_registries_secret() { fi } +force_delete_all_resources_from_namespace() { + resource_type="$1" + namespace="$2" + + echo "Attempting normal deletion of ${resource_type} in ${namespace}..." 
+ kubectl delete "${resource_type}" --all -n "${namespace}" --wait=true --timeout=10s || true + + # Check if any resources are still stuck + echo "Checking if any resources are still stuck:" + kubectl get "${resource_type}" -n "${namespace}" --no-headers -o custom-columns=":metadata.name" || true + resources=$(kubectl get "${resource_type}" -n "${namespace}" --no-headers -o custom-columns=":metadata.name" 2>/dev/null || true) + + for resource in ${resources}; do + echo "${resource_type}/${resource} is still present, force deleting..." + + kubectl patch "${resource_type}" "${resource}" -n "${namespace}" -p '{"metadata":{"finalizers":null}}' --type=merge || true + kubectl delete "${resource_type}" "${resource}" -n "${namespace}" --force --grace-period=0 || true + done +} + reset_namespace() { context=$1 namespace=$2 @@ -166,19 +186,23 @@ reset_namespace() { set +e - helm uninstall --kube-context="${context}" mongodb-kubernetes-operator || true & - helm uninstall --kube-context="${context}" mongodb-kubernetes-operator-multi-cluster || true & - # Cleans the namespace. Note, that fine-grained cleanup is performed instead of just deleting the namespace as it takes # considerably less time title "Cleaning Kubernetes resources in context: ${context}" ensure_namespace "${namespace}" - kubectl delete --context "${context}" mdb --all -n "${namespace}" || true - kubectl delete --context "${context}" mdbu --all -n "${namespace}" || true - kubectl delete --context "${context}" mdbmc --all -n "${namespace}" || true - kubectl delete --context "${context}" om --all -n "${namespace}" || true + force_delete_all_resources_from_namespace "mdb" "${namespace}" + force_delete_all_resources_from_namespace "mdbu" "${namespace}" + force_delete_all_resources_from_namespace "mdbc" "${namespace}" + force_delete_all_resources_from_namespace "mdbmc" "${namespace}" + force_delete_all_resources_from_namespace "om" "${namespace}" + force_delete_all_resources_from_namespace "clustermongodbroles" "${namespace}" + + echo "Sleeping to allow the operator to perform cleanups" + sleep 10 + helm uninstall --kube-context="${context}" mongodb-kubernetes-operator || true & + helm uninstall --kube-context="${context}" mongodb-kubernetes-operator-multi-cluster || true & # Openshift variant runs all tests sequentially. In order to avoid clashes between tests, we need to wait till # the namespace is gone. This trigger OpenShift Project deletion, which is a "Namespace on Steroids" and it takes