From c0c9bdf3fbbe3e0322728fc2c47f07a8089d9f3f Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Tue, 19 Aug 2025 14:11:00 -0700 Subject: [PATCH 1/3] fix: poll until addon manifestworks have been cleaned up Signed-off-by: Artur Shad Nik --- .gitignore | 4 +- .../internal/controller/spoke.go | 57 +++++++++++++++++-- 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index fdc94e6d..2d4c9aa9 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ coverage.* profile.cov *hub-bundle*.tar.gz *spoke-bundle*.tar.gz +*fleetconfig-support-bundle/ # Dependency directories (remove the comment below to include it) # vendor/ @@ -40,6 +41,7 @@ go.work.sum venv/ *__pycache__/ requirements.txt +.DS_Store # Temp files -tmp/ \ No newline at end of file +tmp/ diff --git a/fleetconfig-controller/internal/controller/spoke.go b/fleetconfig-controller/internal/controller/spoke.go index 4fc4a977..01458e08 100644 --- a/fleetconfig-controller/internal/controller/spoke.go +++ b/fleetconfig-controller/internal/controller/spoke.go @@ -9,11 +9,14 @@ import ( "regexp" "slices" "strings" + "time" certificatesv1 "k8s.io/api/certificates/v1" corev1 "k8s.io/api/core/v1" kerrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + workapi "open-cluster-management.io/api/client/work/clientset/versioned" clusterv1 "open-cluster-management.io/api/cluster/v1" operatorv1 "open-cluster-management.io/api/operator/v1" workv1 "open-cluster-management.io/api/work/v1" @@ -32,8 +35,10 @@ import ( var csrSuffixPattern = regexp.MustCompile(`-[a-zA-Z0-9]{5}$`) const ( - amwExistsError = "you should manually clean them, uninstall kluster will cause those works out of control." - managedClusterAddOn = "ManagedClusterAddOn" + amwExistsError = "you should manually clean them, uninstall kluster will cause those works out of control." + managedClusterAddOn = "ManagedClusterAddOn" + addonCleanupTimeout = 1 * time.Minute + addonCleanupPollInterval = 2 * time.Second ) // handleSpokes manages Spoke cluster join and upgrade operations @@ -599,16 +604,25 @@ func deregisterSpoke(ctx context.Context, kClient client.Client, hubKubeconfig [ // remove addons only after confirming that the cluster can be unjoined - this avoids leaving dangling resources that may rely on the addon if err := handleAddonDisable(ctx, spoke.Name, spoke.EnabledAddons); err != nil { fc.SetConditions(true, v1alpha1.NewCondition( - "AddonsDisabled", spoke.AddonDisableType(), metav1.ConditionFalse, metav1.ConditionTrue, + err.Error(), spoke.AddonDisableType(), metav1.ConditionFalse, metav1.ConditionTrue, )) return err } + + // Wait for addon manifestWorks to be fully cleaned up before proceeding with unjoin if len(spoke.EnabledAddons) > 0 { + if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout); err != nil { + fc.SetConditions(true, v1alpha1.NewCondition( + err.Error(), spoke.AddonDisableType(), metav1.ConditionFalse, metav1.ConditionTrue, + )) + return fmt.Errorf("addon manifestWorks cleanup failed: %w", err) + } fc.SetConditions(true, v1alpha1.NewCondition( "AddonsDisabled", spoke.AddonDisableType(), metav1.ConditionTrue, metav1.ConditionTrue, )) } - // unjoin spoke + + // unjoin spoke - safe to proceed now that addon cleanup is confirmed if err := unjoinSpoke(ctx, kClient, fc, spoke); err != nil { return err } @@ -652,6 +666,41 @@ func allOwnersAddOns(mws []workv1.ManifestWork) bool { return true } +// waitForAddonManifestWorksCleanup polls for addon-related manifestWorks to be removed +// after addon disable operation to avoid race conditions during spoke unjoin +func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Clientset, spokeName string, timeout time.Duration) error { + logger := log.FromContext(ctx) + logger.V(1).Info("waiting for addon manifestWorks cleanup", "spokeName", spokeName, "timeout", timeout) + + err := wait.PollUntilContextTimeout(ctx, addonCleanupPollInterval, timeout, true, func(ctx context.Context) (bool, error) { + manifestWorks, err := workC.WorkV1().ManifestWorks(spokeName).List(ctx, metav1.ListOptions{}) + if err != nil { + logger.V(3).Info("failed to list manifestWorks during cleanup wait", "error", err) + // Return false to continue polling on transient errors + return false, nil + } + + // Success condition: no manifestWorks remaining + if len(manifestWorks.Items) == 0 { + logger.V(1).Info("addon manifestWorks cleanup completed", "spokeName", spokeName, "remainingManifestWorks", len(manifestWorks.Items)) + return true, nil + } + + logger.V(3).Info("waiting for addon manifestWorks cleanup", + "spokeName", spokeName, + "addonManifestWorks", len(manifestWorks.Items)) + + // Continue polling + return false, nil + }) + + if err != nil { + return fmt.Errorf("timeout waiting for addon manifestWorks cleanup for spoke %s: %w", spokeName, err) + } + + return nil +} + // prepareKlusterletValuesFile creates a temporary file with klusterlet values and returns // args to append and a cleanup function. Returns empty slice if values are empty. func prepareKlusterletValuesFile(values *v1alpha1.KlusterletChartConfig) ([]string, func(), error) { From b65f9fe50bae07aff7d9907a729a5a0bd936c552 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Tue, 19 Aug 2025 15:06:24 -0700 Subject: [PATCH 2/3] refactor: check existing mcao when deregistering spoke Signed-off-by: Artur Shad Nik --- .../internal/controller/addon.go | 14 ++++++++- .../internal/controller/spoke.go | 30 ++++++++++++++----- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/fleetconfig-controller/internal/controller/addon.go b/fleetconfig-controller/internal/controller/addon.go index facf6c8f..c17620cc 100644 --- a/fleetconfig-controller/internal/controller/addon.go +++ b/fleetconfig-controller/internal/controller/addon.go @@ -470,7 +470,19 @@ func handleAddonDisable(ctx context.Context, spokeName string, addons []string) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm addon disable' to complete...") if err != nil { out := append(stdout, stderr...) - return fmt.Errorf("failed to disable addons: %v, output: %s", err, string(out)) + outStr := string(out) + + // Check if the error is due to addon not being found or cluster not found - these are success cases + if strings.Contains(outStr, "add-on not found") { + logger.V(1).Info("addon already disabled (not found)", "managedcluster", spokeName, "addons", addons, "output", outStr) + return nil + } + if strings.Contains(outStr, "managedclusters.cluster.open-cluster-management.io") && strings.Contains(outStr, "not found") { + logger.V(1).Info("addon disable skipped (cluster not found)", "managedcluster", spokeName, "addons", addons, "output", outStr) + return nil + } + + return fmt.Errorf("failed to disable addons: %v, output: %s", err, outStr) } logger.V(1).Info("disabled addons", "managedcluster", spokeName, "addons", addons, "output", string(stdout)) return nil diff --git a/fleetconfig-controller/internal/controller/spoke.go b/fleetconfig-controller/internal/controller/spoke.go index 01458e08..72cd393e 100644 --- a/fleetconfig-controller/internal/controller/spoke.go +++ b/fleetconfig-controller/internal/controller/spoke.go @@ -580,6 +580,10 @@ func deregisterSpoke(ctx context.Context, kClient client.Client, hubKubeconfig [ if err != nil { return err } + addonC, err := common.AddOnClient(hubKubeconfig) + if err != nil { + return err + } // skip clean up if the ManagedCluster resource is not found or if any manifestWorks exist managedCluster, err := clusterC.ClusterV1().ManagedClusters().Get(ctx, spoke.Name, metav1.GetOptions{}) if kerrs.IsNotFound(err) { @@ -601,16 +605,26 @@ func deregisterSpoke(ctx context.Context, kClient client.Client, hubKubeconfig [ } - // remove addons only after confirming that the cluster can be unjoined - this avoids leaving dangling resources that may rely on the addon - if err := handleAddonDisable(ctx, spoke.Name, spoke.EnabledAddons); err != nil { - fc.SetConditions(true, v1alpha1.NewCondition( - err.Error(), spoke.AddonDisableType(), metav1.ConditionFalse, metav1.ConditionTrue, - )) - return err + // disable all addons installed on the spoke, and wait for associated resources to be cleaned up + mcao, err := addonC.AddonV1alpha1().ManagedClusterAddOns(spoke.Name).List(ctx, metav1.ListOptions{LabelSelector: v1alpha1.ManagedBySelector.String()}) + if err != nil { + return fmt.Errorf("failed to list managedClusterAddOns for managedCluster %s: %w", managedCluster.Name, err) } + mcaoList := mcao.Items + if len(mcaoList) > 0 { + var addOnsToDisable []string + for _, m := range mcaoList { + addOnsToDisable = append(addOnsToDisable, m.Name) + } + // remove addons only after confirming that the cluster can be unjoined - this avoids leaving dangling resources that may rely on the addon + if err := handleAddonDisable(ctx, spoke.Name, addOnsToDisable); err != nil { + fc.SetConditions(true, v1alpha1.NewCondition( + err.Error(), spoke.AddonDisableType(), metav1.ConditionFalse, metav1.ConditionTrue, + )) + return err + } - // Wait for addon manifestWorks to be fully cleaned up before proceeding with unjoin - if len(spoke.EnabledAddons) > 0 { + // Wait for addon manifestWorks to be fully cleaned up before proceeding with unjoin if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout); err != nil { fc.SetConditions(true, v1alpha1.NewCondition( err.Error(), spoke.AddonDisableType(), metav1.ConditionFalse, metav1.ConditionTrue, From 84fdba57ecbdd44674f7c762ef3cfa018e38e39f Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Tue, 19 Aug 2025 16:31:08 -0700 Subject: [PATCH 3/3] refactor: revert 'check existing mcao when deregistering spoke' Signed-off-by: Artur Shad Nik --- .../internal/controller/addon.go | 4 +- .../internal/controller/spoke.go | 37 ++++++------------- 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/fleetconfig-controller/internal/controller/addon.go b/fleetconfig-controller/internal/controller/addon.go index c17620cc..83ff68a9 100644 --- a/fleetconfig-controller/internal/controller/addon.go +++ b/fleetconfig-controller/internal/controller/addon.go @@ -474,11 +474,11 @@ func handleAddonDisable(ctx context.Context, spokeName string, addons []string) // Check if the error is due to addon not being found or cluster not found - these are success cases if strings.Contains(outStr, "add-on not found") { - logger.V(1).Info("addon already disabled (not found)", "managedcluster", spokeName, "addons", addons, "output", outStr) + logger.V(5).Info("addon already disabled (not found)", "managedcluster", spokeName, "addons", addons, "output", outStr) return nil } if strings.Contains(outStr, "managedclusters.cluster.open-cluster-management.io") && strings.Contains(outStr, "not found") { - logger.V(1).Info("addon disable skipped (cluster not found)", "managedcluster", spokeName, "addons", addons, "output", outStr) + logger.V(5).Info("addon disable skipped (cluster not found)", "managedcluster", spokeName, "addons", addons, "output", outStr) return nil } diff --git a/fleetconfig-controller/internal/controller/spoke.go b/fleetconfig-controller/internal/controller/spoke.go index 72cd393e..ef4c1499 100644 --- a/fleetconfig-controller/internal/controller/spoke.go +++ b/fleetconfig-controller/internal/controller/spoke.go @@ -580,10 +580,7 @@ func deregisterSpoke(ctx context.Context, kClient client.Client, hubKubeconfig [ if err != nil { return err } - addonC, err := common.AddOnClient(hubKubeconfig) - if err != nil { - return err - } + // skip clean up if the ManagedCluster resource is not found or if any manifestWorks exist managedCluster, err := clusterC.ClusterV1().ManagedClusters().Get(ctx, spoke.Name, metav1.GetOptions{}) if kerrs.IsNotFound(err) { @@ -605,25 +602,15 @@ func deregisterSpoke(ctx context.Context, kClient client.Client, hubKubeconfig [ } - // disable all addons installed on the spoke, and wait for associated resources to be cleaned up - mcao, err := addonC.AddonV1alpha1().ManagedClusterAddOns(spoke.Name).List(ctx, metav1.ListOptions{LabelSelector: v1alpha1.ManagedBySelector.String()}) - if err != nil { - return fmt.Errorf("failed to list managedClusterAddOns for managedCluster %s: %w", managedCluster.Name, err) + // remove addons only after confirming that the cluster can be unjoined - this avoids leaving dangling resources that may rely on the addon + if err := handleAddonDisable(ctx, spoke.Name, spoke.EnabledAddons); err != nil { + fc.SetConditions(true, v1alpha1.NewCondition( + err.Error(), spoke.AddonDisableType(), metav1.ConditionFalse, metav1.ConditionTrue, + )) + return err } - mcaoList := mcao.Items - if len(mcaoList) > 0 { - var addOnsToDisable []string - for _, m := range mcaoList { - addOnsToDisable = append(addOnsToDisable, m.Name) - } - // remove addons only after confirming that the cluster can be unjoined - this avoids leaving dangling resources that may rely on the addon - if err := handleAddonDisable(ctx, spoke.Name, addOnsToDisable); err != nil { - fc.SetConditions(true, v1alpha1.NewCondition( - err.Error(), spoke.AddonDisableType(), metav1.ConditionFalse, metav1.ConditionTrue, - )) - return err - } + if len(spoke.EnabledAddons) > 0 { // Wait for addon manifestWorks to be fully cleaned up before proceeding with unjoin if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout); err != nil { fc.SetConditions(true, v1alpha1.NewCondition( @@ -656,14 +643,14 @@ func deregisterSpoke(ctx context.Context, kClient client.Client, hubKubeconfig [ } // remove ManagedCluster - if err = clusterC.ClusterV1().ManagedClusters().Delete(ctx, spoke.Name, metav1.DeleteOptions{}); err != nil && !kerrs.IsNotFound(err) { - return err + if err = clusterC.ClusterV1().ManagedClusters().Delete(ctx, spoke.Name, metav1.DeleteOptions{}); err != nil { + return client.IgnoreNotFound(err) } // remove Namespace ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: spoke.Name}} - if err := kClient.Delete(ctx, ns); err != nil && !kerrs.IsNotFound(err) { - return err + if err := kClient.Delete(ctx, ns); err != nil { + return client.IgnoreNotFound(err) } return nil