From 90e5d7b6d98a2191efc202ed38009de3497c162a Mon Sep 17 00:00:00 2001
From: Artur Shad Nik
Date: Wed, 24 Sep 2025 22:19:54 -0700
Subject: [PATCH 01/62] refactor: fleetconfig-controller as an addon

Signed-off-by: Artur Shad Nik
---
 .../api/v1beta1/constants.go                  |  42 +-
 .../api/v1beta1/spoke_types.go                |   5 +
 fleetconfig-controller/build/Dockerfile.base  |   2 +-
 .../build/Dockerfile.devspace                 |   2 +-
 fleetconfig-controller/build/Dockerfile.eks   |   2 +-
 fleetconfig-controller/build/Dockerfile.gke   |   2 +-
 ...gement.io_clustermanagementaddons.crd.yaml | 634 +++++++++++
 ...anagement.io_managedclusteraddons.crd.yaml | 405 ++++++++
 ...agement.io_addondeploymentconfigs.crd.yaml | 279 +++++
 ...ster-management.io_addontemplates.crd.yaml | 594 +++++++++++
 .../templates/clusterissuer.yaml              |   2 +
 .../templates/deployment.yaml                 |  11 +-
 .../ocm/{add-ons.yaml => custom-addons.yaml}  |   0
 .../ocm/fcc-addon/addon-template.yaml         | 192 ++++
 .../fcc-addon/cluster-management-addon.yaml   |  23 +
 .../ocm/fcc-addon/cluster-role-binding.yaml   |  13 +
 fleetconfig-controller/cmd/main.go            | 149 +--
 fleetconfig-controller/cmd/manager/manager.go | 290 ++++++
 .../config/devspace/{ => hub}/manager.yaml    |   5 +
 .../config/devspace/spoke/manager.yaml        |  56 +
 fleetconfig-controller/devspace.yaml          |  65 +-
 .../dev/devspace-start-hub.sh}                |   0
 .../hack/dev/devspace-start-spoke.sh          |  41 +
 fleetconfig-controller/hack/install_crds.sh   |   1 +
 .../internal/controller/v1beta1/addon.go      |  58 +-
 .../controller/v1beta1/spoke_controller.go    | 772 +-------------
 .../controller/v1beta1/spoke_handler.go       | 981 ++++++++++++++++++
 .../internal/webhook/v1beta1/validation.go    |  24 +-
 .../webhook/v1beta1/validation_test.go        |  11 +-
 29 files changed, 3779 insertions(+), 882 deletions(-)
 create mode 100644 fleetconfig-controller/charts/fleetconfig-controller/crds/0000_00_addon.open-cluster-management.io_clustermanagementaddons.crd.yaml
 create mode 100644 fleetconfig-controller/charts/fleetconfig-controller/crds/0000_01_addon.open-cluster-management.io_managedclusteraddons.crd.yaml
 create mode 100644 fleetconfig-controller/charts/fleetconfig-controller/crds/0000_02_addon.open-cluster-management.io_addondeploymentconfigs.crd.yaml
 create mode 100644 fleetconfig-controller/charts/fleetconfig-controller/crds/0000_03_addon.open-cluster-management.io_addontemplates.crd.yaml
 rename fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/{add-ons.yaml => custom-addons.yaml} (100%)
 create mode 100644 fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml
 create mode 100644 fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml
 create mode 100644 fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml
 create mode 100644 fleetconfig-controller/cmd/manager/manager.go
 rename fleetconfig-controller/config/devspace/{ => hub}/manager.yaml (89%)
 create mode 100644 fleetconfig-controller/config/devspace/spoke/manager.yaml
 rename fleetconfig-controller/{devspace-start.sh => hack/dev/devspace-start-hub.sh} (100%)
 create mode 100755 fleetconfig-controller/hack/dev/devspace-start-spoke.sh
 create mode 100644 fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go

diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go
index ac7079b3..7fa82b0f 100644
--- a/fleetconfig-controller/api/v1beta1/constants.go
+++ b/fleetconfig-controller/api/v1beta1/constants.go
@@ -3,10 +3,13 @@ package v1beta1
import "k8s.io/apimachinery/pkg/labels" const ( - // HubCleanupFinalizer is the finalizer for Hub cleanup. + // HubCleanupPreflightFinalizer is the finalizer for cleanup preflight checks hub cluster's controller instance. Used to signal to the spoke's controller than unjoin can proceed. + HubCleanupPreflightFinalizer = "fleetconfig.open-cluster-management.io/hub-cleanup-preflight" + + // HubCleanupFinalizer is the finalizer for cleanup by the hub cluster's controller instance. HubCleanupFinalizer = "fleetconfig.open-cluster-management.io/hub-cleanup" - // SpokeCleanupFinalizer is the finalizer for Spoke cleanup. + // SpokeCleanupFinalizer is the finalizer for cleanup by the spoke cluster's controller instance. SpokeCleanupFinalizer = "fleetconfig.open-cluster-management.io/spoke-cleanup" ) @@ -65,6 +68,41 @@ const ( ManagedClusterTypeHubAsSpoke = "hub-as-spoke" ) +const ( + // ClusterTypeHub indicates that the controller is running in a Hub cluster. + ClusterTypeHub = "hub" + + // ClusterTypeSpoke indicates that the controller is running in a Spoke cluster. + ClusterTypeSpoke = "spoke" + + // HubKubeconfigEnvVar is the environment variable containing the path to the mounted Hub kubeconfig. + HubKubeconfigEnvVar = "HUB_KUBECONFIG" + + // HubKubeconfigFallbackPath is the path of the mounted kubeconfig when the controller is running in a Spoke cluster. Used if the environment variable is not set. + HubKubeconfigFallbackPath = "/managed/hub-kubeconfig/kubeconfig" + + // SpokeNameEnvVar is the environment variable containing the name of the Spoke resource. + SpokeNameEnvVar = "CLUSTER_NAME" + + // SpokeNamespaceEnvVar is the environment variable containing the namespace of the Spoke resource. + SpokeNamespaceEnvVar = "CLUSTER_NAMESPACE" + + // HubNamespaceEnvVar is the environment variable containing the namespace of the Spoke resource. + HubNamespaceEnvVar = "HUB_NAMESPACE" + + // ControllerNamespaceEnvVar is the environment variable containing the namespace that the controller is deployed to. + ControllerNamespaceEnvVar = "CONTROLLER_NAMESPACE" + + // FCCAddOnName is the name of the fleetconfig-controller-addon + FCCAddOnName = "fleetconfig-controller-manager" +) + +// SupportedClusterTypes are the valid cluster types that the controller can be installed in. +var SupportedClusterTypes = []string{ + ClusterTypeHub, + ClusterTypeSpoke, +} + // FleetConfig labels const ( // LabelManagedClusterType is the label key for the managed cluster type. diff --git a/fleetconfig-controller/api/v1beta1/spoke_types.go b/fleetconfig-controller/api/v1beta1/spoke_types.go index a7e880b2..b496566d 100644 --- a/fleetconfig-controller/api/v1beta1/spoke_types.go +++ b/fleetconfig-controller/api/v1beta1/spoke_types.go @@ -98,6 +98,11 @@ func (s *Spoke) IsManagedBy(om metav1.ObjectMeta) bool { return s.Spec.HubRef.Name == om.Name && s.Spec.HubRef.Namespace == om.Namespace } +// IsHubAsSpoke returns true if the cluster is a hub-as-spoke. Determined either by name `hub-as-spoke` or an InCluster kubeconfig +func (s *Spoke) IsHubAsSpoke() bool { + return s.Name == ManagedClusterTypeHubAsSpoke || s.Spec.Kubeconfig.InCluster +} + // Klusterlet is the configuration for a klusterlet. type Klusterlet struct { // Annotations to apply to the spoke cluster. If not present, the 'agent.open-cluster-management.io/' prefix is added to each key. 
diff --git a/fleetconfig-controller/build/Dockerfile.base b/fleetconfig-controller/build/Dockerfile.base index adf00d89..83972aa5 100644 --- a/fleetconfig-controller/build/Dockerfile.base +++ b/fleetconfig-controller/build/Dockerfile.base @@ -22,7 +22,7 @@ COPY go.sum go.sum RUN go mod download # Copy the go source -COPY cmd/main.go cmd/main.go +COPY cmd/ cmd/ COPY api/ api/ COPY internal/ internal/ COPY pkg/ pkg/ diff --git a/fleetconfig-controller/build/Dockerfile.devspace b/fleetconfig-controller/build/Dockerfile.devspace index 43f250bb..2e198134 100644 --- a/fleetconfig-controller/build/Dockerfile.devspace +++ b/fleetconfig-controller/build/Dockerfile.devspace @@ -52,7 +52,7 @@ COPY go.sum go.sum RUN go mod download # Copy the go source -COPY cmd/main.go cmd/main.go +COPY cmd/ cmd/ COPY api/ api/ COPY internal/ internal/ COPY pkg/ pkg/ diff --git a/fleetconfig-controller/build/Dockerfile.eks b/fleetconfig-controller/build/Dockerfile.eks index de1efc3e..ce1db588 100644 --- a/fleetconfig-controller/build/Dockerfile.eks +++ b/fleetconfig-controller/build/Dockerfile.eks @@ -22,7 +22,7 @@ COPY go.sum go.sum RUN go mod download # Copy the go source -COPY cmd/main.go cmd/main.go +COPY cmd/ cmd/ COPY api/ api/ COPY internal/ internal/ COPY pkg/ pkg/ diff --git a/fleetconfig-controller/build/Dockerfile.gke b/fleetconfig-controller/build/Dockerfile.gke index 3ca29b86..1b2b2e9f 100644 --- a/fleetconfig-controller/build/Dockerfile.gke +++ b/fleetconfig-controller/build/Dockerfile.gke @@ -21,7 +21,7 @@ COPY go.sum go.sum RUN go mod download # Copy the go source -COPY cmd/main.go cmd/main.go +COPY cmd/ cmd/ COPY api/ api/ COPY internal/ internal/ COPY pkg/ pkg/ diff --git a/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_00_addon.open-cluster-management.io_clustermanagementaddons.crd.yaml b/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_00_addon.open-cluster-management.io_clustermanagementaddons.crd.yaml new file mode 100644 index 00000000..d249d7df --- /dev/null +++ b/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_00_addon.open-cluster-management.io_clustermanagementaddons.crd.yaml @@ -0,0 +1,634 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: clustermanagementaddons.addon.open-cluster-management.io +spec: + group: addon.open-cluster-management.io + names: + kind: ClusterManagementAddOn + listKind: ClusterManagementAddOnList + plural: clustermanagementaddons + shortNames: + - cma + - cmas + singular: clustermanagementaddon + preserveUnknownFields: false + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.addOnMeta.displayName + name: DISPLAY NAME + type: string + - jsonPath: .spec.addOnConfiguration.crdName + name: CRD NAME + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ClusterManagementAddOn represents the registration of an add-on to the cluster manager. + This resource allows you to discover which add-ons are available for the cluster manager + and provides metadata information about the add-ons. The ClusterManagementAddOn name is used + for the namespace-scoped ManagedClusterAddOn resource. + ClusterManagementAddOn is a cluster-scoped resource. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec represents a desired configuration for the agent on + the cluster management add-on. + properties: + addOnConfiguration: + description: |- + Deprecated: Use supportedConfigs filed instead + addOnConfiguration is a reference to configuration information for the add-on. + In scenario where a multiple add-ons share the same add-on CRD, multiple ClusterManagementAddOn + resources need to be created and reference the same AddOnConfiguration. + properties: + crName: + description: |- + crName is the name of the CR used to configure instances of the managed add-on. + This field should be configured if add-on CR have a consistent name across the all of the ManagedCluster instaces. + type: string + crdName: + description: |- + crdName is the name of the CRD used to configure instances of the managed add-on. + This field should be configured if the add-on have a CRD that controls the configuration of the add-on. + type: string + lastObservedGeneration: + description: lastObservedGeneration is the observed generation + of the custom resource for the configuration of the addon. + format: int64 + type: integer + type: object + addOnMeta: + description: addOnMeta is a reference to the metadata information + for the add-on. + properties: + description: + description: description represents the detailed description of + the add-on. + type: string + displayName: + description: displayName represents the name of add-on that will + be displayed. + type: string + type: object + installStrategy: + default: + type: Manual + description: |- + InstallStrategy represents that related ManagedClusterAddOns should be installed + on certain clusters. + properties: + placements: + description: |- + Placements is a list of placement references honored when install strategy type is + Placements. All clusters selected by these placements will install the addon + If one cluster belongs to multiple placements, it will only apply the strategy defined + later in the order. That is to say, The latter strategy overrides the previous one. + items: + properties: + configs: + description: |- + Configs is the configuration of managedClusterAddon during installation. + User can override the configuration by updating the managedClusterAddon directly. + items: + properties: + group: + default: "" + description: group of the add-on configuration. + type: string + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + resource: + description: resource of the add-on configuration. 
+ minLength: 1 + type: string + required: + - name + - resource + type: object + type: array + name: + description: Name is the name of the placement + minLength: 1 + type: string + namespace: + description: Namespace is the namespace of the placement + minLength: 1 + type: string + rolloutStrategy: + default: + type: All + description: |- + The rollout strategy to apply addon configurations change. + The rollout strategy only watches the addon configurations defined in ClusterManagementAddOn. + properties: + all: + description: All defines required fields for RolloutStrategy + type All + properties: + maxFailures: + anyOf: + - type: integer + - type: string + default: 0 + description: |- + MaxFailures is a percentage or number of clusters in the current rollout that can fail before + proceeding to the next rollout. Fail means the cluster has a failed status or timeout status + (does not reach successful status after ProgressDeadline). + Once the MaxFailures is breached, the rollout will stop. + MaxFailures is only considered for rollout types Progressive and ProgressivePerGroup. For + Progressive, this is considered over the total number of clusters. For ProgressivePerGroup, + this is considered according to the size of the current group. For both Progressive and + ProgressivePerGroup, the MaxFailures does not apply for MandatoryDecisionGroups, which tolerate + no failures. + Default is that no failures are tolerated. + pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + x-kubernetes-int-or-string: true + minSuccessTime: + default: "0" + description: |- + MinSuccessTime is a "soak" time. In other words, the minimum amount of time the workload + applier controller will wait from the start of each rollout before proceeding (assuming a + successful state has been reached and MaxFailures wasn't breached). + MinSuccessTime is only considered for rollout types Progressive and ProgressivePerGroup. + The default value is 0 meaning the workload applier proceeds immediately after a successful + state is reached. + MinSuccessTime must be defined in [0-9h]|[0-9m]|[0-9s] format examples; 2h , 90m , 360s + type: string + progressDeadline: + default: None + description: |- + ProgressDeadline defines how long workload applier controller will wait for the workload to + reach a successful state in the cluster. + If the workload does not reach a successful state after ProgressDeadline, will stop waiting + and workload will be treated as "timeout" and be counted into MaxFailures. Once the MaxFailures + is breached, the rollout will stop. + ProgressDeadline default value is "None", meaning the workload applier will wait for a + successful state indefinitely. + ProgressDeadline must be defined in [0-9h]|[0-9m]|[0-9s] format examples; 2h , 90m , 360s + pattern: ^(([0-9])+[h|m|s])|None$ + type: string + type: object + progressive: + description: Progressive defines required fields for + RolloutStrategy type Progressive + properties: + mandatoryDecisionGroups: + description: |- + List of the decision groups names or indexes to apply the workload first and fail if workload + did not reach successful state. + GroupName or GroupIndex must match with the decisionGroups defined in the placement's + decisionStrategy + items: + description: |- + MandatoryDecisionGroup set the decision group name or group index. + GroupName is considered first to select the decisionGroups then GroupIndex. 
+ properties: + groupIndex: + description: |- + GroupIndex of the decision group should match the placementDecisions label value with label key + cluster.open-cluster-management.io/decision-group-index + format: int32 + type: integer + groupName: + description: |- + GroupName of the decision group should match the placementDecisions label value with label key + cluster.open-cluster-management.io/decision-group-name + type: string + type: object + type: array + maxConcurrency: + anyOf: + - type: integer + - type: string + description: |- + MaxConcurrency is the max number of clusters to deploy workload concurrently. The default value + for MaxConcurrency is determined from the clustersPerDecisionGroup defined in the + placement->DecisionStrategy. + pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + x-kubernetes-int-or-string: true + maxFailures: + anyOf: + - type: integer + - type: string + default: 0 + description: |- + MaxFailures is a percentage or number of clusters in the current rollout that can fail before + proceeding to the next rollout. Fail means the cluster has a failed status or timeout status + (does not reach successful status after ProgressDeadline). + Once the MaxFailures is breached, the rollout will stop. + MaxFailures is only considered for rollout types Progressive and ProgressivePerGroup. For + Progressive, this is considered over the total number of clusters. For ProgressivePerGroup, + this is considered according to the size of the current group. For both Progressive and + ProgressivePerGroup, the MaxFailures does not apply for MandatoryDecisionGroups, which tolerate + no failures. + Default is that no failures are tolerated. + pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + x-kubernetes-int-or-string: true + minSuccessTime: + default: "0" + description: |- + MinSuccessTime is a "soak" time. In other words, the minimum amount of time the workload + applier controller will wait from the start of each rollout before proceeding (assuming a + successful state has been reached and MaxFailures wasn't breached). + MinSuccessTime is only considered for rollout types Progressive and ProgressivePerGroup. + The default value is 0 meaning the workload applier proceeds immediately after a successful + state is reached. + MinSuccessTime must be defined in [0-9h]|[0-9m]|[0-9s] format examples; 2h , 90m , 360s + type: string + progressDeadline: + default: None + description: |- + ProgressDeadline defines how long workload applier controller will wait for the workload to + reach a successful state in the cluster. + If the workload does not reach a successful state after ProgressDeadline, will stop waiting + and workload will be treated as "timeout" and be counted into MaxFailures. Once the MaxFailures + is breached, the rollout will stop. + ProgressDeadline default value is "None", meaning the workload applier will wait for a + successful state indefinitely. + ProgressDeadline must be defined in [0-9h]|[0-9m]|[0-9s] format examples; 2h , 90m , 360s + pattern: ^(([0-9])+[h|m|s])|None$ + type: string + type: object + progressivePerGroup: + description: ProgressivePerGroup defines required fields + for RolloutStrategy type ProgressivePerGroup + properties: + mandatoryDecisionGroups: + description: |- + List of the decision groups names or indexes to apply the workload first and fail if workload + did not reach successful state. 
+ GroupName or GroupIndex must match with the decisionGroups defined in the placement's + decisionStrategy + items: + description: |- + MandatoryDecisionGroup set the decision group name or group index. + GroupName is considered first to select the decisionGroups then GroupIndex. + properties: + groupIndex: + description: |- + GroupIndex of the decision group should match the placementDecisions label value with label key + cluster.open-cluster-management.io/decision-group-index + format: int32 + type: integer + groupName: + description: |- + GroupName of the decision group should match the placementDecisions label value with label key + cluster.open-cluster-management.io/decision-group-name + type: string + type: object + type: array + maxFailures: + anyOf: + - type: integer + - type: string + default: 0 + description: |- + MaxFailures is a percentage or number of clusters in the current rollout that can fail before + proceeding to the next rollout. Fail means the cluster has a failed status or timeout status + (does not reach successful status after ProgressDeadline). + Once the MaxFailures is breached, the rollout will stop. + MaxFailures is only considered for rollout types Progressive and ProgressivePerGroup. For + Progressive, this is considered over the total number of clusters. For ProgressivePerGroup, + this is considered according to the size of the current group. For both Progressive and + ProgressivePerGroup, the MaxFailures does not apply for MandatoryDecisionGroups, which tolerate + no failures. + Default is that no failures are tolerated. + pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + x-kubernetes-int-or-string: true + minSuccessTime: + default: "0" + description: |- + MinSuccessTime is a "soak" time. In other words, the minimum amount of time the workload + applier controller will wait from the start of each rollout before proceeding (assuming a + successful state has been reached and MaxFailures wasn't breached). + MinSuccessTime is only considered for rollout types Progressive and ProgressivePerGroup. + The default value is 0 meaning the workload applier proceeds immediately after a successful + state is reached. + MinSuccessTime must be defined in [0-9h]|[0-9m]|[0-9s] format examples; 2h , 90m , 360s + type: string + progressDeadline: + default: None + description: |- + ProgressDeadline defines how long workload applier controller will wait for the workload to + reach a successful state in the cluster. + If the workload does not reach a successful state after ProgressDeadline, will stop waiting + and workload will be treated as "timeout" and be counted into MaxFailures. Once the MaxFailures + is breached, the rollout will stop. + ProgressDeadline default value is "None", meaning the workload applier will wait for a + successful state indefinitely. + ProgressDeadline must be defined in [0-9h]|[0-9m]|[0-9s] format examples; 2h , 90m , 360s + pattern: ^(([0-9])+[h|m|s])|None$ + type: string + type: object + type: + default: All + enum: + - All + - Progressive + - ProgressivePerGroup + type: string + type: object + required: + - name + - namespace + type: object + type: array + x-kubernetes-list-map-keys: + - namespace + - name + x-kubernetes-list-type: map + type: + default: Manual + description: |- + Type is the type of the install strategy, it can be: + - Manual: no automatic install + - Placements: install to clusters selected by placements. 
+ enum: + - Manual + - Placements + type: string + type: object + supportedConfigs: + description: |- + supportedConfigs is a list of configuration types supported by add-on. + An empty list means the add-on does not require configurations. + The default is an empty list + items: + description: ConfigMeta represents a collection of metadata information + for add-on configuration. + properties: + defaultConfig: + description: |- + defaultConfig represents the namespace and name of the default add-on configuration. + In scenario where all add-ons have a same configuration. + properties: + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + required: + - name + type: object + group: + default: "" + description: group of the add-on configuration. + type: string + resource: + description: resource of the add-on configuration. + minLength: 1 + type: string + required: + - resource + type: object + type: array + x-kubernetes-list-map-keys: + - group + - resource + x-kubernetes-list-type: map + type: object + status: + description: status represents the current status of cluster management + add-on. + properties: + defaultconfigReferences: + description: defaultconfigReferences is a list of current add-on default + configuration references. + items: + description: |- + DefaultConfigReference is a reference to the current add-on configuration. + This resource is used to record the configuration resource for the current add-on. + properties: + desiredConfig: + description: desiredConfig record the desired config spec hash. + properties: + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + specHash: + description: spec hash for an add-on configuration. + type: string + required: + - name + type: object + group: + default: "" + description: group of the add-on configuration. + type: string + resource: + description: resource of the add-on configuration. + minLength: 1 + type: string + required: + - resource + type: object + type: array + installProgressions: + description: installProgression is a list of current add-on configuration + references per placement. + items: + properties: + conditions: + description: conditions describe the state of the managed and + monitored components for the operator. + items: + description: Condition contains details for one aspect of + the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. 
+ format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, + Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + configReferences: + description: configReferences is a list of current add-on configuration + references. + items: + description: |- + InstallConfigReference is a reference to the current add-on configuration. + This resource is used to record the configuration resource for the current add-on. + properties: + desiredConfig: + description: desiredConfig record the desired config name + and spec hash. + properties: + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + specHash: + description: spec hash for an add-on configuration. + type: string + required: + - name + type: object + group: + default: "" + description: group of the add-on configuration. + type: string + lastAppliedConfig: + description: |- + lastAppliedConfig records the config spec hash when the all the corresponding + ManagedClusterAddOn are applied successfully. + properties: + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + specHash: + description: spec hash for an add-on configuration. + type: string + required: + - name + type: object + lastKnownGoodConfig: + description: |- + lastKnownGoodConfig records the last known good config spec hash. + For fresh install or rollout with type UpdateAll or RollingUpdate, the + lastKnownGoodConfig is the same as lastAppliedConfig. + For rollout with type RollingUpdateWithCanary, the lastKnownGoodConfig + is the last successfully applied config spec hash of the canary placement. + properties: + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + specHash: + description: spec hash for an add-on configuration. + type: string + required: + - name + type: object + resource: + description: resource of the add-on configuration. 
+ minLength: 1 + type: string + required: + - resource + type: object + type: array + name: + description: Name is the name of the placement + minLength: 1 + type: string + namespace: + description: Namespace is the namespace of the placement + minLength: 1 + type: string + required: + - name + - namespace + type: object + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_01_addon.open-cluster-management.io_managedclusteraddons.crd.yaml b/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_01_addon.open-cluster-management.io_managedclusteraddons.crd.yaml new file mode 100644 index 00000000..779b39fc --- /dev/null +++ b/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_01_addon.open-cluster-management.io_managedclusteraddons.crd.yaml @@ -0,0 +1,405 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: managedclusteraddons.addon.open-cluster-management.io +spec: + group: addon.open-cluster-management.io + names: + kind: ManagedClusterAddOn + listKind: ManagedClusterAddOnList + plural: managedclusteraddons + shortNames: + - mca + - mcas + singular: managedclusteraddon + preserveUnknownFields: false + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.conditions[?(@.type=="Available")].status + name: Available + type: string + - jsonPath: .status.conditions[?(@.type=="Degraded")].status + name: Degraded + type: string + - jsonPath: .status.conditions[?(@.type=="Progressing")].status + name: Progressing + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ManagedClusterAddOn is the Custom Resource object which holds the current state + of an add-on. This object is used by add-on operators to convey their state. + This resource should be created in the ManagedCluster namespace. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec holds configuration that could apply to any operator. + properties: + configs: + description: |- + configs is a list of add-on configurations. + In scenario where the current add-on has its own configurations. + An empty list means there are no default configurations for add-on. + The default is an empty list + items: + properties: + group: + default: "" + description: group of the add-on configuration. + type: string + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + resource: + description: resource of the add-on configuration. 
+ minLength: 1 + type: string + required: + - name + - resource + type: object + type: array + installNamespace: + default: open-cluster-management-agent-addon + description: |- + installNamespace is the namespace on the managed cluster to install the addon agent. + If it is not set, open-cluster-management-agent-addon namespace is used to install the addon agent. + maxLength: 63 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + type: object + status: + description: |- + status holds the information about the state of an operator. It is consistent with status information across + the Kubernetes ecosystem. + properties: + addOnConfiguration: + description: |- + Deprecated: Use configReferences instead. + addOnConfiguration is a reference to configuration information for the add-on. + This resource is used to locate the configuration resource for the add-on. + properties: + crName: + description: |- + crName is the name of the CR used to configure instances of the managed add-on. + This field should be configured if add-on CR have a consistent name across the all of the ManagedCluster instaces. + type: string + crdName: + description: |- + crdName is the name of the CRD used to configure instances of the managed add-on. + This field should be configured if the add-on have a CRD that controls the configuration of the add-on. + type: string + lastObservedGeneration: + description: lastObservedGeneration is the observed generation + of the custom resource for the configuration of the addon. + format: int64 + type: integer + type: object + addOnMeta: + description: |- + addOnMeta is a reference to the metadata information for the add-on. + This should be same as the addOnMeta for the corresponding ClusterManagementAddOn resource. + properties: + description: + description: description represents the detailed description of + the add-on. + type: string + displayName: + description: displayName represents the name of add-on that will + be displayed. + type: string + type: object + conditions: + description: conditions describe the state of the managed and monitored + components for the operator. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. 
+ maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + configReferences: + description: |- + configReferences is a list of current add-on configuration references. + This will be overridden by the clustermanagementaddon configuration references. + items: + description: |- + ConfigReference is a reference to the current add-on configuration. + This resource is used to locate the configuration resource for the current add-on. + properties: + desiredConfig: + description: desiredConfig record the desired config spec hash. + properties: + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + specHash: + description: spec hash for an add-on configuration. + type: string + required: + - name + type: object + group: + default: "" + description: group of the add-on configuration. + type: string + lastAppliedConfig: + description: lastAppliedConfig record the config spec hash when + the corresponding ManifestWork is applied successfully. + properties: + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + specHash: + description: spec hash for an add-on configuration. + type: string + required: + - name + type: object + lastObservedGeneration: + description: |- + Deprecated: Use LastAppliedConfig instead + lastObservedGeneration is the observed generation of the add-on configuration. + format: int64 + type: integer + name: + description: name of the add-on configuration. + minLength: 1 + type: string + namespace: + description: |- + namespace of the add-on configuration. + If this field is not set, the configuration is in the cluster scope. + type: string + resource: + description: resource of the add-on configuration. + minLength: 1 + type: string + required: + - name + - resource + type: object + type: array + healthCheck: + description: |- + healthCheck indicates how to check the healthiness status of the current addon. It should be + set by each addon implementation, by default, the lease mode will be used. + properties: + mode: + default: Lease + description: mode indicates which mode will be used to check the + healthiness status of the addon. + enum: + - Lease + - Customized + type: string + type: object + namespace: + description: |- + namespace is the namespace on the managedcluster to put registration secret or lease for the addon. It is + required when registration is set or healthcheck mode is Lease. + type: string + registrations: + description: |- + registrations is the configurations for the addon agent to register to hub. It should be set by each addon controller + on hub to define how the addon agent on managedcluster is registered. 
With the registration defined, + The addon agent can access to kube apiserver with kube style API or other endpoints on hub cluster with client + certificate authentication. A csr will be created per registration configuration. If more than one + registrationConfig is defined, a csr will be created for each registration configuration. It is not allowed that + multiple registrationConfigs have the same signer name. After the csr is approved on the hub cluster, the klusterlet + agent will create a secret in the installNamespace for the registrationConfig. If the signerName is + "kubernetes.io/kube-apiserver-client", the secret name will be "{addon name}-hub-kubeconfig" whose contents includes + key/cert and kubeconfig. Otherwise, the secret name will be "{addon name}-{signer name}-client-cert" whose contents includes key/cert. + items: + description: |- + RegistrationConfig defines the configuration of the addon agent to register to hub. The Klusterlet agent will + create a csr for the addon agent with the registrationConfig. + properties: + signerName: + description: signerName is the name of signer that addon agent + will use to create csr. + maxLength: 571 + minLength: 5 + pattern: ^([a-z0-9][a-z0-9-]*[a-z0-9]\.)+[a-z]+\/[a-z0-9-\.]+$ + type: string + subject: + description: |- + subject is the user subject of the addon agent to be registered to the hub. + If it is not set, the addon agent will have the default subject + "subject": { + "user": "system:open-cluster-management:cluster:{clusterName}:addon:{addonName}:agent:{agentName}", + "groups: ["system:open-cluster-management:cluster:{clusterName}:addon:{addonName}", + "system:open-cluster-management:addon:{addonName}", "system:authenticated"] + } + properties: + groups: + description: groups is the user group of the addon agent. + items: + type: string + type: array + organizationUnit: + description: organizationUnit is the ou of the addon agent + items: + type: string + type: array + user: + description: user is the user name of the addon agent. + type: string + type: object + required: + - signerName + type: object + type: array + relatedObjects: + description: |- + relatedObjects is a list of objects that are "interesting" or related to this operator. Common uses are: + 1. the detailed resource driving the operator + 2. operator namespaces + 3. operand namespaces + 4. related ClusterManagementAddon resource + items: + description: ObjectReference contains enough information to let + you inspect or modify the referred object. + properties: + group: + description: group of the referent. + type: string + name: + description: name of the referent. + type: string + namespace: + description: namespace of the referent. + type: string + resource: + description: resource of the referent. + type: string + required: + - group + - name + - resource + type: object + type: array + supportedConfigs: + description: |- + SupportedConfigs is a list of configuration types that are allowed to override the add-on configurations defined + in ClusterManagementAddOn spec. + The default is an empty list, which means the add-on configurations can not be overridden. + items: + description: ConfigGroupResource represents the GroupResource of + the add-on configuration + properties: + group: + default: "" + description: group of the add-on configuration. + type: string + resource: + description: resource of the add-on configuration. 
+ minLength: 1 + type: string + required: + - resource + type: object + type: array + x-kubernetes-list-map-keys: + - group + - resource + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_02_addon.open-cluster-management.io_addondeploymentconfigs.crd.yaml b/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_02_addon.open-cluster-management.io_addondeploymentconfigs.crd.yaml new file mode 100644 index 00000000..eb41b145 --- /dev/null +++ b/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_02_addon.open-cluster-management.io_addondeploymentconfigs.crd.yaml @@ -0,0 +1,279 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: addondeploymentconfigs.addon.open-cluster-management.io +spec: + group: addon.open-cluster-management.io + names: + kind: AddOnDeploymentConfig + listKind: AddOnDeploymentConfigList + plural: addondeploymentconfigs + singular: addondeploymentconfig + preserveUnknownFields: false + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + AddOnDeploymentConfig represents a configuration to customize the deployments of an add-on. + For example, you can specify the NodePlacement to control the scheduling of the add-on agents. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec represents a desired configuration for an add-on. + properties: + agentInstallNamespace: + default: open-cluster-management-agent-addon + description: AgentInstallNamespace is the namespace where the add-on + agent should be installed on the managed cluster. + maxLength: 63 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + customizedVariables: + description: |- + CustomizedVariables is a list of name-value variables for the current add-on deployment. + The add-on implementation can use these variables to render its add-on deployment. + The default is an empty list. + items: + description: CustomizedVariable represents a customized variable + for add-on deployment. + properties: + name: + description: Name of this variable. + maxLength: 255 + pattern: ^[a-zA-Z_][_a-zA-Z0-9]*$ + type: string + value: + description: Value of this variable. + maxLength: 1024 + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + nodePlacement: + description: |- + NodePlacement enables explicit control over the scheduling of the add-on agents on the + managed cluster. + All add-on agent pods are expected to comply with this node placement. 
+ If the placement is nil, the placement is not specified, it will be omitted. + If the placement is an empty object, the placement will match all nodes and tolerate nothing. + properties: + nodeSelector: + additionalProperties: + type: string + description: |- + NodeSelector defines which Nodes the Pods are scheduled on. + If the selector is an empty list, it will match all nodes. + The default is an empty list. + type: object + tolerations: + description: |- + Tolerations is attached by pods to tolerate any taint that matches + the triple using the matching operator . + If the tolerations is an empty list, it will tolerate nothing. + The default is an empty list. + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + type: object + proxyConfig: + description: |- + ProxyConfig holds proxy settings for add-on agent on the managed cluster. + Empty means no proxy settings is available. + properties: + caBundle: + description: |- + CABundle is a CA certificate bundle to verify the proxy server. + And it's only useful when HTTPSProxy is set and a HTTPS proxy server is specified. + format: byte + type: string + httpProxy: + description: HTTPProxy is the URL of the proxy for HTTP requests + type: string + httpsProxy: + description: HTTPSProxy is the URL of the proxy for HTTPS requests + type: string + noProxy: + description: |- + NoProxy is a comma-separated list of hostnames and/or CIDRs and/or IPs for which the proxy + should not be used. + type: string + type: object + registries: + description: |- + Registries describes how to override images used by the addon agent on the managed cluster. + the following example will override image "quay.io/open-cluster-management/addon-agent" to + "quay.io/ocm/addon-agent" when deploying the addon agent + + registries: + - source: quay.io/open-cluster-management/addon-agent + mirror: quay.io/ocm/addon-agent + items: + description: ImageMirror describes how to mirror images from a source + properties: + mirror: + description: Mirror is the mirrored registry of the Source. + Will be ignored if Mirror is empty. 
+ type: string + source: + description: Source is the source registry. All image registries + will be replaced by Mirror if Source is empty. + type: string + required: + - mirror + type: object + type: array + resourceRequirements: + description: |- + ResourceRequirements specify the resources required by add-on agents. + If a container matches multiple ContainerResourceRequirements, the last matched configuration in the + array will take precedence. + items: + description: ContainerResourceRequirements defines resources required + by one or a group of containers. + properties: + containerID: + description: |- + ContainerID is a unique identifier for an agent container. It consists of three parts: resource types, + resource name, and container name, separated by ':'. The format follows + '{resource_types}:{resource_name}:{container_name}' where + 1). Supported resource types include deployments, daemonsets, statefulsets, replicasets, jobs, + cronjobs and pods; + 2). Wildcards (*) can be used in any part to match multiple containers. For example, '*:*:*' + matches all containers of the agent. + pattern: ^(deployments|daemonsets|statefulsets|replicasets|jobs|cronjobs|pods|\*):.+:.+$ + type: string + resources: + description: Compute resources required by matched containers. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + required: + - containerID + - resources + type: object + type: array + x-kubernetes-list-map-keys: + - containerID + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_03_addon.open-cluster-management.io_addontemplates.crd.yaml b/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_03_addon.open-cluster-management.io_addontemplates.crd.yaml new file mode 100644 index 00000000..a4e7deeb --- /dev/null +++ b/fleetconfig-controller/charts/fleetconfig-controller/crds/0000_03_addon.open-cluster-management.io_addontemplates.crd.yaml @@ -0,0 +1,594 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + creationTimestamp: null + name: addontemplates.addon.open-cluster-management.io +spec: + group: addon.open-cluster-management.io + names: + kind: AddOnTemplate + listKind: AddOnTemplateList + plural: addontemplates + singular: addontemplate + preserveUnknownFields: false + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.addonName + name: ADDON NAME + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + AddOnTemplate is the Custom Resource object, it is used to describe + how to deploy the addon agent and how to register the addon. + + AddOnTemplate is a cluster-scoped resource, and will only be used + on the hub cluster. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + spec holds the registration configuration for the addon and the + addon agent resources yaml description. + properties: + addonName: + description: AddonName represents the name of the addon which the + template belongs to + type: string + agentSpec: + description: AgentSpec describes what/how the kubernetes resources + of the addon agent to be deployed on a managed cluster. + properties: + deleteOption: + description: |- + DeleteOption represents deletion strategy when the manifestwork is deleted. + Foreground deletion strategy is applied to all the resource in this manifestwork if it is not set. + properties: + propagationPolicy: + default: Foreground + description: |- + propagationPolicy can be Foreground, Orphan or SelectivelyOrphan + SelectivelyOrphan should be rarely used. It is provided for cases where particular resources is transfering + ownership from one ManifestWork to another or another management unit. + Setting this value will allow a flow like + 1. create manifestwork/2 to manage foo + 2. update manifestwork/1 to selectively orphan foo + 3. 
remove foo from manifestwork/1 without impacting continuity because manifestwork/2 adopts it. + enum: + - Foreground + - Orphan + - SelectivelyOrphan + type: string + selectivelyOrphans: + description: selectivelyOrphan represents a list of resources + following orphan deletion stratecy + properties: + orphaningRules: + description: |- + orphaningRules defines a slice of orphaningrule. + Each orphaningrule identifies a single resource included in this manifestwork + items: + description: OrphaningRule identifies a single resource + included in this manifestwork to be orphaned + properties: + group: + description: |- + Group is the API Group of the Kubernetes resource, + empty string indicates it is in core group. + type: string + name: + description: Name is the name of the Kubernetes + resource. + type: string + namespace: + description: |- + Name is the namespace of the Kubernetes resource, empty string indicates + it is a cluster scoped resource. + type: string + resource: + description: Resource is the resource name of the + Kubernetes resource. + type: string + required: + - name + - resource + type: object + type: array + type: object + ttlSecondsAfterFinished: + description: |- + TTLSecondsAfterFinished limits the lifetime of a ManifestWork that has been marked Complete + by one or more conditionRules set for its manifests. If this field is set, and + the manifestwork has completed, then it is elligible to be automatically deleted. + If this field is unset, the manifestwork won't be automatically deleted even afer completion. + If this field is set to zero, the manfiestwork becomes elligible to be deleted immediately + after completion. + format: int64 + type: integer + type: object + executor: + description: |- + Executor is the configuration that makes the work agent to perform some pre-request processing/checking. + e.g. the executor identity tells the work agent to check the executor has sufficient permission to write + the workloads to the local managed cluster. + Note that nil executor is still supported for backward-compatibility which indicates that the work agent + will not perform any additional actions before applying resources. + properties: + subject: + description: |- + Subject is the subject identity which the work agent uses to talk to the + local cluster when applying the resources. + properties: + serviceAccount: + description: |- + ServiceAccount is for identifying which service account to use by the work agent. + Only required if the type is "ServiceAccount". + properties: + name: + description: Name is the name of the service account. + maxLength: 253 + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)$ + type: string + namespace: + description: Namespace is the namespace of the service + account. + maxLength: 253 + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)$ + type: string + required: + - name + - namespace + type: object + type: + description: |- + Type is the type of the subject identity. + Supported types are: "ServiceAccount". + enum: + - ServiceAccount + type: string + required: + - type + type: object + type: object + manifestConfigs: + description: ManifestConfigs represents the configurations of + manifests defined in workload field. + items: + description: ManifestConfigOption represents the configurations + of a manifest defined in workload field. 
+ properties: + conditionRules: + description: ConditionRules defines how to set manifestwork + conditions for a specific manifest. + items: + properties: + celExpressions: + description: |- + CelExpressions defines the CEL expressions to be evaluated for the condition. + Final result is the logical AND of all expressions. + items: + type: string + type: array + condition: + description: |- + Condition is the type of condition that is set based on this rule. + Any condition is supported, but certain special conditions can be used to + to control higher level behaviors of the manifestwork. + If the condition is Complete, the manifest will no longer be updated once completed. + type: string + message: + description: Message is set on the condition created + for this rule + type: string + messageExpression: + description: |- + MessageExpression uses a CEL expression to generate a message for the condition + Will override message if both are set and messageExpression returns a non-empty string. + Variables: + - object: The current instance of the manifest + - result: Boolean result of the CEL expressions + type: string + type: + description: |- + Type defines how a manifest should be evaluated for a condition. + It can be CEL, or WellKnownConditions. + If the type is CEL, user should specify the celExpressions field + If the type is WellKnownConditions, certain common types in k8s.io/api will be considered + completed as defined by hardcoded rules. + enum: + - WellKnownConditions + - CEL + type: string + required: + - condition + - type + type: object + x-kubernetes-validations: + - message: Condition is required for CEL rules + rule: self.type != 'CEL' || self.condition != "" + type: array + x-kubernetes-list-map-keys: + - condition + x-kubernetes-list-type: map + feedbackRules: + description: |- + FeedbackRules defines what resource status field should be returned. If it is not set or empty, + no feedback rules will be honored. + items: + properties: + jsonPaths: + description: JsonPaths defines the json path under + status field to be synced. + items: + properties: + name: + description: Name represents the alias name + for this field + type: string + path: + description: |- + Path represents the json path of the field under status. + The path must point to a field with single value in the type of integer, bool or string. + If the path points to a non-existing field, no value will be returned. + If the path points to a structure, map or slice, no value will be returned and the status conddition + of StatusFeedBackSynced will be set as false. + Ref to https://kubernetes.io/docs/reference/kubectl/jsonpath/ on how to write a jsonPath. + type: string + version: + description: |- + Version is the version of the Kubernetes resource. + If it is not specified, the resource with the semantically latest version is + used to resolve the path. + type: string + required: + - name + - path + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: + description: |- + Type defines the option of how status can be returned. + It can be jsonPaths or wellKnownStatus. + If the type is JSONPaths, user should specify the jsonPaths field + If the type is WellKnownStatus, certain common fields of status defined by a rule only + for types in in k8s.io/api and open-cluster-management/api will be reported, + If these status fields do not exist, no values will be reported. 
+ enum: + - WellKnownStatus + - JSONPaths + type: string + required: + - type + type: object + type: array + resourceIdentifier: + description: |- + ResourceIdentifier represents the group, resource, name and namespace of a resoure. + iff this refers to a resource not created by this manifest work, the related rules will not be executed. + properties: + group: + description: |- + Group is the API Group of the Kubernetes resource, + empty string indicates it is in core group. + type: string + name: + description: Name is the name of the Kubernetes resource. + type: string + namespace: + description: |- + Name is the namespace of the Kubernetes resource, empty string indicates + it is a cluster scoped resource. + type: string + resource: + description: Resource is the resource name of the Kubernetes + resource. + type: string + required: + - name + - resource + type: object + updateStrategy: + description: |- + UpdateStrategy defines the strategy to update this manifest. UpdateStrategy is Update + if it is not set. + properties: + serverSideApply: + description: |- + serverSideApply defines the configuration for server side apply. It is honored only when the + type of the updateStrategy is ServerSideApply + properties: + fieldManager: + default: work-agent + description: |- + FieldManager is the manager to apply the resource. It is work-agent by default, but can be other name with work-agent + as the prefix. + pattern: ^work-agent + type: string + force: + description: Force represents to force apply the + manifest. + type: boolean + ignoreFields: + description: IgnoreFields defines a list of json + paths in the resource that will not be updated + on the spoke. + items: + properties: + condition: + default: OnSpokePresent + description: |- + Condition defines the condition that the fields should be ignored when apply the resource. + Fields in JSONPaths are all ignored when condition is met, otherwise no fields is ignored + in the apply operation. + enum: + - OnSpokePresent + - OnSpokeChange + type: string + jsonPaths: + description: JSONPaths defines the list of + json path in the resource to be ignored + items: + type: string + minItems: 1 + type: array + required: + - condition + - jsonPaths + type: object + type: array + x-kubernetes-list-map-keys: + - condition + x-kubernetes-list-type: map + type: object + type: + default: Update + description: |- + type defines the strategy to update this manifest, default value is Update. + Update type means to update resource by an update call. + CreateOnly type means do not update resource based on current manifest. + ServerSideApply type means to update resource using server side apply with work-controller as the field manager. + If there is conflict, the related Applied condition of manifest will be in the status of False with the + reason of ApplyConflict. + ReadOnly type means the agent will only check the existence of the resource based on its metadata, + statusFeedBackRules can still be used to get feedbackResults. + enum: + - Update + - CreateOnly + - ServerSideApply + - ReadOnly + type: string + required: + - type + type: object + required: + - resourceIdentifier + type: object + type: array + workload: + description: Workload represents the manifest workload to be deployed + on a managed cluster. + properties: + manifests: + description: Manifests represents a list of kuberenetes resources + to be deployed on a managed cluster. + items: + description: Manifest represents a resource to be deployed + on managed cluster. 
+ type: object + x-kubernetes-embedded-resource: true + x-kubernetes-preserve-unknown-fields: true + type: array + type: object + type: object + registration: + description: Registration holds the registration configuration for + the addon + items: + description: |- + RegistrationSpec describes how to register an addon agent to the hub cluster. + With the registration defined, The addon agent can access to kube apiserver with kube style API + or other endpoints on hub cluster with client certificate authentication. During the addon + registration process, a csr will be created for each Registration on the hub cluster. The + CSR will be approved automatically, After the csr is approved on the hub cluster, the klusterlet + agent will create a secret in the installNamespace for the addon agent. + If the RegistrationType type is KubeClient, the secret name will be "{addon name}-hub-kubeconfig" + whose content includes key/cert and kubeconfig. Otherwise, If the RegistrationType type is + CustomSigner the secret name will be "{addon name}-{signer name}-client-cert" whose content + includes key/cert. + properties: + customSigner: + description: |- + CustomSigner holds the configuration of the CustomSigner type registration + required when the Type is CustomSigner + properties: + signerName: + description: signerName is the name of signer that addon + agent will use to create csr. + maxLength: 571 + minLength: 5 + pattern: ^([a-z0-9][a-z0-9-]*[a-z0-9]\.)+[a-z]+\/[a-z0-9-\.]+$ + type: string + signingCA: + description: |- + SigningCA represents the reference of the secret on the hub cluster to sign the CSR + the secret type must be "kubernetes.io/tls" + Note: The addon manager will not have permission to access the secret by default, so + the user must grant the permission to the addon manager(by creating rolebinding/clusterrolebinding + for the addon-manager serviceaccount "addon-manager-controller-sa"). + properties: + name: + description: Name of the signing CA secret + type: string + namespace: + description: Namespace of the signing CA secret, the + namespace of the addon-manager will be used if it + is not set. + type: string + required: + - name + type: object + subject: + description: |- + Subject is the user subject of the addon agent to be registered to the hub. + If it is not set, the addon agent will have the default subject + "subject": { + "user": "system:open-cluster-management:cluster:{clusterName}:addon:{addonName}:agent:{agentName}", + "groups: ["system:open-cluster-management:cluster:{clusterName}:addon:{addonName}", + "system:open-cluster-management:addon:{addonName}", "system:authenticated"] + } + properties: + groups: + description: groups is the user group of the addon agent. + items: + type: string + type: array + organizationUnit: + description: organizationUnit is the ou of the addon + agent + items: + type: string + type: array + user: + description: user is the user name of the addon agent. + type: string + type: object + required: + - signerName + - signingCA + type: object + kubeClient: + description: KubeClient holds the configuration of the KubeClient + type registration + properties: + hubPermissions: + description: HubPermissions represent the permission configurations + of the addon agent to access the hub cluster + items: + description: |- + HubPermissionConfig configures the permission of the addon agent to access the hub cluster. 
+ Will create a RoleBinding in the same namespace as the managedClusterAddon to bind the user + provided ClusterRole/Role to the "system:open-cluster-management:cluster::addon:" + Group. + properties: + currentCluster: + description: |- + CurrentCluster contains the configuration of CurrentCluster type binding. + It is required when the type is CurrentCluster. + properties: + clusterRoleName: + description: |- + ClusterRoleName is the name of the clusterrole the addon agent is bound. A rolebinding + will be created referring to this cluster role in each cluster namespace. + The user must make sure the clusterrole exists on the hub cluster. + type: string + required: + - clusterRoleName + type: object + singleNamespace: + description: |- + SingleNamespace contains the configuration of SingleNamespace type binding. + It is required when the type is SingleNamespace + properties: + namespace: + description: |- + Namespace is the namespace the addon agent has permissions to bind to. A rolebinding + will be created in this namespace referring to the RoleRef. + type: string + roleRef: + description: |- + RoleRef is an reference to the permission resource. it could be a role or a cluster role, + the user must make sure it exist on the hub cluster. + properties: + apiGroup: + description: APIGroup is the group for the + resource being referenced + type: string + kind: + description: Kind is the type of resource + being referenced + type: string + name: + description: Name is the name of resource + being referenced + type: string + required: + - apiGroup + - kind + - name + type: object + x-kubernetes-map-type: atomic + required: + - namespace + - roleRef + type: object + type: + description: |- + Type of the permissions setting. It defines how to bind the roleRef on the hub cluster. It can be: + - CurrentCluster: Bind the roleRef to the namespace with the same name as the managedCluster. + - SingleNamespace: Bind the roleRef to the namespace specified by SingleNamespaceBindingConfig. + enum: + - CurrentCluster + - SingleNamespace + type: string + required: + - type + type: object + type: array + type: object + type: + description: |- + Type of the registration configuration, it supports: + - KubeClient: the addon agent can access the hub kube apiserver with kube style API. + the signer name should be "kubernetes.io/kube-apiserver-client". When this type is + used, the KubeClientRegistrationConfig can be used to define the permission of the + addon agent to access the hub cluster + - CustomSigner: the addon agent can access the hub cluster through user-defined endpoints. + When this type is used, the CustomSignerRegistrationConfig can be used to define how + to issue the client certificate for the addon agent. 
+ enum: + - KubeClient + - CustomSigner + type: string + required: + - type + type: object + type: array + required: + - addonName + - agentSpec + type: object + required: + - spec + type: object + served: true + storage: true + subresources: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/clusterissuer.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/clusterissuer.yaml index 434175f4..8da1e529 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/clusterissuer.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/clusterissuer.yaml @@ -1,4 +1,6 @@ {{- if .Values.certificates.clusterIssuer.enabled -}} +# yaml-language-server: $schema=https://datreeio.github.io/CRDs-catalog/cert-manager.io/clusterissuer_v1.json + apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml index 9a661072..4c3b4b6a 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml @@ -32,17 +32,22 @@ spec: - args: - "--leader-elect" - "--health-probe-bind-address=:{{ .Values.healthCheck.port }}" - {{ if .Values.admissionWebhooks.enabled }} + {{- if .Values.admissionWebhooks.enabled }} - "--use-webhook=true" - "--webhook-port={{ .Values.webhookService.port }}" - "--webhook-cert-dir={{ .Values.admissionWebhooks.certificate.mountPath }}" - "--spoke-concurrent-reconciles={{ .Values.spokeConcurrentReconciles }}" - {{ end }} + - "--cluster-type=hub" + {{- end }} command: - /manager env: - name: KUBERNETES_CLUSTER_DOMAIN value: {{ quote .Values.kubernetesClusterDomain }} + - name: CONTROLLER_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace image: {{ include "controller.image" . 
}} imagePullPolicy: {{ quote .Values.image.pullPolicy }} name: manager @@ -55,11 +60,13 @@ spec: - containerPort: {{ .Values.webhookService.port }} name: webhook-server protocol: TCP + {{- end }} - containerPort: {{ .Values.healthCheck.port }} name: healthz protocol: TCP livenessProbe: {{- toYaml .Values.livenessProbe | nindent 10 }} readinessProbe: {{- toYaml .Values.readinessProbe | nindent 10 }} + {{- if .Values.admissionWebhooks.enabled }} volumeMounts: - mountPath: {{ .Values.admissionWebhooks.certificate.mountPath }} name: tls-cert-vol diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/add-ons.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/custom-addons.yaml similarity index 100% rename from fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/add-ons.yaml rename to fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/custom-addons.yaml diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml new file mode 100644 index 00000000..9706a87f --- /dev/null +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -0,0 +1,192 @@ +# yaml-language-server: $schema=https://datreeio.github.io/CRDs-catalog/addon.open-cluster-management.io/addontemplate_v1alpha1.json + +apiVersion: addon.open-cluster-management.io/v1alpha1 +kind: AddOnTemplate +metadata: + name: fleetconfig-controller-manager +spec: + addonName: fleetconfig-controller-manager + agentSpec: + workload: + manifests: + - kind: Namespace + apiVersion: v1 + metadata: + name: {{ .Release.Namespace }} + - kind: Deployment + apiVersion: apps/v1 + metadata: + name: {{ include "chart.fullname" . }}-manager + namespace: {{ .Release.Namespace }} + labels: + control-plane: controller-manager + {{- include "chart.labels" . | nindent 12 }} + spec: + replicas: {{ .Values.replicas }} + selector: + matchLabels: + control-plane: controller-manager + {{- include "chart.selectorLabels" . | nindent 14 }} + template: + metadata: + labels: + control-plane: controller-manager + {{- include "chart.selectorLabels" . | nindent 16 }} + annotations: + kubectl.kubernetes.io/default-container: manager + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 16 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 16 }} + serviceAccountName: {{ include "chart.fullname" . }}-manager + terminationGracePeriodSeconds: 10 + containers: + - args: + - "--leader-elect" + - "--health-probe-bind-address=:{{ .Values.healthCheck.port }}" + - "--spoke-concurrent-reconciles=1" + - "--cluster-type=spoke" + command: + - /manager + env: + - name: KUBERNETES_CLUSTER_DOMAIN + value: {{ quote .Values.kubernetesClusterDomain }} + - name: CLUSTER_NAMESPACE + value: '{{ `{{CLUSTER_NAMESPACE}}` }}' + - name: HUB_NAMESPACE + value: '{{ `{{HUB_NAMESPACE}}` }}' + - name: CONTROLLER_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + image: quay.io/open-cluster-management/fleetconfig-controller:local # TODO @arturshadnik - use the actual image + # image: {{ include "controller.image" . 
}} + imagePullPolicy: {{ quote .Values.image.pullPolicy }} + name: manager + resources: + {{- toYaml .Values.resources | nindent 18 }} + securityContext: + {{- toYaml .Values.containerSecurityContext | nindent 18 }} + ports: + - containerPort: {{ .Values.healthCheck.port }} + name: healthz + protocol: TCP + livenessProbe: {{- toYaml .Values.livenessProbe | nindent 18 }} + readinessProbe: {{- toYaml .Values.readinessProbe | nindent 18 }} + - kind: ServiceAccount + apiVersion: v1 + metadata: + name: {{ include "chart.fullname" . }}-manager + namespace: {{ .Release.Namespace }} + - kind: ClusterRole + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: {{ include "chart.fullname" . }}-manager-spoke-role + namespace: {{ .Release.Namespace }} + rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: [""] + resources: ["pods", "serviceaccounts"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterrolebindings", "clusterroles"] + verbs: ["get", "list", "watch"] + - apiGroups: ["operator.open-cluster-management.io"] + resources: ["klusterlets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["work.open-cluster-management.io"] + resources: ["appliedmanifestworks"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["cluster.open-cluster-management.io"] + resources: ["clusterclaims"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - kind: ClusterRoleBinding + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: {{ include "chart.fullname" . }}-manager-spoke-rolebinding + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "chart.fullname" . }}-manager-spoke-role + subjects: + - kind: ServiceAccount + name: {{ include "chart.fullname" . }}-manager + namespace: {{ .Release.Namespace }} + - kind: Role + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: {{ include "chart.fullname" . }}-leader-election-role + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . | nindent 12 }} + rules: + - apiGroups: [""] + resources: [configmaps] + verbs: [get, list, watch, create, update, patch, delete] + - apiGroups: [coordination.k8s.io] + resources: [leases] + verbs: [get, list, watch, create, update, patch, delete] + - apiGroups: [""] + resources: [events] + verbs: [create, patch] + - kind: RoleBinding + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: {{ include "chart.fullname" . }}-leader-election-rolebinding + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . | nindent 12 }} + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "chart.fullname" . }}-leader-election-role + namespace: {{ .Release.Namespace }} + subjects: + - kind: ServiceAccount + name: {{ include "chart.fullname" . }}-manager + namespace: {{ .Release.Namespace }} + registration: # optional + # kubeClient or custom signer, if kubeClient, user and group is in a certain format. 
+ # user is "system:open-cluster-management:cluster:{clusterName}:addon:{addonName}:agent:{agentName}"
+ # group is ["system:open-cluster-management:cluster:{clusterName}:addon:{addonName}",
+ # "system:open-cluster-management:addon:{addonName}", "system:authenticated"]
+ - type: KubeClient
+ kubeClient:
+ hubPermissions:
+ # this isn't sufficient, but any other permissions need to be set by fcc, because we can't template the namespace
+ - type: CurrentCluster
+ currentCluster:
+ clusterRoleName: {{ include "chart.fullname" . }}-manager-role
+ # - type: SingleNamespace
+ # singleNamespace:
+ # # TODO @arturshadnik - figure out how to set this per-cluster's hubRef - probably needs to be done by fcc
+ # namespace: {{ .Release.Namespace }}
+ # # namespace: {{ `{{HUB_CLUSTER_NAMESPACE}}` }}
+ # roleRef:
+ # apiGroup: rbac.authorization.k8s.io
+ # kind: ClusterRole
+ # name: {{ include "chart.fullname" . }}-manager-role
+ # - type: SingleNamespace
+ # singleNamespace:
+ # # namespace: {{ .Release.Namespace }} # TODO @arturshadnik - figure out how to set this per-cluster
+ # # namespace: {{ `{{CLUSTER_NAMESPACE}}` }}
+ # namespace: default
+ # roleRef:
+ # apiGroup: rbac.authorization.k8s.io
+ # kind: ClusterRole
+ # # should be created by user; the addon manager will grant the permission to the agent, so if the
+ # # role/clusterRole contains some permissions that the addon manager doesn't have, user needs to grant
+ # # the permission to the addon-manager (service account open-cluster-management-hub/addon-manager-controller-sa),
+ # # otherwise the addon manager will fail to grant the permission to the agent
+ # name: {{ include "chart.fullname" . }}-manager-role
diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml
new file mode 100644
index 00000000..47ea0897
--- /dev/null
+++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml
@@ -0,0 +1,23 @@
+# yaml-language-server: $schema=https://datreeio.github.io/CRDs-catalog/addon.open-cluster-management.io/clustermanagementaddon_v1alpha1.json
+
+apiVersion: addon.open-cluster-management.io/v1alpha1
+kind: ClusterManagementAddOn
+metadata:
+ name: fleetconfig-controller-manager
+spec:
+ addOnMeta:
+ displayName: FleetConfig Controller Addon
+ description: |
+ fleetconfig-controller-manager is an addon to deploy fleetconfig-controller manager on the managed cluster.
+ It is used to enable decentralized management of spoke clusters.
+ supportedConfigs:
+ - group: addon.open-cluster-management.io
+ resource: addontemplates
+ defaultConfig:
+ name: fleetconfig-controller-manager
+ installStrategy:
+ type: Manual
+ # TODO - use `Placements` once ManagedClusters can be labeled immediately during the registration process.
See https://github.com/open-cluster-management-io/ocm/issues/1195, https://github.com/open-cluster-management-io/ocm/pull/1123 + # placements: + # - namespace: managed-cluster-set-spokes + # name: spokes diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml new file mode 100644 index 00000000..013d0f35 --- /dev/null +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml @@ -0,0 +1,13 @@ +# required to grant addon-manager permissions to bind spoke addon agent's SA to the hub's fleetconfig-controller manager clusterrole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: addon-manager-fleetconfig-controller-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "chart.fullname" . }}-manager-role +subjects: + - kind: ServiceAccount + name: addon-manager-controller-sa + namespace: open-cluster-management-hub \ No newline at end of file diff --git a/fleetconfig-controller/cmd/main.go b/fleetconfig-controller/cmd/main.go index efb90066..db0288de 100644 --- a/fleetconfig-controller/cmd/main.go +++ b/fleetconfig-controller/cmd/main.go @@ -18,7 +18,6 @@ limitations under the License. package main import ( - "crypto/tls" "flag" "fmt" "os" @@ -33,14 +32,10 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" - metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" - "sigs.k8s.io/controller-runtime/pkg/webhook" apiv1alpha1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1alpha1" apiv1beta1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" - controllerv1alpha1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/controller/v1alpha1" - controllerv1beta1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/controller/v1beta1" - webhookv1beta1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/webhook/v1beta1" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/cmd/manager" // +kubebuilder:scaffold:imports ) @@ -58,130 +53,52 @@ func init() { } func main() { - var ( - metricsAddr string - enableLeaderElection bool - probeAddr string - secureMetrics bool - enableHTTP2 bool - useWebhook bool - certDir string - webhookPort int - spokeConcurrentReconciles int - ) + mOpts := manager.Options{ + Scheme: scheme, + } - flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metric endpoint binds to. Use the port :8080. If not set, it will be 0 to disable the metrics server.") - flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") - flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.") - flag.BoolVar(&secureMetrics, "metrics-secure", false, "If set, the metrics endpoint is served securely.") - flag.BoolVar(&enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers.") + flag.StringVar(&mOpts.MetricsAddr, "metrics-bind-address", "0", "The address the metric endpoint binds to. Use the port :8080. 
If not set, it will be 0 to disable the metrics server.") + flag.StringVar(&mOpts.ProbeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + flag.BoolVar(&mOpts.EnableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.") + flag.BoolVar(&mOpts.SecureMetrics, "metrics-secure", false, "If set, the metrics endpoint is served securely.") + flag.BoolVar(&mOpts.EnableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers.") - flag.BoolVar(&useWebhook, "use-webhook", useWebhook, "Enable admission webhooks") - flag.StringVar(&certDir, "webhook-cert-dir", certDir, "Admission webhook cert/key dir") - flag.IntVar(&webhookPort, "webhook-port", webhookPort, "Admission webhook port") + flag.BoolVar(&mOpts.UseWebhook, "use-webhook", mOpts.UseWebhook, "Enable admission webhooks") + flag.StringVar(&mOpts.CertDir, "webhook-cert-dir", mOpts.CertDir, "Admission webhook cert/key dir") + flag.IntVar(&mOpts.WebhookPort, "webhook-port", mOpts.WebhookPort, "Admission webhook port") - flag.IntVar(&spokeConcurrentReconciles, "spoke-concurrent-reconciles", apiv1beta1.SpokeDefaultMaxConcurrentReconciles, fmt.Sprintf("Maximum number of Spoke resources that may be reconciled in parallel. Defaults to %d.", apiv1beta1.SpokeDefaultMaxConcurrentReconciles)) + flag.IntVar(&mOpts.SpokeConcurrentReconciles, "spoke-concurrent-reconciles", apiv1beta1.SpokeDefaultMaxConcurrentReconciles, fmt.Sprintf("Maximum number of Spoke resources that may be reconciled in parallel. Defaults to %d.", apiv1beta1.SpokeDefaultMaxConcurrentReconciles)) + flag.StringVar(&mOpts.ClusterType, "cluster-type", apiv1beta1.ClusterTypeHub, "The type of cluster that this controller instance is installed in.") - opts := zap.Options{ + zOpts := zap.Options{ Development: true, } - opts.BindFlags(flag.CommandLine) + zOpts.BindFlags(flag.CommandLine) flag.Parse() - ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) - - // if the enable-http2 flag is false (the default), http/2 should be disabled - // due to its vulnerabilities. More specifically, disabling http/2 will - // prevent from being vulnerable to the HTTP/2 Stream Cancellation and - // Rapid Reset CVEs. For more information see: - // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 - // - https://github.com/advisories/GHSA-4374-p667-p6c8 - disableHTTP2 := func(c *tls.Config) { - setupLog.Info("disabling http/2") - c.NextProtos = []string{"http/1.1"} - } - - tlsOpts := []func(*tls.Config){} - if !enableHTTP2 { - tlsOpts = append(tlsOpts, disableHTTP2) - } + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&zOpts))) - webhookServer := webhook.NewServer(webhook.Options{ - CertDir: certDir, - Port: webhookPort, - TLSOpts: tlsOpts, - }) - - mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ - Scheme: scheme, - Metrics: metricsserver.Options{ - BindAddress: metricsAddr, - SecureServing: secureMetrics, - TLSOpts: tlsOpts, - }, - WebhookServer: webhookServer, - HealthProbeBindAddress: probeAddr, - LeaderElection: enableLeaderElection, - LeaderElectionID: "9aac6663.open-cluster-management.io", - // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily - // when the Manager ends. This requires the binary to immediately end when the - // Manager is stopped, otherwise, this setting is unsafe. 
Setting this significantly - // speeds up voluntary leader transitions as the new leader don't have to wait - // LeaseDuration time first. - // - // In the default scaffold provided, the program ends immediately after - // the manager stops, so would be fine to enable this option. However, - // if you are doing or is intended to do any operation such as perform cleanups - // after the manager stops then its usage might be unsafe. - // LeaderElectionReleaseOnCancel: true, - }) - if err != nil { - setupLog.Error(err, "unable to start manager") - os.Exit(1) - } - - if err = (&controllerv1alpha1.FleetConfigReconciler{ - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("FleetConfig"), - Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "FleetConfig") - os.Exit(1) - } - - if err := (&controllerv1beta1.HubReconciler{ - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("Hub"), - Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Hub") - os.Exit(1) - } - - if err := (&controllerv1beta1.SpokeReconciler{ - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("Spoke"), - ConcurrentReconciles: spokeConcurrentReconciles, - Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Spoke") - os.Exit(1) - } + var ( + mgr ctrl.Manager + err error + ) - // nolint:goconst - if useWebhook || os.Getenv("ENABLE_WEBHOOKS") != "false" { - if err = apiv1alpha1.SetupFleetConfigWebhookWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create webhook", "webhook", "FleetConfig") + switch mOpts.ClusterType { + case apiv1beta1.ClusterTypeHub: + mgr, err = manager.ForHub(setupLog, mOpts) + if err != nil { + setupLog.Error(err, "unable to start manager") os.Exit(1) } - if err := webhookv1beta1.SetupSpokeWebhookWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create webhook", "webhook", "Spoke") - os.Exit(1) - } - if err := webhookv1beta1.SetupHubWebhookWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create webhook", "webhook", "Hub") + case apiv1beta1.ClusterTypeSpoke: + mgr, err = manager.ForSpoke(setupLog, mOpts) + if err != nil { + setupLog.Error(err, "unable to start manager") os.Exit(1) } + default: + setupLog.Info("unable to create controller for unknown cluster type", "controller", "Spoke", "clusterType", mOpts.ClusterType, "allowed", apiv1beta1.SupportedClusterTypes) + os.Exit(1) } // +kubebuilder:scaffold:builder diff --git a/fleetconfig-controller/cmd/manager/manager.go b/fleetconfig-controller/cmd/manager/manager.go new file mode 100644 index 00000000..611dbd2b --- /dev/null +++ b/fleetconfig-controller/cmd/manager/manager.go @@ -0,0 +1,290 @@ +// Package manager contains functions for configuring a controller manager. 
+package manager + +import ( + "bytes" + "crypto/tls" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" + + apiv1alpha1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1alpha1" + apiv1beta1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" + controllerv1alpha1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/controller/v1alpha1" + controllerv1beta1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/controller/v1beta1" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/kube" + webhookv1beta1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/webhook/v1beta1" + // +kubebuilder:scaffold:imports +) + +// Options are the options for configuring a controller manager +type Options struct { + MetricsAddr string + EnableLeaderElection bool + ProbeAddr string + SecureMetrics bool + EnableHTTP2 bool + UseWebhook bool + CertDir string + WebhookPort int + SpokeConcurrentReconciles int + ClusterType string + Scheme *runtime.Scheme +} + +// ForHub configures a manager instance for a Hub cluster. +func ForHub(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { + // if the EnableHTTP2 flag is false (the default), http/2 should be disabled + // due to its vulnerabilities. More specifically, disabling http/2 will + // prevent from being vulnerable to the HTTP/2 Stream Cancellation and + // Rapid Reset CVEs. For more information see: + // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 + // - https://github.com/advisories/GHSA-4374-p667-p6c8 + disableHTTP2 := func(c *tls.Config) { + setupLog.Info("disabling http/2") + c.NextProtos = []string{"http/1.1"} + } + + tlsOpts := []func(*tls.Config){} + if !opts.EnableHTTP2 { + tlsOpts = append(tlsOpts, disableHTTP2) + } + + webhookServer := webhook.NewServer(webhook.Options{ + CertDir: opts.CertDir, + Port: opts.WebhookPort, + TLSOpts: tlsOpts, + }) + + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + Scheme: opts.Scheme, + Metrics: metricsserver.Options{ + BindAddress: opts.MetricsAddr, + SecureServing: opts.SecureMetrics, + TLSOpts: tlsOpts, + }, + WebhookServer: webhookServer, + HealthProbeBindAddress: opts.ProbeAddr, + LeaderElection: opts.EnableLeaderElection, + LeaderElectionID: "9aac6663.open-cluster-management.io", + // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily + // when the Manager ends. This requires the binary to immediately end when the + // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly + // speeds up voluntary leader transitions as the new leader don't have to wait + // LeaseDuration time first. + // + // In the default scaffold provided, the program ends immediately after + // the manager stops, so would be fine to enable this option. However, + // if you are doing or is intended to do any operation such as perform cleanups + // after the manager stops then its usage might be unsafe. 
+ // LeaderElectionReleaseOnCancel: true, + }) + if err != nil { + return nil, err + } + + if err = (&controllerv1alpha1.FleetConfigReconciler{ + Client: mgr.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("FleetConfig"), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "FleetConfig") + return nil, err + } + + if err := (&controllerv1beta1.HubReconciler{ + Client: mgr.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("Hub"), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Hub") + return nil, err + } + if err := (&controllerv1beta1.SpokeReconciler{ + Client: mgr.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("Spoke"), + ConcurrentReconciles: opts.SpokeConcurrentReconciles, + Scheme: mgr.GetScheme(), + ClusterType: opts.ClusterType, + }).SetupWithManagerForHub(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Spoke") + return nil, err + } + + // nolint:goconst + if (opts.UseWebhook || os.Getenv("ENABLE_WEBHOOKS") != "false") && opts.ClusterType != apiv1beta1.ClusterTypeSpoke { + if err = apiv1alpha1.SetupFleetConfigWebhookWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "FleetConfig") + return nil, err + } + if err := webhookv1beta1.SetupHubWebhookWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "Hub") + return nil, err + } + if err := webhookv1beta1.SetupSpokeWebhookWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "Spoke") + return nil, err + } + } + return mgr, nil +} + +// ForSpoke configures a manager instance for a Spoke cluster. +func ForSpoke(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { + // if the EnableHTTP2 flag is false (the default), http/2 should be disabled + // due to its vulnerabilities. More specifically, disabling http/2 will + // prevent from being vulnerable to the HTTP/2 Stream Cancellation and + // Rapid Reset CVEs. For more information see: + // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 + // - https://github.com/advisories/GHSA-4374-p667-p6c8 + disableHTTP2 := func(c *tls.Config) { + setupLog.Info("disabling http/2") + c.NextProtos = []string{"http/1.1"} + } + + tlsOpts := []func(*tls.Config){} + if !opts.EnableHTTP2 { + tlsOpts = append(tlsOpts, disableHTTP2) + } + + webhookServer := webhook.NewServer(webhook.Options{ + CertDir: opts.CertDir, + Port: opts.WebhookPort, + TLSOpts: tlsOpts, + }) + + // enables watching resources in the hub cluster + hubRestCfg, err := getHubRestConfig() + if err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Spoke") + return nil, err + } + + // enables leader election in the spoke cluster + localRestCfg, err := ctrl.GetConfig() + if err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Spoke") + return nil, err + } + + spokeNamespace := os.Getenv(apiv1beta1.SpokeNamespaceEnvVar) + hubNamespace := os.Getenv(apiv1beta1.HubNamespaceEnvVar) + + mgr, err := ctrl.NewManager(hubRestCfg, ctrl.Options{ + Scheme: opts.Scheme, + Metrics: metricsserver.Options{ + BindAddress: opts.MetricsAddr, + SecureServing: opts.SecureMetrics, + TLSOpts: tlsOpts, + }, + // Configure Informer Cache to only watch the resources it needs to perform a reconcile, in specific namespaces. 
+ // There are 2 related reasons for this.
+ // 1. Limit scope of access to the hub for the spoke's controller. Since it only needs to watch 3 resources across 2 namespaces, it should not watch other namespaces.
+ // 2. OCM AddOn framework limits the scope of access that addon agents have to the hub.
+ // Only access to either the ManagedCluster namespace, or a user-specified namespace is allowed. No ClusterRoleBindings are created by the addon manager controller.
+ Cache: cache.Options{
+ ByObject: map[client.Object]cache.ByObject{
+ &apiv1beta1.Spoke{}: {
+ Namespaces: map[string]cache.Config{
+ spokeNamespace: {
+ LabelSelector: labels.Everything(), // prevent default
+ FieldSelector: fields.Everything(), // prevent default
+ Transform: func(in any) (any, error) { return in, nil }, // prevent default
+ },
+ },
+ },
+ // required to retrieve klusterlet values from configmap in the spoke namespace
+ &corev1.ConfigMap{}: {
+ Namespaces: map[string]cache.Config{
+ spokeNamespace: {
+ LabelSelector: labels.Everything(), // prevent default
+ FieldSelector: fields.Everything(), // prevent default
+ Transform: func(in any) (any, error) { return in, nil }, // prevent default
+ },
+ },
+ },
+ // required to retrieve hub from the spoke.spec.hubRef namespace
+ &apiv1beta1.Hub{}: {
+ Namespaces: map[string]cache.Config{
+ hubNamespace: {
+ LabelSelector: labels.Everything(), // prevent default
+ FieldSelector: fields.Everything(), // prevent default
+ Transform: func(in any) (any, error) { return in, nil }, // prevent default
+ },
+ },
+ },
+ },
+ },
+
+ WebhookServer: webhookServer,
+ HealthProbeBindAddress: opts.ProbeAddr,
+ LeaderElection: opts.EnableLeaderElection,
+ LeaderElectionConfig: localRestCfg, // use local restConfig. Alternatively, we can disable leader election if HA is not a concern.
+ LeaderElectionID: "9aac6663.open-cluster-management.io",
+ // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
+ // when the Manager ends. This requires the binary to immediately end when the
+ // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
+ // speeds up voluntary leader transitions as the new leader don't have to wait
+ // LeaseDuration time first.
+ //
+ // In the default scaffold provided, the program ends immediately after
+ // the manager stops, so would be fine to enable this option. However,
+ // if you are doing or is intended to do any operation such as perform cleanups
+ // after the manager stops then its usage might be unsafe.
+ // LeaderElectionReleaseOnCancel: true, + }) + if err != nil { + return nil, err + } + + if err := (&controllerv1beta1.SpokeReconciler{ + Client: mgr.GetClient(), // Uses the manager's client which has the correct scheme and hub config + Log: ctrl.Log.WithName("controllers").WithName("Spoke"), + ConcurrentReconciles: opts.SpokeConcurrentReconciles, + Scheme: mgr.GetScheme(), + ClusterType: opts.ClusterType, + }).SetupWithManagerForSpoke(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Spoke") + return nil, err + } + return mgr, nil +} + +func getHubRestConfig() (*rest.Config, error) { + hubKubeconfigPath := os.Getenv(apiv1beta1.HubKubeconfigEnvVar) + if hubKubeconfigPath == "" { + hubKubeconfigPath = apiv1beta1.HubKubeconfigFallbackPath + } + + basePath := strings.TrimSuffix(hubKubeconfigPath, "kubeconfig") + certPath := filepath.Join(basePath, "tls.crt") + keyPath := filepath.Join(basePath, "tls.key") + + hubKubeconfig, err := os.ReadFile(hubKubeconfigPath) + if err != nil { + return nil, fmt.Errorf("failed to read kubeconfig from mount: %v", err) + } + + // patch the kubeconfig with absolute paths to cert/key + hubKubeconfig = bytes.ReplaceAll(hubKubeconfig, []byte("tls.crt"), []byte(certPath)) + hubKubeconfig = bytes.ReplaceAll(hubKubeconfig, []byte("tls.key"), []byte(keyPath)) + + cfg, err := kube.RestConfigFromKubeconfig(hubKubeconfig) + if err != nil { + return nil, fmt.Errorf("failed to create REST config for hub: %v", err) + } + return cfg, nil +} diff --git a/fleetconfig-controller/config/devspace/manager.yaml b/fleetconfig-controller/config/devspace/hub/manager.yaml similarity index 89% rename from fleetconfig-controller/config/devspace/manager.yaml rename to fleetconfig-controller/config/devspace/hub/manager.yaml index 4852e273..022c18ca 100644 --- a/fleetconfig-controller/config/devspace/manager.yaml +++ b/fleetconfig-controller/config/devspace/hub/manager.yaml @@ -23,6 +23,11 @@ spec: - -- args: - while true; do sleep 30; done; + env: + - name: CONTROLLER_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace image: quay.io/open-cluster-management/fleetconfig-controller:dev imagePullPolicy: IfNotPresent ports: diff --git a/fleetconfig-controller/config/devspace/spoke/manager.yaml b/fleetconfig-controller/config/devspace/spoke/manager.yaml new file mode 100644 index 00000000..aa20e251 --- /dev/null +++ b/fleetconfig-controller/config/devspace/spoke/manager.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fleetconfig-controller-manager +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/instance: fleetconfig-controller + app.kubernetes.io/name: fleetconfig-controller + template: + metadata: + labels: + app.kubernetes.io/instance: fleetconfig-controller + app.kubernetes.io/name: fleetconfig-controller + spec: + serviceAccountName: fleetconfig-controller-manager + containers: + - name: fleetconfig-controller-manager + env: + - name: KUBERNETES_CLUSTER_DOMAIN + value: cluster.local + - name: CLUSTER_NAMESPACE + value: fleetconfig-system + - name: HUB_KUBECONFIG + value: /managed/hub-kubeconfig/kubeconfig + - name: CLUSTER_NAME + value: spoke-1 + - name: INSTALL_NAMESPACE + value: fleetconfig-system + - name: HUB_NAMESPACE + value: default + - name: CONTROLLER_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + command: + - /bin/bash + - -c + - -- + args: + - while true; do sleep 30; done; + image: quay.io/open-cluster-management/fleetconfig-controller:dev + 
imagePullPolicy: IfNotPresent + ports: + - containerPort: 9440 + name: healthz + protocol: TCP + volumeMounts: + - mountPath: /managed/hub-kubeconfig + name: hub-kubeconfig + volumes: + - name: hub-kubeconfig + secret: + defaultMode: 420 + secretName: fleetconfig-controller-manager-hub-kubeconfig \ No newline at end of file diff --git a/fleetconfig-controller/devspace.yaml b/fleetconfig-controller/devspace.yaml index 12fc7839..69e83d96 100644 --- a/fleetconfig-controller/devspace.yaml +++ b/fleetconfig-controller/devspace.yaml @@ -9,8 +9,10 @@ vars: value: "." IMAGE_REPOSITORY: quay.io/open-cluster-management/fleetconfig-controller IMAGE_TAG: latest - PORT: + HUB_PORT: value: "2344" + SPOKE_PORT: + value: "2345" PROVIDER: value: "production" # production (generic), eks, gke DEVSPACE_ENV_FILE: './hack/.versions.env' @@ -37,7 +39,7 @@ pipelines: create_deployments cert-manager kubectl apply -f ./hack/dev/cluster-issuer.yaml create_deployments fleetconfig-controller-dev - start_dev --all + start_dev fleetconfig-controller-dev-hub deploy: |- run_dependencies --all create_deployments cert-manager @@ -55,7 +57,13 @@ pipelines: build_images fleetconfig-controller-dev kubectl -n fleetconfig-system delete deployment fleetconfig-controller-manager create_deployments debug - start_dev --all + start_dev fleetconfig-controller-dev-hub + debug-spoke: |- + run_dependencies --all + build_images fleetconfig-controller-dev + kubectl -n fleetconfig-system delete deployment fleetconfig-controller-manager + create_deployments debug-spoke + start_dev fleetconfig-controller-dev-spoke images: fleetconfig-controller-dev: @@ -103,6 +111,8 @@ deployments: name: ${CONTEXT}/charts/fleetconfig-controller values: devspaceEnabled: true + fleetConfig: + enabled: ${FLEETCONFIG_ENABLED} valuesFiles: - ${CONTEXT}/charts/fleetconfig-controller/values.yaml @@ -124,7 +134,13 @@ deployments: debug: kubectl: manifests: - - ${CONTEXT}/config/devspace/ + - ${CONTEXT}/config/devspace/hub + updateImageTags: false + + debug-spoke: + kubectl: + manifests: + - ${CONTEXT}/config/devspace/spoke updateImageTags: false cert-manager: @@ -144,15 +160,45 @@ hooks: events: ["before:deploy"] dev: - fleetconfig-controller-dev: + fleetconfig-controller-dev-hub: + imageSelector: ${IMAGE_REPOSITORY} + terminal: + enabled: true + disableReplace: true + workDir: /workspace + command: ./hack/dev/devspace-start-hub.sh + ports: + - port: ${HUB_PORT} + sync: + - path: .:/workspace + excludePaths: + - '**' + - '!/api' + - '!/charts' + - '!/cmd' + - '!/config' + - '!/dependencymagnet' + - '!/internal' + - '!/pkg' + - '!/hack/dev/devspace-start-hub.sh' + - '!/hack' + - '!/go.mod' + - '!/go.sum' + - 'Makefile' + - path: ./hack/dev:/workspace + excludePaths: + - '**' + - '!devspace-start-hub.sh' + + fleetconfig-controller-dev-spoke: imageSelector: ${IMAGE_REPOSITORY} terminal: enabled: true disableReplace: true workDir: /workspace - command: ./devspace-start.sh + command: ./devspace-start-spoke.sh ports: - - port: ${PORT} + - port: ${SPOKE_PORT} sync: - path: .:/workspace excludePaths: @@ -164,8 +210,11 @@ dev: - '!/dependencymagnet' - '!/internal' - '!/pkg' - - '!/devspace-start.sh' - '!/hack' - '!/go.mod' - '!/go.sum' - 'Makefile' + - path: ./hack/dev:/workspace + excludePaths: + - '**' + - '!devspace-start-spoke.sh' \ No newline at end of file diff --git a/fleetconfig-controller/devspace-start.sh b/fleetconfig-controller/hack/dev/devspace-start-hub.sh similarity index 100% rename from fleetconfig-controller/devspace-start.sh rename to 
fleetconfig-controller/hack/dev/devspace-start-hub.sh
diff --git a/fleetconfig-controller/hack/dev/devspace-start-spoke.sh b/fleetconfig-controller/hack/dev/devspace-start-spoke.sh
new file mode 100755
index 00000000..ffb35f16
--- /dev/null
+++ b/fleetconfig-controller/hack/dev/devspace-start-spoke.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+set +e # Continue on errors
+
+COLOR_CYAN="\033[0;36m"
+COLOR_RESET="\033[0m"
+
+export CGO_ENABLED=0
+FLAGS="--cluster-type=spoke --spoke-concurrent-reconciles=1"
+RUN_CMD="go run ./cmd/main.go $FLAGS"
+DEBUG_CMD="dlv debug ./cmd/main.go --listen=0.0.0.0:2345 --api-version=2 --output /tmp/__debug_bin --headless -- $FLAGS"
+
+echo -e "${COLOR_CYAN}
+ ____ ____
+ | _ \ _____ __/ ___| _ __ __ _ ___ ___
+ | | | |/ _ \ \ / /\___ \| '_ \ / _\` |/ __/ _ \\
+ | |_| | __/\ V / ___) | |_) | (_| | (_| __/
+ |____/ \___| \_/ |____/| .__/ \__,_|\___\___|
+ |_|
+${COLOR_RESET}
+Welcome to your development container!
+This is how you can work with it:
+- Run \`${COLOR_CYAN}${RUN_CMD}${COLOR_RESET}\` to start the fleetconfig controller manager
+- ${COLOR_CYAN}Files will be synchronized${COLOR_RESET} between your local machine and this container
+
+If you wish to run the fleetconfig controller manager in debug mode with delve, run:
+ \`${COLOR_CYAN}${DEBUG_CMD}${COLOR_RESET}\`
+ Wait until the \`${COLOR_CYAN}API server listening at: [::]:2345${COLOR_RESET}\` message appears
+ Start the \"Debug (localhost:2345)\" configuration in VSCode to connect your debugger session.
+ ${COLOR_CYAN}Note:${COLOR_RESET} fleetconfig controller manager won't start until you connect with the debugger.
+ ${COLOR_CYAN}Note:${COLOR_RESET} fleetconfig controller manager will be stopped once you detach your debugger session.
+
+${COLOR_CYAN}TIP:${COLOR_RESET} hit an up arrow on your keyboard to find the commands mentioned above :)
+"
+# add useful commands to the history for convenience
+export HISTFILE=/tmp/.bash_history
+history -s "$RUN_CMD"
+history -s "$DEBUG_CMD"
+history -a
+
+# hide "I have no name!"
from the bash prompt +bash --init-file <(echo "export PS1=\"\\H:\\W\\$ \"") diff --git a/fleetconfig-controller/hack/install_crds.sh b/fleetconfig-controller/hack/install_crds.sh index 310ee497..3af9ac3d 100755 --- a/fleetconfig-controller/hack/install_crds.sh +++ b/fleetconfig-controller/hack/install_crds.sh @@ -26,3 +26,4 @@ tar -xzf "$tmp_dir/$ocm_tarball" -C "$tmp_dir" cp "$tmp_dir/$ocm_asset_dir"/cluster/v1beta1/*.crd.yaml "$chart_dir/crds" cp "$tmp_dir/$ocm_asset_dir"/cluster/v1beta2/*.crd.yaml "$chart_dir/crds" cp "$tmp_dir/$ocm_asset_dir"/cluster/v1/*.crd.yaml "$chart_dir/crds" +cp "$tmp_dir/$ocm_asset_dir"/addon/v1alpha1/*.crd.yaml "$chart_dir/crds" diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index ada0fef0..3915339a 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -2,6 +2,7 @@ package v1beta1 import ( "context" + "encoding/json" "fmt" "net/url" "os/exec" @@ -16,6 +17,7 @@ import ( "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/types" + addonv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" addonapi "open-cluster-management.io/api/client/addon/clientset/versioned" workapi "open-cluster-management.io/api/client/work/clientset/versioned" workv1 "open-cluster-management.io/api/work/v1" @@ -346,7 +348,7 @@ func handleSpokeAddons(ctx context.Context, addonC *addonapi.Clientset, spoke *v } // Enable new addons and updated addons - newEnabledAddons, err := handleAddonEnable(ctx, spoke, addonsToEnable) + newEnabledAddons, err := handleAddonEnable(ctx, spoke, addonsToEnable, addonC) // even if an error is returned, any addon which was successfully enabled is tracked, so append before returning enabledAddons = append(enabledAddons, newEnabledAddons...) if err != nil { @@ -362,7 +364,7 @@ func handleSpokeAddons(ctx context.Context, addonC *addonapi.Clientset, spoke *v return enabledAddons, nil } -func handleAddonEnable(ctx context.Context, spoke *v1beta1.Spoke, addons []v1beta1.AddOn) ([]string, error) { +func handleAddonEnable(ctx context.Context, spoke *v1beta1.Spoke, addons []v1beta1.AddOn, addonC *addonapi.Clientset) ([]string, error) { if len(addons) == 0 { return nil, nil } @@ -404,6 +406,16 @@ func handleAddonEnable(ctx context.Context, spoke *v1beta1.Spoke, addons []v1bet enableErrs = append(enableErrs, fmt.Errorf("failed to enable addon: %v, output: %s", err, string(out))) continue } + // TODO - do this natively with clusteradm once https://github.com/open-cluster-management-io/clusteradm/issues/501 is resolved. + // OR switch to using Placements strategy once https://github.com/open-cluster-management-io/ocm/pull/1123 is merged. 
+ if a.ConfigName == v1beta1.FCCAddOnName { + err = patchFCCMca(ctx, spoke.Name, addonC) + if err != nil { + enableErrs = append(enableErrs, err) + continue + } + } + enabledAddons = append(enabledAddons, a.ConfigName) logger.V(1).Info("enabled addon", "managedcluster", spoke.Name, "addon", a.ConfigName, "output", string(stdout)) } @@ -414,6 +426,44 @@ func handleAddonEnable(ctx context.Context, spoke *v1beta1.Spoke, addons []v1bet return enabledAddons, nil } +func patchFCCMca(ctx context.Context, spokeName string, addonC *addonapi.Clientset) error { + mca, err := addonC.AddonV1alpha1().ManagedClusterAddOns(spokeName).Get(ctx, v1beta1.FCCAddOnName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to configure fleetconfig-controller-manager: %v", err) + } + mca.Spec.Configs = append(mca.Spec.Configs, addonv1alpha1.AddOnConfig{ + ConfigGroupResource: addonv1alpha1.ConfigGroupResource{ + Group: addonv1alpha1.GroupName, + Resource: "addondeploymentconfigs", // TODO - no magic string + }, + ConfigReferent: addonv1alpha1.ConfigReferent{ + Name: v1beta1.FCCAddOnName, + Namespace: spokeName, + }, + }) + + patchBytes, err := json.Marshal(map[string]any{ + "spec": map[string]any{ + "configs": mca.Spec.Configs, + }, + }) + if err != nil { + return fmt.Errorf("failed to marshal patch for fleetconfig-controller-manager: %v", err) + } + _, err = addonC.AddonV1alpha1().ManagedClusterAddOns(spokeName).Patch( + ctx, + v1beta1.FCCAddOnName, + types.MergePatchType, + patchBytes, + metav1.PatchOptions{}, + ) + if err != nil { + return fmt.Errorf("failed to patch fleetconfig-controller-manager: %v", err) + + } + return nil +} + func handleAddonDisable(ctx context.Context, spoke *v1beta1.Spoke, enabledAddons []string) error { if len(enabledAddons) == 0 { return nil @@ -654,8 +704,8 @@ func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Client return false, nil } - // Success condition: no manifestWorks remaining - if len(manifestWorks.Items) == 0 { + // Success condition: only FCC manifestWork remaining + if len(manifestWorks.Items) == 1 { // TODO @arturshadnik ................. 
logger.V(1).Info("addon manifestWorks cleanup completed", "spokeName", spokeName, "remainingManifestWorks", len(manifestWorks.Items)) return true, nil } diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index 5e580792..3c7f7a43 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -18,24 +18,15 @@ package v1beta1 import ( "context" - "encoding/json" - "errors" "fmt" - "maps" - "os/exec" + "os" "reflect" "slices" - "strings" - "dario.cat/mergo" - certificatesv1 "k8s.io/api/certificates/v1" - corev1 "k8s.io/api/core/v1" kerrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" - clusterv1 "open-cluster-management.io/api/cluster/v1" - operatorv1 "open-cluster-management.io/api/operator/v1" "sigs.k8s.io/cluster-api/util/patch" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" @@ -46,18 +37,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "sigs.k8s.io/yaml" "github.com/go-logr/logr" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1alpha1" "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" - exec_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/exec" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/hash" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/kube" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/version" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/pkg/common" ) // SpokeReconciler reconciles a Spoke object @@ -66,6 +48,7 @@ type SpokeReconciler struct { Log logr.Logger Scheme *runtime.Scheme ConcurrentReconciles int + ClusterType string } // +kubebuilder:rbac:groups=fleetconfig.open-cluster-management.io,resources=spokes,verbs=get;list;watch;create;update;patch;delete @@ -108,18 +91,18 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl spoke.Status.Phase = v1beta1.Unhealthy } - // Add a finalizer if not already present, set defaults, and requeue - if !slices.Contains(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) { + // Add finalizers if not already present, set defaults, and requeue + if !slices.Contains(spoke.Finalizers, v1beta1.HubCleanupFinalizer) { setDefaults(ctx, spoke, hubMeta) - spoke.Finalizers = append(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) + spoke.Finalizers = append( + spoke.Finalizers, + v1beta1.HubCleanupPreflightFinalizer, // removed by the hub to signal to the spoke that preflight is completed + v1beta1.SpokeCleanupFinalizer, // removed by the spoke to signal to the hub that unjoin succeeded + v1beta1.HubCleanupFinalizer, // removed by the hub after post-unjoin cleanup is finished + ) return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil) } - spokeKubeconfig, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, spoke.Spec.Kubeconfig, spoke.Namespace) - if err != nil { - return ret(ctx, ctrl.Result{}, err) - } - // Handle 
deletion logic with finalizer if !spoke.DeletionTimestamp.IsZero() { if spoke.Status.Phase != v1beta1.Deleting { @@ -127,17 +110,15 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil) } - if slices.Contains(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) { - if err := r.cleanup(ctx, spoke, spokeKubeconfig, hubMeta); err != nil { + // HubCleanupFinalizer is the last finalizer to be removed + if slices.Contains(spoke.Finalizers, v1beta1.HubCleanupFinalizer) { + if err := r.cleanup(ctx, spoke, hubMeta.kubeconfig); err != nil { spoke.SetConditions(true, v1beta1.NewCondition( err.Error(), v1beta1.CleanupFailed, metav1.ConditionTrue, metav1.ConditionFalse, )) return ret(ctx, ctrl.Result{}, err) } } - spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { - return s == v1beta1.SpokeCleanupFinalizer - }) // end reconciliation return ret(ctx, ctrl.Result{}, nil) } @@ -164,7 +145,7 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl } // Handle Spoke cluster: join and/or upgrade - if err := r.handleSpoke(ctx, spoke, hubMeta, spokeKubeconfig); err != nil { + if err := r.handleSpoke(ctx, spoke, hubMeta); err != nil { logger.Error(err, "Failed to handle spoke operations") spoke.Status.Phase = v1beta1.Unhealthy } @@ -209,711 +190,8 @@ func setDefaults(ctx context.Context, spoke *v1beta1.Spoke, hubMeta hubMeta) { } } -// cleanup cleans up a Spoke and its associated resources. -func (r *SpokeReconciler) cleanup(ctx context.Context, spoke *v1beta1.Spoke, spokeKubeconfig []byte, hubMeta hubMeta) error { - logger := log.FromContext(ctx) - - clusterC, err := common.ClusterClient(hubMeta.kubeconfig) - if err != nil { - return err - } - workC, err := common.WorkClient(hubMeta.kubeconfig) - if err != nil { - return err - } - addonC, err := common.AddOnClient(hubMeta.kubeconfig) - if err != nil { - return fmt.Errorf("failed to create addon client for cleanup: %w", err) - } - - // skip clean up if the ManagedCluster resource is not found or if any manifestWorks exist - managedCluster, err := clusterC.ClusterV1().ManagedClusters().Get(ctx, spoke.Name, metav1.GetOptions{}) - if kerrs.IsNotFound(err) { - logger.Info("ManagedCluster resource not found; nothing to do") - return nil - } else if err != nil { - return fmt.Errorf("unexpected error listing managedClusters: %w", err) - } - manifestWorks, err := workC.WorkV1().ManifestWorks(managedCluster.Name).List(ctx, metav1.ListOptions{}) - if err != nil { - return fmt.Errorf("failed to list manifestWorks for managedCluster %s: %w", managedCluster.Name, err) - } - - // check that the number of manifestWorks is the same as the number of addons enabled for that spoke - if len(manifestWorks.Items) > 0 && !allOwnersAddOns(manifestWorks.Items) { - msg := fmt.Sprintf("Found manifestWorks for ManagedCluster %s; cannot unjoin spoke cluster while it has active ManifestWorks", managedCluster.Name) - logger.Info(msg) - return errors.New(msg) - } - - // remove addons only after confirming that the cluster can be unjoined - this avoids leaving dangling resources that may rely on the addon - spokeCopy := spoke.DeepCopy() - spokeCopy.Spec.AddOns = nil - if _, err := handleSpokeAddons(ctx, addonC, spokeCopy); err != nil { - spoke.SetConditions(true, v1beta1.NewCondition( - err.Error(), v1beta1.AddonsConfigured, metav1.ConditionTrue, metav1.ConditionFalse, - )) - return err - } - - if len(spoke.Status.EnabledAddons) > 0 { - // Wait for addon 
manifestWorks to be fully cleaned up before proceeding with unjoin - if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout); err != nil { - spoke.SetConditions(true, v1beta1.NewCondition( - err.Error(), v1beta1.AddonsConfigured, metav1.ConditionTrue, metav1.ConditionFalse, - )) - return fmt.Errorf("addon manifestWorks cleanup failed: %w", err) - } - spoke.SetConditions(true, v1beta1.NewCondition( - v1beta1.AddonsConfigured, v1beta1.AddonsConfigured, metav1.ConditionFalse, metav1.ConditionFalse, - )) - } - - if err := r.unjoinSpoke(ctx, spoke, spokeKubeconfig); err != nil { - return err - } - - // remove CSR - csrList := &certificatesv1.CertificateSigningRequestList{} - if err := r.List(ctx, csrList, client.HasLabels{"open-cluster-management.io/cluster-name"}); err != nil { - return err - } - for _, c := range csrList.Items { - trimmedName := csrSuffixPattern.ReplaceAllString(c.Name, "") - if trimmedName == spoke.Name { - if err := r.Delete(ctx, &c); err != nil { - return err - } - } - } - - // remove ManagedCluster - if err = clusterC.ClusterV1().ManagedClusters().Delete(ctx, spoke.Name, metav1.DeleteOptions{}); err != nil { - return client.IgnoreNotFound(err) - } - - // remove Namespace - ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: spoke.Name}} - if err := r.Delete(ctx, ns); err != nil { - return client.IgnoreNotFound(err) - } - - return nil -} - -// handleSpoke manages Spoke cluster join and upgrade operations -func (r *SpokeReconciler) handleSpoke(ctx context.Context, spoke *v1beta1.Spoke, hubMeta hubMeta, spokeKubeconfig []byte) error { - logger := log.FromContext(ctx) - logger.V(0).Info("handleSpoke", "spoke", spoke.Name) - - hub := hubMeta.hub - hubKubeconfig := hubMeta.kubeconfig - - clusterClient, err := common.ClusterClient(hubKubeconfig) - if err != nil { - return err - } - addonC, err := common.AddOnClient(hubKubeconfig) - if err != nil { - return fmt.Errorf("failed to create addon client: %w", err) - } - - // check if the spoke has already been joined to the hub - managedCluster, err := common.GetManagedCluster(ctx, clusterClient, spoke.Name) - if err != nil { - logger.Error(err, "failed to get managedCluster", "spoke", spoke.Name) - return err - } - - klusterletValues, err := r.mergeKlusterletValues(ctx, spoke) - if err != nil { - return err - } - - // attempt to join the spoke cluster if it hasn't already been joined - if managedCluster == nil { - if err := r.joinSpoke(ctx, spoke, hubMeta, klusterletValues, spokeKubeconfig); err != nil { - spoke.SetConditions(true, v1beta1.NewCondition( - err.Error(), v1beta1.SpokeJoined, metav1.ConditionFalse, metav1.ConditionTrue, - )) - return err - } - - // Accept the cluster join request - if err := acceptCluster(ctx, spoke, false); err != nil { - spoke.SetConditions(true, v1beta1.NewCondition( - err.Error(), v1beta1.SpokeJoined, metav1.ConditionFalse, metav1.ConditionTrue, - )) - return err - } - - managedCluster, err = common.GetManagedCluster(ctx, clusterClient, spoke.Name) - if err != nil { - logger.Error(err, "failed to get managedCluster after join", "spoke", spoke.Name) - return err - } - } - - // check managed clusters joined condition - jc := r.getJoinedCondition(managedCluster) - if jc == nil { - logger.V(0).Info("waiting for spoke cluster to join", "name", spoke.Name) - msg := fmt.Sprintf("ManagedClusterJoined condition not found in ManagedCluster for spoke cluster %s", spoke.Name) - spoke.SetConditions(true, v1beta1.NewCondition( - msg, v1beta1.SpokeJoined, 
metav1.ConditionFalse, metav1.ConditionTrue, - )) - // Re-accept all join requests for the spoke cluster - if err := acceptCluster(ctx, spoke, true); err != nil { - logger.Error(err, "failed to accept spoke cluster join request(s)", "spoke", spoke.Name) - } - return nil - } - - logger.V(0).Info("found join condition", "reason", jc.Reason, "status", jc.Status, "message", jc.Message) - if jc.Status != metav1.ConditionTrue { - msg := fmt.Sprintf("failed to join spoke cluster %s: %s", spoke.Name, jc.Message) - spoke.SetConditions(true, v1beta1.NewCondition( - msg, v1beta1.SpokeJoined, metav1.ConditionFalse, metav1.ConditionTrue, - )) - return errors.New(msg) - } - - // spoke cluster has joined successfully - spoke.SetConditions(true, v1beta1.NewCondition( - "Joined", v1beta1.SpokeJoined, metav1.ConditionTrue, metav1.ConditionTrue, - )) - - // Label the spoke ManagedCluster if in hub-as-spoke mode. - // This allows the 'spoke' ManagedClusterSet to omit the hub-as-spoke cluster from its list - // of spoke clusters. - if managedCluster != nil && spoke.Spec.Kubeconfig.InCluster { - if managedCluster.Labels == nil { - managedCluster.Labels = make(map[string]string) - } - managedCluster.Labels[v1beta1.LabelManagedClusterType] = v1beta1.ManagedClusterTypeHubAsSpoke - if err := common.UpdateManagedCluster(ctx, clusterClient, managedCluster); err != nil { - return err - } - logger.V(0).Info("labeled ManagedCluster as hub-as-spoke", "name", spoke.Name) - } - - // attempt an upgrade whenever the klusterlet's bundleVersion or values change - currKlusterletHash, err := hash.ComputeHash(klusterletValues) - if err != nil { - return fmt.Errorf("failed to compute hash of spoke %s klusterlet values: %w", spoke.Name, err) - } - if hub != nil && hub.Spec.ClusterManager.Source.BundleVersion != "" { - upgrade, err := r.spokeNeedsUpgrade(ctx, spoke, currKlusterletHash, hub.Spec.ClusterManager.Source, spokeKubeconfig) - if err != nil { - return fmt.Errorf("failed to check if spoke cluster needs upgrade: %w", err) - } - - if upgrade { - if err := r.upgradeSpoke(ctx, spoke, klusterletValues, hub.Spec.ClusterManager.Source, spokeKubeconfig); err != nil { - return fmt.Errorf("failed to upgrade spoke cluster %s: %w", spoke.Name, err) - } - } - } - - enabledAddons, err := handleSpokeAddons(ctx, addonC, spoke) - if err != nil { - msg := fmt.Sprintf("failed to enable addons for spoke cluster %s: %s", spoke.Name, err.Error()) - spoke.SetConditions(true, v1beta1.NewCondition( - msg, v1beta1.AddonsConfigured, metav1.ConditionFalse, metav1.ConditionTrue, - )) - return err - } - - // Update status with enabled addons and klusterlet hash - spoke.Status.EnabledAddons = enabledAddons - spoke.Status.KlusterletHash = currKlusterletHash - - return nil -} - -type tokenMeta struct { - Token string `json:"hub-token"` - HubAPIServer string `json:"hub-apiserver"` -} - -type hubMeta struct { - hub *v1beta1.Hub - kubeconfig []byte -} - -// joinSpoke joins a Spoke cluster to the Hub cluster -func (r *SpokeReconciler) joinSpoke(ctx context.Context, spoke *v1beta1.Spoke, hubMeta hubMeta, klusterletValues *v1beta1.KlusterletChartConfig, spokeKubeconfig []byte) error { - logger := log.FromContext(ctx) - logger.V(0).Info("joinSpoke", "spoke", spoke.Name) - - hub := hubMeta.hub - - if hub == nil { - return errors.New("hub not found") - } - // dont start join until the hub is ready - hubInitCond := hubMeta.hub.GetCondition(v1beta1.HubInitialized) - if hubInitCond == nil || hubInitCond.Status != metav1.ConditionTrue { - return errors.New("hub does not 
have initialized condition") - } - - tokenMeta, err := getToken(ctx, hubMeta) - if err != nil { - return fmt.Errorf("failed to get join token: %w", err) - } - - joinArgs := append([]string{ - "join", - "--cluster-name", spoke.Name, - fmt.Sprintf("--create-namespace=%t", spoke.Spec.CreateNamespace), - fmt.Sprintf("--enable-sync-labels=%t", spoke.Spec.SyncLabels), - "--hub-token", tokenMeta.Token, - "--wait=true", - // klusterlet args - "--mode", spoke.Spec.Klusterlet.Mode, - "--feature-gates", spoke.Spec.Klusterlet.FeatureGates, - fmt.Sprintf("--force-internal-endpoint-lookup=%t", spoke.Spec.Klusterlet.ForceInternalEndpointLookup), - fmt.Sprintf("--singleton=%t", spoke.Spec.Klusterlet.Singleton), - // source args - "--bundle-version", hub.Spec.ClusterManager.Source.BundleVersion, - "--image-registry", hub.Spec.ClusterManager.Source.Registry, - }, spoke.BaseArgs()...) - - for k, v := range spoke.Spec.Klusterlet.Annotations { - joinArgs = append(joinArgs, fmt.Sprintf("--klusterlet-annotation=%s=%s", k, v)) - } - - // resources args - joinArgs = append(joinArgs, args.PrepareResources(spoke.Spec.Klusterlet.Resources)...) - - // Use hub API server from spec if provided and not forced to use internal endpoint, - // otherwise fall back to the hub API server from the tokenMeta - if hub.Spec.APIServer != "" && !spoke.Spec.Klusterlet.ForceInternalEndpointLookup { - joinArgs = append(joinArgs, "--hub-apiserver", hub.Spec.APIServer) - } else if tokenMeta.HubAPIServer != "" { - joinArgs = append(joinArgs, "--hub-apiserver", tokenMeta.HubAPIServer) - } - - if hub.Spec.Ca != "" { - caFile, caCleanup, err := file.TmpFile([]byte(hub.Spec.Ca), "ca") - if caCleanup != nil { - defer caCleanup() - } - if err != nil { - return fmt.Errorf("failed to write hub CA to disk: %w", err) - } - joinArgs = append([]string{fmt.Sprintf("--ca-file=%s", caFile)}, joinArgs...) - } - - ra := hub.Spec.RegistrationAuth - if ra.Driver == v1alpha1.AWSIRSARegistrationDriver { - raArgs := []string{ - fmt.Sprintf("--registration-auth=%s", ra.Driver), - } - if ra.HubClusterARN != "" { - raArgs = append(raArgs, fmt.Sprintf("--hub-cluster-arn=%s", ra.HubClusterARN)) - } - if spoke.Spec.ClusterARN != "" { - raArgs = append(raArgs, fmt.Sprintf("--managed-cluster-arn=%s", spoke.Spec.ClusterARN)) - } - - joinArgs = append(joinArgs, raArgs...) 
- } - - if spoke.Spec.Klusterlet.Mode == string(operatorv1.InstallModeHosted) { - joinArgs = append(joinArgs, - fmt.Sprintf("--force-internal-endpoint-lookup-managed=%t", spoke.Spec.Klusterlet.ForceInternalEndpointLookupManaged), - ) - raw, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, spoke.Spec.Klusterlet.ManagedClusterKubeconfig, spoke.Namespace) - if err != nil { - return err - } - mgdKcfg, mgdKcfgCleanup, err := file.TmpFile(raw, "kubeconfig") - if mgdKcfgCleanup != nil { - defer mgdKcfgCleanup() - } - if err != nil { - return fmt.Errorf("failed to write managedClusterKubeconfig to disk: %w", err) - } - joinArgs = append(joinArgs, "--managed-cluster-kubeconfig", mgdKcfg) - } - - if spoke.Spec.ProxyCa != "" { - proxyCaFile, proxyCaCleanup, err := file.TmpFile([]byte(spoke.Spec.ProxyCa), "proxy-ca") - if proxyCaCleanup != nil { - defer proxyCaCleanup() - } - if err != nil { - return fmt.Errorf("failed to write proxy CA to disk: %w", err) - } - joinArgs = append(joinArgs, fmt.Sprintf("--proxy-ca-file=%s", proxyCaFile)) - } - if spoke.Spec.ProxyURL != "" { - joinArgs = append(joinArgs, fmt.Sprintf("--proxy-url=%s", spoke.Spec.ProxyURL)) - } - - valuesArgs, valuesCleanup, err := prepareKlusterletValuesFile(klusterletValues) - if valuesCleanup != nil { - defer valuesCleanup() - } - if err != nil { - return err - } - joinArgs = append(joinArgs, valuesArgs...) - - joinArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, joinArgs) - if cleanupKcfg != nil { - defer cleanupKcfg() - } - if err != nil { - return err - } - - logger.V(1).Info("clusteradm join", "args", joinArgs) - - cmd := exec.Command(clusteradm, joinArgs...) - stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm join' to complete for spoke %s...", spoke.Name)) - if err != nil { - out := append(stdout, stderr...) - return fmt.Errorf("clusteradm join command failed for spoke %s: %v, output: %s", spoke.Name, err, string(out)) - } - logger.V(1).Info("successfully requested spoke cluster join", "output", string(stdout)) - - return nil -} - -// acceptCluster accepts a Spoke cluster's join request -func acceptCluster(ctx context.Context, spoke *v1beta1.Spoke, skipApproveCheck bool) error { - logger := log.FromContext(ctx) - logger.V(0).Info("acceptCluster", "spoke", spoke.Name) - - acceptArgs := append([]string{ - "accept", "--cluster", spoke.Name, - }, spoke.BaseArgs()...) - - logger.V(1).Info("clusteradm accept", "args", acceptArgs) - - // TODO: handle other args: - // --requesters=[]: - // Common Names of agents to be approved. - - if skipApproveCheck { - acceptArgs = append(acceptArgs, "--skip-approve-check") - } - - cmd := exec.Command(clusteradm, acceptArgs...) - stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm accept' to complete for spoke %s...", spoke.Name)) - if err != nil { - out := append(stdout, stderr...) 
- return fmt.Errorf("failed to accept spoke cluster join request: %v, output: %s", err, string(out)) - } - logger.V(1).Info("spoke cluster join request accepted", "output", string(stdout)) - - return nil -} - -// getJoinedCondition gets the joined condition from a managed cluster -func (r *SpokeReconciler) getJoinedCondition(managedCluster *clusterv1.ManagedCluster) *metav1.Condition { - if managedCluster == nil || managedCluster.Status.Conditions == nil { - return nil - } - - for _, c := range managedCluster.Status.Conditions { - if c.Type == "ManagedClusterJoined" { - return &c - } - } - - return nil -} - -// spokeNeedsUpgrade checks if the klusterlet on a Spoke cluster requires an upgrade -func (r *SpokeReconciler) spokeNeedsUpgrade(ctx context.Context, spoke *v1beta1.Spoke, currKlusterletHash string, source v1beta1.OCMSource, spokeKubeconfig []byte) (bool, error) { - logger := log.FromContext(ctx) - logger.V(0).Info("spokeNeedsUpgrade", "spokeClusterName", spoke.Name) - - hashChanged := spoke.Status.KlusterletHash != currKlusterletHash - logger.V(2).Info("comparing klusterlet values hash", - "spoke", spoke.Name, - "prevHash", spoke.Status.KlusterletHash, - "currHash", currKlusterletHash, - ) - if hashChanged { - return true, nil - } - - if source.BundleVersion == "default" { - logger.V(0).Info("klusterlet bundleVersion is default, skipping upgrade") - return false, nil - } - if source.BundleVersion == "latest" { - logger.V(0).Info("klusterlet bundleVersion is latest, attempting upgrade") - return true, nil - } - - operatorC, err := common.OperatorClient(spokeKubeconfig) - if err != nil { - return false, err - } - - k, err := operatorC.OperatorV1().Klusterlets().Get(ctx, "klusterlet", metav1.GetOptions{}) - if err != nil { - return false, fmt.Errorf("failed to get klusterlet: %w", err) - } - - // identify lowest bundleVersion referenced in the klusterlet spec - bundleSpecs := make([]string, 0) - if k.Spec.ImagePullSpec != "" { - bundleSpecs = append(bundleSpecs, k.Spec.ImagePullSpec) - } - if k.Spec.RegistrationImagePullSpec != "" { - bundleSpecs = append(bundleSpecs, k.Spec.RegistrationImagePullSpec) - } - if k.Spec.WorkImagePullSpec != "" { - bundleSpecs = append(bundleSpecs, k.Spec.WorkImagePullSpec) - } - activeBundleVersion, err := version.LowestBundleVersion(ctx, bundleSpecs) - if err != nil { - return false, fmt.Errorf("failed to detect bundleVersion from klusterlet spec: %w", err) - } - desiredBundleVersion, err := version.Normalize(source.BundleVersion) - if err != nil { - return false, err - } - - logger.V(0).Info("found klusterlet bundleVersions", - "activeBundleVersion", activeBundleVersion, - "desiredBundleVersion", desiredBundleVersion, - ) - return activeBundleVersion != desiredBundleVersion, nil -} - -// upgradeSpoke upgrades the Spoke cluster's klusterlet -func (r *SpokeReconciler) upgradeSpoke(ctx context.Context, spoke *v1beta1.Spoke, klusterletValues *v1beta1.KlusterletChartConfig, source v1beta1.OCMSource, spokeKubeconfig []byte) error { - logger := log.FromContext(ctx) - logger.V(0).Info("upgradeSpoke", "spoke", spoke.Name) - - upgradeArgs := append([]string{ - "upgrade", "klusterlet", - "--bundle-version", source.BundleVersion, - "--image-registry", source.Registry, - "--wait=true", - }, spoke.BaseArgs()...) - - valuesArgs, valuesCleanup, err := prepareKlusterletValuesFile(klusterletValues) - if valuesCleanup != nil { - defer valuesCleanup() - } - if err != nil { - return err - } - upgradeArgs = append(upgradeArgs, valuesArgs...) 
- - upgradeArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, upgradeArgs) - if cleanupKcfg != nil { - defer cleanupKcfg() - } - if err != nil { - return err - } - - logger.V(1).Info("clusteradm upgrade klusterlet", "args", upgradeArgs) - - cmd := exec.Command(clusteradm, upgradeArgs...) - stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm upgrade klusterlet' to complete for spoke %s...", spoke.Name)) - if err != nil { - out := append(stdout, stderr...) - return fmt.Errorf( - "failed to upgrade klusterlet on spoke cluster %s to %s: %v, output: %s", - spoke.Name, source.BundleVersion, err, string(out), - ) - } - logger.V(1).Info("klusterlet upgraded", "output", string(stdout)) - - return nil -} - -// unjoinSpoke unjoins a spoke from the hub -func (r *SpokeReconciler) unjoinSpoke(ctx context.Context, spoke *v1beta1.Spoke, spokeKubeconfig []byte) error { - logger := log.FromContext(ctx) - logger.V(0).Info("unjoinSpoke", "spoke", spoke.Name) - - unjoinArgs := append([]string{ - "unjoin", - "--cluster-name", spoke.GetName(), - fmt.Sprintf("--purge-operator=%t", spoke.Spec.Klusterlet.PurgeOperator), - }, spoke.BaseArgs()...) - - unjoinArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, unjoinArgs) - if cleanupKcfg != nil { - defer cleanupKcfg() - } - if err != nil { - return fmt.Errorf("failed to unjoin spoke cluster %s: %w", spoke.GetName(), err) - } - - logger.V(1).Info("clusteradm unjoin", "args", unjoinArgs) - - cmd := exec.Command(clusteradm, unjoinArgs...) - stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm unjoin' to complete for spoke %s...", spoke.GetName())) - out := append(stdout, stderr...) - if err != nil || strings.Contains(string(out), amwExistsError) { - return fmt.Errorf("failed to unjoin spoke cluster %s: %v, output: %s", spoke.GetName(), err, string(out)) - } - logger.V(1).Info("spoke cluster unjoined", "output", string(stdout)) - - return nil -} - -// getToken gets a join token from the Hub cluster via 'clusteradm get token' -func getToken(ctx context.Context, hubMeta hubMeta) (*tokenMeta, error) { - logger := log.FromContext(ctx) - logger.V(0).Info("getToken") - - tokenArgs := append([]string{ - "get", "token", "--output=json", - }, hubMeta.hub.BaseArgs()...) - - if hubMeta.hub.Spec.ClusterManager != nil { - tokenArgs = append(tokenArgs, fmt.Sprintf("--use-bootstrap-token=%t", hubMeta.hub.Spec.ClusterManager.UseBootstrapToken)) - } - tokenArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, hubMeta.kubeconfig, hubMeta.hub.Spec.Kubeconfig.Context, tokenArgs) - if cleanupKcfg != nil { - defer cleanupKcfg() - } - if err != nil { - return nil, fmt.Errorf("failed to prepare kubeconfig: %w", err) - } - - logger.V(1).Info("clusteradm get token", "args", tokenArgs) - - cmd := exec.Command(clusteradm, tokenArgs...) - stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm get token' to complete...") - if err != nil { - out := append(stdout, stderr...) 
- return nil, fmt.Errorf("failed to get join token: %v, output: %s", err, string(out)) - } - logger.V(1).Info("got join token", "output", string(stdout)) - - tokenMeta := &tokenMeta{} - if err := json.Unmarshal(stdout, &tokenMeta); err != nil { - return nil, fmt.Errorf("failed to unmarshal join token: %w", err) - } - return tokenMeta, nil -} - -func (r *SpokeReconciler) getHubMeta(ctx context.Context, hubRef v1beta1.HubRef) (hubMeta, error) { - hub := &v1beta1.Hub{} - hubMeta := hubMeta{} - nn := types.NamespacedName{Name: hubRef.Name, Namespace: hubRef.Namespace} - - // get Hub using local client - err := r.Get(ctx, nn, hub) - if err != nil { - return hubMeta, client.IgnoreNotFound(err) - } - hubMeta.hub = hub - // if found, load the hub's kubeconfig - hubKubeconfig, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, hub.Spec.Kubeconfig, hub.Namespace) - if err != nil { - return hubMeta, err - } - hubMeta.kubeconfig = hubKubeconfig - return hubMeta, nil -} - -func (r *SpokeReconciler) mergeKlusterletValues(ctx context.Context, spoke *v1beta1.Spoke) (*v1beta1.KlusterletChartConfig, error) { - logger := log.FromContext(ctx) - - if spoke.Spec.Klusterlet.ValuesFrom == nil && spoke.Spec.Klusterlet.Values == nil { - logger.V(3).Info("no values or valuesFrom provided. Using default klusterlet chart values", "spoke", spoke.Name) - return nil, nil - } - - var fromInterface = map[string]any{} - var specInterface = map[string]any{} - - if spoke.Spec.Klusterlet.ValuesFrom != nil { - cm := &corev1.ConfigMap{} - nn := types.NamespacedName{Name: spoke.Spec.Klusterlet.ValuesFrom.Name, Namespace: spoke.Namespace} - err := r.Get(ctx, nn, cm) - if err != nil { - if kerrs.IsNotFound(err) { - // cm not found, return spec's values - logger.V(1).Info("warning: Klusterlet values ConfigMap not found", "spoke", spoke.Name, "configMap", nn) - return spoke.Spec.Klusterlet.Values, nil - } - return nil, fmt.Errorf("failed to retrieve Klusterlet values ConfigMap %s: %w", nn, err) - } - fromValues, ok := cm.Data[spoke.Spec.Klusterlet.ValuesFrom.Key] - if !ok { - logger.V(1).Info("warning: Klusterlet values ConfigMap not found", "spoke", spoke.Name, "configMap", nn, "key", spoke.Spec.Klusterlet.ValuesFrom.Key) - return spoke.Spec.Klusterlet.Values, nil - } - fromBytes := []byte(fromValues) - err = yaml.Unmarshal(fromBytes, &fromInterface) - if err != nil { - return nil, fmt.Errorf("failed to unmarshal YAML values from ConfigMap %s key %s: %w", nn, spoke.Spec.Klusterlet.ValuesFrom.Key, err) - } - } - - if spoke.Spec.Klusterlet.Values != nil { - specBytes, err := yaml.Marshal(spoke.Spec.Klusterlet.Values) - if err != nil { - return nil, fmt.Errorf("failed to marshal Klusterlet values from spoke spec for spoke %s: %w", spoke.Name, err) - } - err = yaml.Unmarshal(specBytes, &specInterface) - if err != nil { - return nil, fmt.Errorf("failed to unmarshal Klusterlet values from spoke spec for spoke %s: %w", spoke.Name, err) - } - } - - mergedMap := map[string]any{} - maps.Copy(mergedMap, fromInterface) - - // Merge spec on top but ignore zero-values from spec - if err := mergo.Map(&mergedMap, specInterface, mergo.WithOverride); err != nil { - return nil, fmt.Errorf("merge failed for spoke %s: %w", spoke.Name, err) - } - - mergedBytes, err := yaml.Marshal(mergedMap) - if err != nil { - return nil, fmt.Errorf("failed to marshal merged Klusterlet values for spoke %s: %w", spoke.Name, err) - } - - merged := &v1beta1.KlusterletChartConfig{} - err = yaml.Unmarshal(mergedBytes, merged) - if err != nil { - return nil, 
fmt.Errorf("failed to unmarshal merged values into KlusterletChartConfig for spoke %s: %w", spoke.Name, err) - } - - return merged, nil - -} - -// prepareKlusterletValuesFile creates a temporary file with klusterlet values and returns -// args to append and a cleanup function. Returns empty slice if values are empty. -func prepareKlusterletValuesFile(values *v1beta1.KlusterletChartConfig) ([]string, func(), error) { - if values == nil { - return nil, nil, nil - } - - if values.IsEmpty() { - return nil, nil, nil - } - valuesYAML, err := yaml.Marshal(values) - if err != nil { - return nil, nil, fmt.Errorf("failed to marshal klusterlet values to YAML: %w", err) - } - valuesFile, valuesCleanup, err := file.TmpFile(valuesYAML, "klusterlet-values") - if err != nil { - return nil, nil, fmt.Errorf("failed to write klusterlet values to disk: %w", err) - } - return []string{"--klusterlet-values-file", valuesFile}, valuesCleanup, nil -} - -// SetupWithManager sets up the controller with the Manager. -func (r *SpokeReconciler) SetupWithManager(mgr ctrl.Manager) error { +// SetupWithManagerForHub sets up the controller with the Manager to run on a Hub cluster. +func (r *SpokeReconciler) SetupWithManagerForHub(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&v1beta1.Spoke{}). WithOptions(controller.Options{ @@ -951,6 +229,24 @@ func (r *SpokeReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(r) } +// SetupWithManagerForSpoke sets up the controller with the Manager to run on a Spoke cluster. +func (r *SpokeReconciler) SetupWithManagerForSpoke(mgr ctrl.Manager) error { + spokeName := os.Getenv(v1beta1.SpokeNameEnvVar) + return ctrl.NewControllerManagedBy(mgr). + For(&v1beta1.Spoke{}, + builder.WithPredicates(predicate.NewPredicateFuncs( + func(object client.Object) bool { + return object.GetName() == spokeName // no need to include namespace in the predicate, since the manager cache is configured to only watch 1 namespace + }, + )), + ). + WithOptions(controller.Options{ + MaxConcurrentReconciles: r.ConcurrentReconciles, + }). + Named("spoke"). 
+ Complete(r) +} + // sharedFieldsChanged checks whether the spec fields that are shared between Hub and Spokes were updated, // to prevent unnecessary reconciles of Spokes func sharedFieldsChanged(oldSpec, newSpec *v1beta1.HubSpec) bool { diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go new file mode 100644 index 00000000..2242e97c --- /dev/null +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -0,0 +1,981 @@ +package v1beta1 + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "maps" + "os" + "os/exec" + "slices" + "strings" + + "dario.cat/mergo" + certificatesv1 "k8s.io/api/certificates/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + kerrs "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + addonv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" + clusterv1 "open-cluster-management.io/api/cluster/v1" + operatorv1 "open-cluster-management.io/api/operator/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/yaml" + + "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1alpha1" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" + exec_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/exec" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/hash" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/kube" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/version" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/pkg/common" +) + +// cleanup cleans up a Spoke and its associated resources. +func (r *SpokeReconciler) cleanup(ctx context.Context, spoke *v1beta1.Spoke, hubKubeconfig []byte) error { + switch r.ClusterType { + case v1beta1.ClusterTypeHub: + err := r.doHubCleanup(ctx, spoke, hubKubeconfig) + if err != nil { + return err + } + if spoke.IsHubAsSpoke() { + err = r.doSpokeCleanup(ctx, spoke) + if err != nil { + return err + } + } + return nil + case v1beta1.ClusterTypeSpoke: + return r.doSpokeCleanup(ctx, spoke) + default: + // this is guarded against when the manager is initialized. should never reach this point + panic(fmt.Sprintf("unknown cluster type %s. Must be one of %v", r.ClusterType, v1beta1.SupportedClusterTypes)) + } +} + +// handleSpoke manages Spoke cluster join and upgrade operations +func (r *SpokeReconciler) handleSpoke(ctx context.Context, spoke *v1beta1.Spoke, hubMeta hubMeta) error { + klusterletValues, err := r.mergeKlusterletValues(ctx, spoke) + if err != nil { + return err + } + switch r.ClusterType { + case v1beta1.ClusterTypeHub: + err = r.doHubWork(ctx, spoke, hubMeta, klusterletValues) + if err != nil { + return err + } + if spoke.IsHubAsSpoke() { // hub-as-spoke + err = r.doSpokeWork(ctx, spoke, hubMeta.hub, klusterletValues) + if err != nil { + return err + } + } + return nil + case v1beta1.ClusterTypeSpoke: + return r.doSpokeWork(ctx, spoke, hubMeta.hub, klusterletValues) + default: + // this is guarded against when the manager is initialized. should never reach this point + panic(fmt.Sprintf("unknown cluster type %s. 
Must be one of %v", r.ClusterType, v1beta1.SupportedClusterTypes)) + } +} + +func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, hubMeta hubMeta, klusterletValues *v1beta1.KlusterletChartConfig) error { + logger := log.FromContext(ctx) + logger.V(0).Info("handleSpoke", "spoke", spoke.Name) + + hubKubeconfig := hubMeta.kubeconfig + + clusterClient, err := common.ClusterClient(hubKubeconfig) + if err != nil { + return err + } + addonC, err := common.AddOnClient(hubKubeconfig) + if err != nil { + return fmt.Errorf("failed to create addon client: %w", err) + } + + // check if the spoke has already been joined to the hub + managedCluster, err := common.GetManagedCluster(ctx, clusterClient, spoke.Name) + if err != nil { + logger.Error(err, "failed to get managedCluster", "spoke", spoke.Name) + return err + } + + // attempt to join the spoke cluster if it hasn't already been joined + if managedCluster == nil { + spokeKubeconfig, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, spoke.Spec.Kubeconfig, spoke.Namespace) + if err != nil { + return fmt.Errorf("failed to load spoke kubeconfig: %v", err) + } + if err := r.joinSpoke(ctx, spoke, hubMeta, klusterletValues, spokeKubeconfig); err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.SpokeJoined, metav1.ConditionFalse, metav1.ConditionTrue, + )) + return err + } + + // Accept the cluster join request + if err := acceptCluster(ctx, spoke, false); err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.SpokeJoined, metav1.ConditionFalse, metav1.ConditionTrue, + )) + return err + } + + managedCluster, err = common.GetManagedCluster(ctx, clusterClient, spoke.Name) + if err != nil { + logger.Error(err, "failed to get managedCluster after join", "spoke", spoke.Name) + return err + } + } + + // check managed clusters joined condition + jc := r.getJoinedCondition(managedCluster) + if jc == nil { + logger.V(0).Info("waiting for spoke cluster to join", "name", spoke.Name) + msg := fmt.Sprintf("ManagedClusterJoined condition not found in ManagedCluster for spoke cluster %s", spoke.Name) + spoke.SetConditions(true, v1beta1.NewCondition( + msg, v1beta1.SpokeJoined, metav1.ConditionFalse, metav1.ConditionTrue, + )) + // Re-accept all join requests for the spoke cluster + if err := acceptCluster(ctx, spoke, true); err != nil { + logger.Error(err, "failed to accept spoke cluster join request(s)", "spoke", spoke.Name) + } + return nil + } + + logger.V(0).Info("found join condition", "reason", jc.Reason, "status", jc.Status, "message", jc.Message) + if jc.Status != metav1.ConditionTrue { + msg := fmt.Sprintf("failed to join spoke cluster %s: %s", spoke.Name, jc.Message) + spoke.SetConditions(true, v1beta1.NewCondition( + msg, v1beta1.SpokeJoined, metav1.ConditionFalse, metav1.ConditionTrue, + )) + return errors.New(msg) + } + + // spoke cluster has joined successfully + spoke.SetConditions(true, v1beta1.NewCondition( + "Joined", v1beta1.SpokeJoined, metav1.ConditionTrue, metav1.ConditionTrue, + )) + + // Label the spoke ManagedCluster if in hub-as-spoke mode. + // This allows the 'spoke' ManagedClusterSet to omit the hub-as-spoke cluster from its list + // of spoke clusters. 
+ if managedCluster != nil && spoke.IsHubAsSpoke() { + if managedCluster.Labels == nil { + managedCluster.Labels = make(map[string]string) + } + managedCluster.Labels[v1beta1.LabelManagedClusterType] = v1beta1.ManagedClusterTypeHubAsSpoke + if err := common.UpdateManagedCluster(ctx, clusterClient, managedCluster); err != nil { + return err + } + logger.V(0).Info("labeled ManagedCluster as hub-as-spoke", "name", spoke.Name) + } + + if !spoke.IsHubAsSpoke() { + adc := &addonv1alpha1.AddOnDeploymentConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "fleetconfig-controller-manager", + Namespace: spoke.Name, + }, + Spec: addonv1alpha1.AddOnDeploymentConfigSpec{ + AgentInstallNamespace: os.Getenv(v1beta1.ControllerNamespaceEnvVar), + CustomizedVariables: []addonv1alpha1.CustomizedVariable{ + { + Name: v1beta1.HubNamespaceEnvVar, + Value: spoke.Spec.HubRef.Namespace, + }, + { + Name: v1beta1.SpokeNamespaceEnvVar, + Value: spoke.Namespace, + }, + }, + }, + } + _, err = addonC.AddonV1alpha1().AddOnDeploymentConfigs(spoke.Name).Create(ctx, adc, metav1.CreateOptions{}) + if err != nil && !kerrs.IsAlreadyExists(err) { + return err + } + + err = r.bindAddonAgent(ctx, spoke) + if err != nil { + return err + } + } + + enabledAddons, err := handleSpokeAddons(ctx, addonC, spoke) + if err != nil { + msg := fmt.Sprintf("failed to enable addons for spoke cluster %s: %s", spoke.Name, err.Error()) + spoke.SetConditions(true, v1beta1.NewCondition( + msg, v1beta1.AddonsConfigured, metav1.ConditionFalse, metav1.ConditionTrue, + )) + return err + } + spoke.Status.EnabledAddons = enabledAddons + return nil +} + +func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spoke) error { + roleRef := rbacv1.RoleRef{ + Kind: "ClusterRole", + APIGroup: rbacv1.GroupName, + Name: "fleetconfig-controller-manager-role", // TODO @arturshadnik get this some other way + } + + err := r.doBind(ctx, roleRef, spoke.Namespace, spoke.Name) + if err != nil { + return err + } + if spoke.Spec.HubRef.Namespace != spoke.Namespace { + err = r.doBind(ctx, roleRef, spoke.Spec.HubRef.Namespace, spoke.Name) + if err != nil { + return err + } + } + return nil +} + +func (r *SpokeReconciler) doBind(ctx context.Context, roleRef rbacv1.RoleRef, namespace, spokeName string) error { + binding := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("open-cluster-management:%s:%s:agent-%s", + v1beta1.FCCAddOnName, strings.ToLower(roleRef.Kind), spokeName), + Namespace: namespace, + Labels: map[string]string{ + addonv1alpha1.AddonLabelKey: v1beta1.FCCAddOnName, + }, + }, + RoleRef: roleRef, + Subjects: []rbacv1.Subject{ + { + Kind: rbacv1.GroupKind, + APIGroup: rbacv1.GroupName, + Name: clusterAddonGroup(spokeName, v1beta1.FCCAddOnName), + }, + }, + } + + err := r.Create(ctx, binding, &client.CreateOptions{}) + if err != nil { + return client.IgnoreAlreadyExists(err) + } + return nil +} + +// clusterAddonGroup returns the group that represents the addon for the cluster +// ref: https://github.com/open-cluster-management-io/ocm/blob/main/pkg/addon/templateagent/registration.go#L484 +func clusterAddonGroup(clusterName, addonName string) string { + return fmt.Sprintf("system:open-cluster-management:cluster:%s:addon:%s", clusterName, addonName) +} + +func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, hub *v1beta1.Hub, klusterletValues *v1beta1.KlusterletChartConfig) error { + logger := log.FromContext(ctx) + logger.V(0).Info("handleSpoke", "spoke", spoke.Name) + + spokeKubeconfig, err := 
kube.RawFromInClusterRestConfig() + if err != nil { + return fmt.Errorf("failed to load kubeconfig from inCluster: %v", err) + } + // attempt an upgrade whenever the klusterlet's bundleVersion or values change + currKlusterletHash, err := hash.ComputeHash(klusterletValues) + if err != nil { + return fmt.Errorf("failed to compute hash of spoke %s klusterlet values: %w", spoke.Name, err) + } + if hub != nil && hub.Spec.ClusterManager.Source.BundleVersion != "" { + upgrade, err := r.spokeNeedsUpgrade(ctx, spoke, currKlusterletHash, hub.Spec.ClusterManager.Source, spokeKubeconfig) + if err != nil { + return fmt.Errorf("failed to check if spoke cluster needs upgrade: %w", err) + } + + if upgrade { + if err := r.upgradeSpoke(ctx, spoke, klusterletValues, hub.Spec.ClusterManager.Source, spokeKubeconfig); err != nil { + return fmt.Errorf("failed to upgrade spoke cluster %s: %w", spoke.Name, err) + } + } + } + spoke.Status.KlusterletHash = currKlusterletHash + + return nil +} + +func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke, hubKubeconfig []byte) error { + logger := log.FromContext(ctx) + clusterC, err := common.ClusterClient(hubKubeconfig) + if err != nil { + return err + } + workC, err := common.WorkClient(hubKubeconfig) + if err != nil { + return err + } + addonC, err := common.AddOnClient(hubKubeconfig) + if err != nil { + return fmt.Errorf("failed to create addon client for cleanup: %w", err) + } + + // skip clean up if the ManagedCluster resource is not found or if any manifestWorks exist + managedCluster, err := clusterC.ClusterV1().ManagedClusters().Get(ctx, spoke.Name, metav1.GetOptions{}) + if kerrs.IsNotFound(err) { + logger.Info("ManagedCluster resource not found; nothing to do") + return nil + } else if err != nil { + return fmt.Errorf("unexpected error listing managedClusters: %w", err) + } + manifestWorks, err := workC.WorkV1().ManifestWorks(managedCluster.Name).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to list manifestWorks for managedCluster %s: %w", managedCluster.Name, err) + } + + // check that the number of manifestWorks is the same as the number of addons enabled for that spoke + if len(manifestWorks.Items) > 0 && !allOwnersAddOns(manifestWorks.Items) { + msg := fmt.Sprintf("Found manifestWorks for ManagedCluster %s; cannot unjoin spoke cluster while it has active ManifestWorks", managedCluster.Name) + logger.Info(msg) + return errors.New(msg) + } + + // remove addons only after confirming that the cluster can be unjoined - this avoids leaving dangling resources that may rely on the addon + spokeCopy := spoke.DeepCopy() + spokeCopy.Spec.AddOns = nil + if !spoke.IsHubAsSpoke() { + spokeCopy.Spec.AddOns = append(spokeCopy.Spec.AddOns, v1beta1.AddOn{ConfigName: "fleetconfig-controller-manager"}) // disable all except fcc + } + if _, err := handleSpokeAddons(ctx, addonC, spokeCopy); err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.AddonsConfigured, metav1.ConditionTrue, metav1.ConditionFalse, + )) + return err + } + + if len(spoke.Status.EnabledAddons) > 0 { + // Wait for addon manifestWorks to be fully cleaned up before proceeding with unjoin + if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout); err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.AddonsConfigured, metav1.ConditionTrue, metav1.ConditionFalse, + )) + return fmt.Errorf("addon manifestWorks cleanup failed: %w", err) + } + 
spoke.SetConditions(true, v1beta1.NewCondition( + v1beta1.AddonsConfigured, v1beta1.AddonsConfigured, metav1.ConditionFalse, metav1.ConditionFalse, + )) + } + + // remove preflight cleanup finalizer - this lets the spoke's controller know to proceed with unjoin. + spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { + return s == v1beta1.HubCleanupPreflightFinalizer + }) + + // requeue until unjoin is complete by the spoke's controller + if slices.Contains(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) { + return nil + } + + csrList := &certificatesv1.CertificateSigningRequestList{} + if err := r.List(ctx, csrList, client.HasLabels{"open-cluster-management.io/cluster-name"}); err != nil { + return err + } + for _, c := range csrList.Items { + trimmedName := csrSuffixPattern.ReplaceAllString(c.Name, "") + if trimmedName == spoke.Name { + if err := r.Delete(ctx, &c); err != nil { + return err + } + } + } + + // at this point, klusterlet-work-agent is uninstalled, so nothing can remove this finalizer. all resources are cleaned up by the spoke's controller, so to prevent a dangling mw/namespace, we remove the finalizer manually + mw, err := workC.WorkV1().ManifestWorks(spoke.Name).Get(ctx, "addon-fleetconfig-controller-manager-deploy-0", metav1.GetOptions{}) // TODO @arturshadnik - use a label maybe + if err != nil && !kerrs.IsNotFound(err) { + return err + } + mw.Finalizers = slices.DeleteFunc(mw.Finalizers, func(s string) bool { + return s == "cluster.open-cluster-management.io/manifest-work-cleanup" + }) + + patchBytes, err := json.Marshal(map[string]any{ + "metadata": map[string]any{ + "finalizers": mw.Finalizers, + }, + }) + if err != nil { + return err + } + + _, err = workC.WorkV1().ManifestWorks(spoke.Name).Patch( + ctx, + "addon-fleetconfig-controller-manager-deploy-0", + types.MergePatchType, + patchBytes, + metav1.PatchOptions{}, + ) + if err != nil && !kerrs.IsNotFound(err) { + return err + } + + // remove ManagedCluster + err = clusterC.ClusterV1().ManagedClusters().Delete(ctx, spoke.Name, metav1.DeleteOptions{}) + if err != nil && !kerrs.IsNotFound(err) { + return err + } + // remove Namespace + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: spoke.Name}} + err = r.Delete(ctx, ns) + if err != nil && !kerrs.IsNotFound(err) { + return err + } + + spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { + return s == v1beta1.HubCleanupFinalizer + }) + + return nil +} + +func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spoke) error { + // requeue until preflight is complete by the hub's controller + if slices.Contains(spoke.Finalizers, v1beta1.HubCleanupPreflightFinalizer) { + return nil + } + + spokeKubeconfig, err := kube.RawFromInClusterRestConfig() + if err != nil { + return err + } + err = r.unjoinSpoke(ctx, spoke, spokeKubeconfig) + if err != nil { + return err + } + spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { + return s == v1beta1.SpokeCleanupFinalizer + }) + + // hub-as-spoke case, no further cleanup needed + if r.ClusterType == v1beta1.ClusterTypeHub { + return nil + } + + // "self-destruct" any remaining namespaces/resources + operatorClient, err := common.OperatorClient(spokeKubeconfig) + if err != nil { + return err + } + if err := operatorClient.OperatorV1().Klusterlets().Delete(ctx, "klusterlet", metav1.DeleteOptions{}); err != nil && !kerrs.IsNotFound(err) { + return err + } + + namespacesToDelete := []string{ + "open-cluster-management-agent", + 
"open-cluster-management-agent-addon", + "open-cluster-management", + os.Getenv(v1beta1.ControllerNamespaceEnvVar), + } + + restCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) + if err != nil { + return err + } + spokeClient, err := client.New(restCfg, client.Options{}) + if err != nil { + return err + } + for _, nsName := range namespacesToDelete { + if nsName == "" { + continue + } + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: nsName}} + if err := spokeClient.Delete(ctx, ns); err != nil && !kerrs.IsNotFound(err) { + return err + } + } + + return nil +} + +type tokenMeta struct { + Token string `json:"hub-token"` + HubAPIServer string `json:"hub-apiserver"` +} + +type hubMeta struct { + hub *v1beta1.Hub + kubeconfig []byte +} + +// joinSpoke joins a Spoke cluster to the Hub cluster +func (r *SpokeReconciler) joinSpoke(ctx context.Context, spoke *v1beta1.Spoke, hubMeta hubMeta, klusterletValues *v1beta1.KlusterletChartConfig, spokeKubeconfig []byte) error { + logger := log.FromContext(ctx) + logger.V(0).Info("joinSpoke", "spoke", spoke.Name) + + hub := hubMeta.hub + + if hub == nil { + return errors.New("hub not found") + } + // dont start join until the hub is ready + hubInitCond := hubMeta.hub.GetCondition(v1beta1.HubInitialized) + if hubInitCond == nil || hubInitCond.Status != metav1.ConditionTrue { + return errors.New("hub does not have initialized condition") + } + + tokenMeta, err := getToken(ctx, hubMeta) + if err != nil { + return fmt.Errorf("failed to get join token: %w", err) + } + + joinArgs := append([]string{ + "join", + "--cluster-name", spoke.Name, + fmt.Sprintf("--create-namespace=%t", spoke.Spec.CreateNamespace), + fmt.Sprintf("--enable-sync-labels=%t", spoke.Spec.SyncLabels), + "--hub-token", tokenMeta.Token, + "--wait=true", + // klusterlet args + "--mode", spoke.Spec.Klusterlet.Mode, + "--feature-gates", spoke.Spec.Klusterlet.FeatureGates, + fmt.Sprintf("--force-internal-endpoint-lookup=%t", spoke.Spec.Klusterlet.ForceInternalEndpointLookup), + fmt.Sprintf("--singleton=%t", spoke.Spec.Klusterlet.Singleton), + // source args + "--bundle-version", hub.Spec.ClusterManager.Source.BundleVersion, + "--image-registry", hub.Spec.ClusterManager.Source.Registry, + }, spoke.BaseArgs()...) + + for k, v := range spoke.Spec.Klusterlet.Annotations { + joinArgs = append(joinArgs, fmt.Sprintf("--klusterlet-annotation=%s=%s", k, v)) + } + + // resources args + joinArgs = append(joinArgs, args.PrepareResources(spoke.Spec.Klusterlet.Resources)...) + + // Use hub API server from spec if provided and not forced to use internal endpoint, + // otherwise fall back to the hub API server from the tokenMeta + if hub.Spec.APIServer != "" && !spoke.Spec.Klusterlet.ForceInternalEndpointLookup { + joinArgs = append(joinArgs, "--hub-apiserver", hub.Spec.APIServer) + } else if tokenMeta.HubAPIServer != "" { + joinArgs = append(joinArgs, "--hub-apiserver", tokenMeta.HubAPIServer) + } + + if hub.Spec.Ca != "" { + caFile, caCleanup, err := file.TmpFile([]byte(hub.Spec.Ca), "ca") + if caCleanup != nil { + defer caCleanup() + } + if err != nil { + return fmt.Errorf("failed to write hub CA to disk: %w", err) + } + joinArgs = append([]string{fmt.Sprintf("--ca-file=%s", caFile)}, joinArgs...) 
+ } + + ra := hub.Spec.RegistrationAuth + if ra.Driver == v1alpha1.AWSIRSARegistrationDriver { + raArgs := []string{ + fmt.Sprintf("--registration-auth=%s", ra.Driver), + } + if ra.HubClusterARN != "" { + raArgs = append(raArgs, fmt.Sprintf("--hub-cluster-arn=%s", ra.HubClusterARN)) + } + if spoke.Spec.ClusterARN != "" { + raArgs = append(raArgs, fmt.Sprintf("--managed-cluster-arn=%s", spoke.Spec.ClusterARN)) + } + + joinArgs = append(joinArgs, raArgs...) + } + + if spoke.Spec.Klusterlet.Mode == string(operatorv1.InstallModeHosted) { + joinArgs = append(joinArgs, + fmt.Sprintf("--force-internal-endpoint-lookup-managed=%t", spoke.Spec.Klusterlet.ForceInternalEndpointLookupManaged), + ) + raw, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, spoke.Spec.Klusterlet.ManagedClusterKubeconfig, spoke.Namespace) + if err != nil { + return err + } + mgdKcfg, mgdKcfgCleanup, err := file.TmpFile(raw, "kubeconfig") + if mgdKcfgCleanup != nil { + defer mgdKcfgCleanup() + } + if err != nil { + return fmt.Errorf("failed to write managedClusterKubeconfig to disk: %w", err) + } + joinArgs = append(joinArgs, "--managed-cluster-kubeconfig", mgdKcfg) + } + + if spoke.Spec.ProxyCa != "" { + proxyCaFile, proxyCaCleanup, err := file.TmpFile([]byte(spoke.Spec.ProxyCa), "proxy-ca") + if proxyCaCleanup != nil { + defer proxyCaCleanup() + } + if err != nil { + return fmt.Errorf("failed to write proxy CA to disk: %w", err) + } + joinArgs = append(joinArgs, fmt.Sprintf("--proxy-ca-file=%s", proxyCaFile)) + } + if spoke.Spec.ProxyURL != "" { + joinArgs = append(joinArgs, fmt.Sprintf("--proxy-url=%s", spoke.Spec.ProxyURL)) + } + + valuesArgs, valuesCleanup, err := prepareKlusterletValuesFile(klusterletValues) + if valuesCleanup != nil { + defer valuesCleanup() + } + if err != nil { + return err + } + joinArgs = append(joinArgs, valuesArgs...) + + joinArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, joinArgs) + if cleanupKcfg != nil { + defer cleanupKcfg() + } + if err != nil { + return err + } + + logger.V(1).Info("clusteradm join", "args", joinArgs) + + cmd := exec.Command(clusteradm, joinArgs...) + stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm join' to complete for spoke %s...", spoke.Name)) + if err != nil { + out := append(stdout, stderr...) + return fmt.Errorf("clusteradm join command failed for spoke %s: %v, output: %s", spoke.Name, err, string(out)) + } + logger.V(1).Info("successfully requested spoke cluster join", "output", string(stdout)) + + return nil +} + +// acceptCluster accepts a Spoke cluster's join request +func acceptCluster(ctx context.Context, spoke *v1beta1.Spoke, skipApproveCheck bool) error { + logger := log.FromContext(ctx) + logger.V(0).Info("acceptCluster", "spoke", spoke.Name) + + acceptArgs := append([]string{ + "accept", "--cluster", spoke.Name, + }, spoke.BaseArgs()...) + + logger.V(1).Info("clusteradm accept", "args", acceptArgs) + + // TODO: handle other args: + // --requesters=[]: + // Common Names of agents to be approved. + + if skipApproveCheck { + acceptArgs = append(acceptArgs, "--skip-approve-check") + } + + cmd := exec.Command(clusteradm, acceptArgs...) + stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm accept' to complete for spoke %s...", spoke.Name)) + if err != nil { + out := append(stdout, stderr...) 
+ return fmt.Errorf("failed to accept spoke cluster join request: %v, output: %s", err, string(out)) + } + logger.V(1).Info("spoke cluster join request accepted", "output", string(stdout)) + + return nil +} + +// getJoinedCondition gets the joined condition from a managed cluster +func (r *SpokeReconciler) getJoinedCondition(managedCluster *clusterv1.ManagedCluster) *metav1.Condition { + if managedCluster == nil || managedCluster.Status.Conditions == nil { + return nil + } + + for _, c := range managedCluster.Status.Conditions { + if c.Type == "ManagedClusterJoined" { + return &c + } + } + + return nil +} + +// spokeNeedsUpgrade checks if the klusterlet on a Spoke cluster requires an upgrade +func (r *SpokeReconciler) spokeNeedsUpgrade(ctx context.Context, spoke *v1beta1.Spoke, currKlusterletHash string, source v1beta1.OCMSource, spokeKubeconfig []byte) (bool, error) { + logger := log.FromContext(ctx) + logger.V(0).Info("spokeNeedsUpgrade", "spokeClusterName", spoke.Name) + + prevHash := spoke.Status.KlusterletHash + hashChanged := prevHash != currKlusterletHash && prevHash != "" + logger.V(2).Info("comparing klusterlet values hash", + "spoke", spoke.Name, + "prevHash", spoke.Status.KlusterletHash, + "currHash", currKlusterletHash, + ) + if hashChanged { + logger.V(0).Info("hash changed", "old", spoke.Status.KlusterletHash, "new", currKlusterletHash) + return true, nil + } + + if source.BundleVersion == "default" { + logger.V(0).Info("klusterlet bundleVersion is default, skipping upgrade") + return false, nil + } + if source.BundleVersion == "latest" { + logger.V(0).Info("klusterlet bundleVersion is latest, attempting upgrade") + return true, nil + } + + operatorC, err := common.OperatorClient(spokeKubeconfig) + if err != nil { + return false, err + } + + k, err := operatorC.OperatorV1().Klusterlets().Get(ctx, "klusterlet", metav1.GetOptions{}) + if err != nil { + return false, fmt.Errorf("failed to get klusterlet: %w", err) + } + + // identify lowest bundleVersion referenced in the klusterlet spec + bundleSpecs := make([]string, 0) + if k.Spec.ImagePullSpec != "" { + bundleSpecs = append(bundleSpecs, k.Spec.ImagePullSpec) + } + if k.Spec.RegistrationImagePullSpec != "" { + bundleSpecs = append(bundleSpecs, k.Spec.RegistrationImagePullSpec) + } + if k.Spec.WorkImagePullSpec != "" { + bundleSpecs = append(bundleSpecs, k.Spec.WorkImagePullSpec) + } + activeBundleVersion, err := version.LowestBundleVersion(ctx, bundleSpecs) + if err != nil { + return false, fmt.Errorf("failed to detect bundleVersion from klusterlet spec: %w", err) + } + desiredBundleVersion, err := version.Normalize(source.BundleVersion) + if err != nil { + return false, err + } + + logger.V(0).Info("found klusterlet bundleVersions", + "activeBundleVersion", activeBundleVersion, + "desiredBundleVersion", desiredBundleVersion, + ) + return activeBundleVersion != desiredBundleVersion, nil +} + +// upgradeSpoke upgrades the Spoke cluster's klusterlet +func (r *SpokeReconciler) upgradeSpoke(ctx context.Context, spoke *v1beta1.Spoke, klusterletValues *v1beta1.KlusterletChartConfig, source v1beta1.OCMSource, spokeKubeconfig []byte) error { + logger := log.FromContext(ctx) + logger.V(0).Info("upgradeSpoke", "spoke", spoke.Name) + + upgradeArgs := append([]string{ + "upgrade", "klusterlet", + "--bundle-version", source.BundleVersion, + "--image-registry", source.Registry, + "--wait=true", + }, spoke.BaseArgs()...) 
+ + valuesArgs, valuesCleanup, err := prepareKlusterletValuesFile(klusterletValues) + if valuesCleanup != nil { + defer valuesCleanup() + } + if err != nil { + return err + } + upgradeArgs = append(upgradeArgs, valuesArgs...) + + upgradeArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, upgradeArgs) + if cleanupKcfg != nil { + defer cleanupKcfg() + } + if err != nil { + return err + } + + logger.V(1).Info("clusteradm upgrade klusterlet", "args", upgradeArgs) + + cmd := exec.Command(clusteradm, upgradeArgs...) + stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm upgrade klusterlet' to complete for spoke %s...", spoke.Name)) + if err != nil { + out := append(stdout, stderr...) + return fmt.Errorf( + "failed to upgrade klusterlet on spoke cluster %s to %s: %v, output: %s", + spoke.Name, source.BundleVersion, err, string(out), + ) + } + logger.V(1).Info("klusterlet upgraded", "output", string(stdout)) + + return nil +} + +// unjoinSpoke unjoins a spoke from the hub +func (r *SpokeReconciler) unjoinSpoke(ctx context.Context, spoke *v1beta1.Spoke, spokeKubeconfig []byte) error { + logger := log.FromContext(ctx) + logger.V(0).Info("unjoinSpoke", "spoke", spoke.Name) + + unjoinArgs := append([]string{ + "unjoin", + "--cluster-name", spoke.GetName(), + fmt.Sprintf("--purge-operator=%t", spoke.Spec.Klusterlet.PurgeOperator), + }, spoke.BaseArgs()...) + + unjoinArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, unjoinArgs) + if cleanupKcfg != nil { + defer cleanupKcfg() + } + if err != nil { + return fmt.Errorf("failed to unjoin spoke cluster %s: %w", spoke.GetName(), err) + } + + logger.V(1).Info("clusteradm unjoin", "args", unjoinArgs) + + cmd := exec.Command(clusteradm, unjoinArgs...) + stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm unjoin' to complete for spoke %s...", spoke.GetName())) + out := append(stdout, stderr...) + if err != nil { //|| strings.Contains(string(out), amwExistsError) { + return fmt.Errorf("failed to unjoin spoke cluster %s: %v, output: %s", spoke.GetName(), err, string(out)) + } + logger.V(1).Info("spoke cluster unjoined", "output", string(stdout)) + + return nil +} + +// getToken gets a join token from the Hub cluster via 'clusteradm get token' +func getToken(ctx context.Context, hubMeta hubMeta) (*tokenMeta, error) { + logger := log.FromContext(ctx) + logger.V(0).Info("getToken") + + tokenArgs := append([]string{ + "get", "token", "--output=json", + }, hubMeta.hub.BaseArgs()...) + + if hubMeta.hub.Spec.ClusterManager != nil { + tokenArgs = append(tokenArgs, fmt.Sprintf("--use-bootstrap-token=%t", hubMeta.hub.Spec.ClusterManager.UseBootstrapToken)) + } + tokenArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, hubMeta.kubeconfig, hubMeta.hub.Spec.Kubeconfig.Context, tokenArgs) + if cleanupKcfg != nil { + defer cleanupKcfg() + } + if err != nil { + return nil, fmt.Errorf("failed to prepare kubeconfig: %w", err) + } + + logger.V(1).Info("clusteradm get token", "args", tokenArgs) + + cmd := exec.Command(clusteradm, tokenArgs...) + stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm get token' to complete...") + if err != nil { + out := append(stdout, stderr...) 
+ return nil, fmt.Errorf("failed to get join token: %v, output: %s", err, string(out)) + } + logger.V(1).Info("got join token", "output", string(stdout)) + + tokenMeta := &tokenMeta{} + if err := json.Unmarshal(stdout, &tokenMeta); err != nil { + return nil, fmt.Errorf("failed to unmarshal join token: %w", err) + } + return tokenMeta, nil +} + +func (r *SpokeReconciler) getHubMeta(ctx context.Context, hubRef v1beta1.HubRef) (hubMeta, error) { + hub := &v1beta1.Hub{} + hubMeta := hubMeta{} + nn := types.NamespacedName{Name: hubRef.Name, Namespace: hubRef.Namespace} + + // get Hub using local client + err := r.Get(ctx, nn, hub) + if err != nil { + return hubMeta, client.IgnoreNotFound(err) + } + hubMeta.hub = hub + // if found, load the hub's kubeconfig + hubKubeconfig, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, hub.Spec.Kubeconfig, hub.Namespace) + if err != nil { + return hubMeta, err + } + hubMeta.kubeconfig = hubKubeconfig + return hubMeta, nil +} + +func (r *SpokeReconciler) mergeKlusterletValues(ctx context.Context, spoke *v1beta1.Spoke) (*v1beta1.KlusterletChartConfig, error) { + logger := log.FromContext(ctx) + + if spoke.Spec.Klusterlet.ValuesFrom == nil && spoke.Spec.Klusterlet.Values == nil { + logger.V(3).Info("no values or valuesFrom provided. Using default klusterlet chart values", "spoke", spoke.Name) + return nil, nil + } + + var fromInterface = map[string]any{} + var specInterface = map[string]any{} + + if spoke.Spec.Klusterlet.ValuesFrom != nil { + cm := &corev1.ConfigMap{} + nn := types.NamespacedName{Name: spoke.Spec.Klusterlet.ValuesFrom.Name, Namespace: spoke.Namespace} + err := r.Get(ctx, nn, cm) + if err != nil { + if kerrs.IsNotFound(err) { + // cm not found, return spec's values + logger.V(1).Info("warning: Klusterlet values ConfigMap not found", "spoke", spoke.Name, "configMap", nn) + return spoke.Spec.Klusterlet.Values, nil + } + return nil, fmt.Errorf("failed to retrieve Klusterlet values ConfigMap %s: %w", nn, err) + } + fromValues, ok := cm.Data[spoke.Spec.Klusterlet.ValuesFrom.Key] + if !ok { + logger.V(1).Info("warning: Klusterlet values ConfigMap not found", "spoke", spoke.Name, "configMap", nn, "key", spoke.Spec.Klusterlet.ValuesFrom.Key) + return spoke.Spec.Klusterlet.Values, nil + } + fromBytes := []byte(fromValues) + err = yaml.Unmarshal(fromBytes, &fromInterface) + if err != nil { + return nil, fmt.Errorf("failed to unmarshal YAML values from ConfigMap %s key %s: %w", nn, spoke.Spec.Klusterlet.ValuesFrom.Key, err) + } + } + + if spoke.Spec.Klusterlet.Values != nil { + specBytes, err := yaml.Marshal(spoke.Spec.Klusterlet.Values) + if err != nil { + return nil, fmt.Errorf("failed to marshal Klusterlet values from spoke spec for spoke %s: %w", spoke.Name, err) + } + err = yaml.Unmarshal(specBytes, &specInterface) + if err != nil { + return nil, fmt.Errorf("failed to unmarshal Klusterlet values from spoke spec for spoke %s: %w", spoke.Name, err) + } + } + + mergedMap := map[string]any{} + maps.Copy(mergedMap, fromInterface) + + // Merge spec on top but ignore zero-values from spec + if err := mergo.Map(&mergedMap, specInterface, mergo.WithOverride); err != nil { + return nil, fmt.Errorf("merge failed for spoke %s: %w", spoke.Name, err) + } + + mergedBytes, err := yaml.Marshal(mergedMap) + if err != nil { + return nil, fmt.Errorf("failed to marshal merged Klusterlet values for spoke %s: %w", spoke.Name, err) + } + + merged := &v1beta1.KlusterletChartConfig{} + err = yaml.Unmarshal(mergedBytes, merged) + if err != nil { + return nil, 
fmt.Errorf("failed to unmarshal merged values into KlusterletChartConfig for spoke %s: %w", spoke.Name, err) + } + + return merged, nil + +} + +// prepareKlusterletValuesFile creates a temporary file with klusterlet values and returns +// args to append and a cleanup function. Returns empty slice if values are empty. +func prepareKlusterletValuesFile(values *v1beta1.KlusterletChartConfig) ([]string, func(), error) { + if values == nil { + return nil, nil, nil + } + + if values.IsEmpty() { + return nil, nil, nil + } + valuesYAML, err := yaml.Marshal(values) + if err != nil { + return nil, nil, fmt.Errorf("failed to marshal klusterlet values to YAML: %w", err) + } + valuesFile, valuesCleanup, err := file.TmpFile(valuesYAML, "klusterlet-values") + if err != nil { + return nil, nil, fmt.Errorf("failed to write klusterlet values to disk: %w", err) + } + return []string{"--klusterlet-values-file", valuesFile}, valuesCleanup, nil +} diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index e2f7510c..876896b4 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -21,7 +21,10 @@ import ( "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" ) -const warnHubNotFound = "hub not found, cannot validate spoke addons" +const ( + warnHubNotFound = "hub not found, cannot validate spoke addons" + fccAddOnName = "fleetconfig-controller-manager" +) func isKubeconfigValid(kubeconfig v1beta1.Kubeconfig) (bool, string) { if kubeconfig.SecretReference == nil && !kubeconfig.InCluster { @@ -99,6 +102,7 @@ func allowHubUpdate(oldHub, newHub *v1beta1.Hub) error { // - spec.addOns // - spec.timeout // - spec.logVerbosity +// - spec.hubRed func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { if !reflect.DeepEqual(newSpoke.Spec, oldSpoke.Spec) { oldSpokeCopy := oldSpoke.Spec.DeepCopy() @@ -115,9 +119,11 @@ func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { newSpokeCopy.LogVerbosity = 0 oldSpokeCopy.Timeout = 0 newSpokeCopy.Timeout = 0 + oldSpokeCopy.HubRef = v1beta1.HubRef{} + newSpokeCopy.HubRef = v1beta1.HubRef{} if !reflect.DeepEqual(oldSpokeCopy, newSpokeCopy) { - return errors.New("spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke") + return errors.New("spoke contains changes which are not allowed; only changes to spec.hubRef, spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke") } } @@ -343,6 +349,20 @@ func validateAddonNotInUse(ctx context.Context, removedAddons []string, fieldPat func validateAddons(ctx context.Context, cli client.Client, newObject *v1beta1.Spoke, addonC *versioned.Clientset) (admission.Warnings, field.ErrorList) { errs := field.ErrorList{} + if newObject.IsHubAsSpoke() { + if slices.ContainsFunc(newObject.Spec.AddOns, func(a v1beta1.AddOn) bool { + return a.ConfigName == fccAddOnName + }) { + errs = append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "hub-as-spoke Spoke cannot enable fleetconfig-controller-manager addon")) + } + } else { + if !slices.ContainsFunc(newObject.Spec.AddOns, func(a v1beta1.AddOn) bool { + return a.ConfigName == fccAddOnName + }) { + errs = 
append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "Spoke must enable fleetconfig-controller-manager addon")) + } + } + // try to get hub, if not present or not ready, log a warning that addons cant be properly validated hub := &v1beta1.Hub{} err := cli.Get(ctx, types.NamespacedName{Name: newObject.Spec.HubRef.Name, Namespace: newObject.Spec.HubRef.Namespace}, hub) diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go index 92b3434e..5391c536 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go @@ -409,7 +409,7 @@ func TestAllowSpokeUpdate(t *testing.T) { wantErr: false, }, { - name: "disallowed - HubRef change", + name: "allowed - HubRef change", oldSpoke: &v1beta1.Spoke{ Spec: v1beta1.SpokeSpec{ HubRef: v1beta1.HubRef{ @@ -426,8 +426,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, }, - wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + wantErr: false, }, { name: "disallowed - CreateNamespace change", @@ -442,7 +441,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: "spoke contains changes which are not allowed; only changes to spec.hubRef, spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", }, { name: "disallowed - klusterlet mode change", @@ -461,7 +460,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: "spoke contains changes which are not allowed; only changes to spec.hubRef, spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", }, { name: "disallowed - klusterlet feature gates change", @@ -480,7 +479,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: "spoke contains changes which are not allowed; only changes to spec.hubRef, spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", }, { name: "multiple allowed changes", From ef5943b0ea57522be5079cd85d74ef86dba1e708 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 25 Sep 2025 11:01:03 -0700 Subject: [PATCH 02/62] chore: tidt todos, improve cleanup Signed-off-by: Artur Shad Nik --- .../api/v1beta1/constants.go | 10 +++- .../templates/deployment.yaml | 2 + .../ocm/fcc-addon/addon-template.yaml | 33 ++---------- .../fcc-addon/cluster-management-addon.yaml | 2 
+- .../ocm/fcc-addon/cluster-role-binding.yaml | 2 +- fleetconfig-controller/cmd/manager/manager.go | 2 +- .../{hack/dev => }/devspace-start-hub.sh | 0 .../{hack/dev => }/devspace-start-spoke.sh | 0 fleetconfig-controller/devspace.yaml | 11 +--- .../internal/controller/v1beta1/addon.go | 39 +++++--------- .../internal/controller/v1beta1/common.go | 12 ----- .../internal/controller/v1beta1/constants.go | 44 ++++++++++++++++ .../controller/v1beta1/spoke_controller.go | 32 ++++++++---- .../controller/v1beta1/spoke_handler.go | 52 ++++++++++--------- .../test/data/fleetconfig-values.yaml | 1 + .../test/e2e/v1beta1_hub_spoke.go | 4 ++ 16 files changed, 129 insertions(+), 117 deletions(-) rename fleetconfig-controller/{hack/dev => }/devspace-start-hub.sh (100%) rename fleetconfig-controller/{hack/dev => }/devspace-start-spoke.sh (100%) create mode 100644 fleetconfig-controller/internal/controller/v1beta1/constants.go diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index 7fa82b0f..adde7acb 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -78,8 +78,8 @@ const ( // HubKubeconfigEnvVar is the environment variable containing the path to the mounted Hub kubeconfig. HubKubeconfigEnvVar = "HUB_KUBECONFIG" - // HubKubeconfigFallbackPath is the path of the mounted kubeconfig when the controller is running in a Spoke cluster. Used if the environment variable is not set. - HubKubeconfigFallbackPath = "/managed/hub-kubeconfig/kubeconfig" + // DefaultHubKubeconfigPath is the path of the mounted kubeconfig when the controller is running in a Spoke cluster. Used if the environment variable is not set. + DefaultHubKubeconfigPath = "/managed/hub-kubeconfig/kubeconfig" // SpokeNameEnvVar is the environment variable containing the name of the Spoke resource. SpokeNameEnvVar = "CLUSTER_NAME" @@ -93,8 +93,14 @@ const ( // ControllerNamespaceEnvVar is the environment variable containing the namespace that the controller is deployed to. ControllerNamespaceEnvVar = "CONTROLLER_NAMESPACE" + // RoleNameEnvVar containing the name of the ClusterRole for fleetconfig-controller-manager. + RoleNameEnvVar = "ROLE_NAME" + // FCCAddOnName is the name of the fleetconfig-controller-addon FCCAddOnName = "fleetconfig-controller-manager" + + // DefaultFCCManagerRole is the default name of the fleetconfig-controller-manager ClusterRole + DefaultFCCManagerRole = "fleetconfig-controller-manager-role" ) // SupportedClusterTypes are the valid cluster types that the controller can be installed in. diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml index 4c3b4b6a..ae0ad0d0 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml @@ -48,6 +48,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: ROLE_NAME + value: {{ include "chart.fullname" . }}-manager-role image: {{ include "controller.image" . 
}} imagePullPolicy: {{ quote .Values.image.pullPolicy }} name: manager diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml index 9706a87f..1eaefff8 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -62,8 +62,7 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace - image: quay.io/open-cluster-management/fleetconfig-controller:local # TODO @arturshadnik - use the actual image - # image: {{ include "controller.image" . }} + image: {{ include "controller.image" . }} imagePullPolicy: {{ quote .Values.image.pullPolicy }} name: manager resources: @@ -156,37 +155,11 @@ spec: - kind: ServiceAccount name: {{ include "chart.fullname" . }}-manager namespace: {{ .Release.Namespace }} - registration: # optional - # kubeClient or custom signer, if kubeClient, user and group is in a certain format. - # user is "system:open-cluster-management:cluster:{clusterName}:addon:{addonName}:agent:{agentName}" - # group is ["system:open-cluster-management:cluster:{clusterName}:addon:{addonName}", - # "system:open-cluster-management:addon:{addonName}", "system:authenticated"] + registration: - type: KubeClient kubeClient: hubPermissions: - # this isnt suffcient, but any other permissions need to set by fcc, because we cant template the namespace - type: CurrentCluster currentCluster: clusterRoleName: {{ include "chart.fullname" . }}-manager-role - # - type: SingleNamespace - # singleNamespace: - # # TODO @arturshadnik - figure out how to set this per-cluster's hubRef - probably needs to be done by fcc - # namespace: {{ .Release.Namespace }} - # # namespace: {{ `{{HUB_CLUSTER_NAMESPACE}}` }} - # roleRef: - # apiGroup: rbac.authorization.k8s.io - # kind: ClusterRole - # name: {{ include "chart.fullname" . }}-manager-role - # - type: SingleNamespace - # singleNamespace: - # # namespace: {{ .Release.Namespace }} # TODO @arturshadnik - figure out how to set this per-cluster - # # namespace: {{ `{{CLUSTER_NAMESPACE}}` }} - # namespace: default - # roleRef: - # apiGroup: rbac.authorization.k8s.io - # kind: ClusterRole - # # should be created by user; the addon manager will grant the permission to the agent, so if the - # # role/clusterRole contains some permissions that the addon manager doesn't have, user needs to grant - # # the permission to the addon-manager (service account open-cluster-management-hub/addon-manager-controller-sa), - # # otherwise the addon manager will fail to grant the permission to the agent - # name: {{ include "chart.fullname" . }}-manager-role + diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml index 47ea0897..1c1403cd 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml @@ -17,7 +17,7 @@ spec: name: fleetconfig-controller-manager installStrategy: type: Manual - # TODO - use `Placements`` once ManagedClusters can be labeled immediately during the registration process. 
See https://github.com/open-cluster-management-io/ocm/issues/1195, https://github.com/open-cluster-management-io/ocm/pull/1123 + # TODO - use `Placements` once ManagedClusters can be labeled immediately during the registration process. See https://github.com/open-cluster-management-io/ocm/issues/1195, https://github.com/open-cluster-management-io/ocm/pull/1123 # placements: # - namespace: managed-cluster-set-spokes # name: spokes diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml index 013d0f35..7cc049d8 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml @@ -10,4 +10,4 @@ roleRef: subjects: - kind: ServiceAccount name: addon-manager-controller-sa - namespace: open-cluster-management-hub \ No newline at end of file + namespace: open-cluster-management-hub diff --git a/fleetconfig-controller/cmd/manager/manager.go b/fleetconfig-controller/cmd/manager/manager.go index 611dbd2b..c5b41c2e 100644 --- a/fleetconfig-controller/cmd/manager/manager.go +++ b/fleetconfig-controller/cmd/manager/manager.go @@ -266,7 +266,7 @@ func ForSpoke(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { func getHubRestConfig() (*rest.Config, error) { hubKubeconfigPath := os.Getenv(apiv1beta1.HubKubeconfigEnvVar) if hubKubeconfigPath == "" { - hubKubeconfigPath = apiv1beta1.HubKubeconfigFallbackPath + hubKubeconfigPath = apiv1beta1.DefaultHubKubeconfigPath } basePath := strings.TrimSuffix(hubKubeconfigPath, "kubeconfig") diff --git a/fleetconfig-controller/hack/dev/devspace-start-hub.sh b/fleetconfig-controller/devspace-start-hub.sh similarity index 100% rename from fleetconfig-controller/hack/dev/devspace-start-hub.sh rename to fleetconfig-controller/devspace-start-hub.sh diff --git a/fleetconfig-controller/hack/dev/devspace-start-spoke.sh b/fleetconfig-controller/devspace-start-spoke.sh similarity index 100% rename from fleetconfig-controller/hack/dev/devspace-start-spoke.sh rename to fleetconfig-controller/devspace-start-spoke.sh diff --git a/fleetconfig-controller/devspace.yaml b/fleetconfig-controller/devspace.yaml index 69e83d96..307a143f 100644 --- a/fleetconfig-controller/devspace.yaml +++ b/fleetconfig-controller/devspace.yaml @@ -166,7 +166,7 @@ dev: enabled: true disableReplace: true workDir: /workspace - command: ./hack/dev/devspace-start-hub.sh + command: ./devspace-start-hub.sh ports: - port: ${HUB_PORT} sync: @@ -180,14 +180,10 @@ dev: - '!/dependencymagnet' - '!/internal' - '!/pkg' - - '!/hack/dev/devspace-start-hub.sh' - '!/hack' - '!/go.mod' - '!/go.sum' - 'Makefile' - - path: ./hack/dev:/workspace - excludePaths: - - '**' - '!devspace-start-hub.sh' fleetconfig-controller-dev-spoke: @@ -214,7 +210,4 @@ dev: - '!/go.mod' - '!/go.sum' - 'Makefile' - - path: ./hack/dev:/workspace - excludePaths: - - '**' - - '!devspace-start-spoke.sh' \ No newline at end of file + - '!devspace-start-spoke.sh' diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 3915339a..0888aa9c 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -29,28 +29,6 @@ import ( 
"github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" ) -const ( - // commands - addon = "addon" - create = "create" - enable = "enable" - disable = "disable" - - install = "install" - uninstall = "uninstall" - hubAddon = "hub-addon" - - addonArgoCD = "argocd" - addonGPF = "governance-policy-framework" - - managedClusterAddOn = "ManagedClusterAddOn" -) - -var supportedHubAddons = []string{ - addonArgoCD, - addonGPF, -} - // getManagedClusterAddOns returns the list of ManagedClusterAddOns currently installed on a spoke cluster func getManagedClusterAddOns(ctx context.Context, addonC *addonapi.Clientset, spokeName string) ([]string, error) { managedClusterAddOns, err := addonC.AddonV1alpha1().ManagedClusterAddOns(spokeName).List(ctx, metav1.ListOptions{ @@ -407,7 +385,8 @@ func handleAddonEnable(ctx context.Context, spoke *v1beta1.Spoke, addons []v1bet continue } // TODO - do this natively with clusteradm once https://github.com/open-cluster-management-io/clusteradm/issues/501 is resolved. - // OR switch to using Placements strategy once https://github.com/open-cluster-management-io/ocm/pull/1123 is merged. + // When switching to using Placements strategy once https://github.com/open-cluster-management-io/ocm/pull/1123 is merged, + // this will still need to be done, just in a different part of the code. if a.ConfigName == v1beta1.FCCAddOnName { err = patchFCCMca(ctx, spoke.Name, addonC) if err != nil { @@ -434,7 +413,7 @@ func patchFCCMca(ctx context.Context, spokeName string, addonC *addonapi.Clients mca.Spec.Configs = append(mca.Spec.Configs, addonv1alpha1.AddOnConfig{ ConfigGroupResource: addonv1alpha1.ConfigGroupResource{ Group: addonv1alpha1.GroupName, - Resource: "addondeploymentconfigs", // TODO - no magic string + Resource: AddOnDeploymentConfigsKind, }, ConfigReferent: addonv1alpha1.ConfigReferent{ Name: v1beta1.FCCAddOnName, @@ -692,7 +671,7 @@ func isAddonInstalled(ctx context.Context, addonC *addonapi.Clientset, addonName // waitForAddonManifestWorksCleanup polls for addon-related manifestWorks to be removed // after addon disable operation to avoid race conditions during spoke unjoin -func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Clientset, spokeName string, timeout time.Duration) error { +func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Clientset, spokeName string, timeout time.Duration, isHubAsSpoke bool) error { logger := log.FromContext(ctx) logger.V(1).Info("waiting for addon manifestWorks cleanup", "spokeName", spokeName, "timeout", timeout) @@ -704,8 +683,14 @@ func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Client return false, nil } - // Success condition: only FCC manifestWork remaining - if len(manifestWorks.Items) == 1 { // TODO @arturshadnik ................. + // for hub-as-spoke, all addons must be removed. + // otherwise, fleetconfig-controller-manager must not be removed. 
+ var expectedWorks = 1 + if isHubAsSpoke { + expectedWorks = 0 + } + + if len(manifestWorks.Items) == expectedWorks { logger.V(1).Info("addon manifestWorks cleanup completed", "spokeName", spokeName, "remainingManifestWorks", len(manifestWorks.Items)) return true, nil } diff --git a/fleetconfig-controller/internal/controller/v1beta1/common.go b/fleetconfig-controller/internal/controller/v1beta1/common.go index 611396a4..f39d00cd 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/common.go +++ b/fleetconfig-controller/internal/controller/v1beta1/common.go @@ -3,8 +3,6 @@ package v1beta1 import ( "context" "fmt" - "regexp" - "time" kerrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -15,16 +13,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" ) -const ( - clusteradm = "clusteradm" - requeue = 30 * time.Second - amwExistsError = "you should manually clean them, uninstall kluster will cause those works out of control." - addonCleanupTimeout = 1 * time.Minute - addonCleanupPollInterval = 2 * time.Second -) - -var csrSuffixPattern = regexp.MustCompile(`-[a-zA-Z0-9]{5}$`) - func ret(ctx context.Context, res ctrl.Result, err error) (ctrl.Result, error) { logger := log.FromContext(ctx) if err != nil { diff --git a/fleetconfig-controller/internal/controller/v1beta1/constants.go b/fleetconfig-controller/internal/controller/v1beta1/constants.go new file mode 100644 index 00000000..2252233a --- /dev/null +++ b/fleetconfig-controller/internal/controller/v1beta1/constants.go @@ -0,0 +1,44 @@ +package v1beta1 + +import ( + "regexp" + "time" +) + +// generic +const ( + clusteradm = "clusteradm" + requeue = 30 * time.Second + amwExistsError = "you should manually clean them, uninstall kluster will cause those works out of control." 
+) + +var csrSuffixPattern = regexp.MustCompile(`-[a-zA-Z0-9]{5}$`) + +// addon +const ( + // commands + addon = "addon" + create = "create" + enable = "enable" + disable = "disable" + + install = "install" + uninstall = "uninstall" + hubAddon = "hub-addon" + + addonArgoCD = "argocd" + addonGPF = "governance-policy-framework" + + managedClusterAddOn = "ManagedClusterAddOn" + AddOnDeploymentConfigsKind = "addondeploymentconfigs" + + addonCleanupTimeout = 1 * time.Minute + addonCleanupPollInterval = 2 * time.Second + + fccAddOnManifestWorkLabel = "open-cluster-management.io/addon-name=fleetconfig-controller-manager" +) + +var supportedHubAddons = []string{ + addonArgoCD, + addonGPF, +} diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index 3c7f7a43..91077180 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -91,16 +91,28 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl spoke.Status.Phase = v1beta1.Unhealthy } - // Add finalizers if not already present, set defaults, and requeue - if !slices.Contains(spoke.Finalizers, v1beta1.HubCleanupFinalizer) { - setDefaults(ctx, spoke, hubMeta) - spoke.Finalizers = append( - spoke.Finalizers, - v1beta1.HubCleanupPreflightFinalizer, // removed by the hub to signal to the spoke that preflight is completed - v1beta1.SpokeCleanupFinalizer, // removed by the spoke to signal to the hub that unjoin succeeded - v1beta1.HubCleanupFinalizer, // removed by the hub after post-unjoin cleanup is finished - ) - return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil) + switch r.ClusterType { + case v1beta1.ClusterTypeHub: + if !slices.Contains(spoke.Finalizers, v1beta1.HubCleanupFinalizer) { + setDefaults(ctx, spoke, hubMeta) + spoke.Finalizers = append( + spoke.Finalizers, + v1beta1.HubCleanupPreflightFinalizer, // removed by the hub to signal to the spoke that preflight is completed + v1beta1.HubCleanupFinalizer, // removed by the hub after post-unjoin cleanup is finished + ) + if spoke.IsHubAsSpoke() { + spoke.Finalizers = append(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) + } + return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil) + } + case v1beta1.ClusterTypeSpoke: + if !slices.Contains(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) && spoke.DeletionTimestamp.IsZero() { + spoke.Finalizers = append(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) // removed by the spoke to signal to the hub that unjoin succeeded + return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil) + } + default: + // this is guarded against when the manager is initialized. should never reach this point + panic(fmt.Sprintf("unknown cluster type %s. 
Must be one of %v", r.ClusterType, v1beta1.SupportedClusterTypes)) } // Handle deletion logic with finalizer diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 2242e97c..05724b02 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -223,10 +223,15 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h } func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spoke) error { + roleName := os.Getenv(v1beta1.RoleNameEnvVar) + if roleName == "" { + roleName = v1beta1.DefaultFCCManagerRole + } + roleRef := rbacv1.RoleRef{ Kind: "ClusterRole", APIGroup: rbacv1.GroupName, - Name: "fleetconfig-controller-manager-role", // TODO @arturshadnik get this some other way + Name: roleName, } err := r.doBind(ctx, roleRef, spoke.Namespace, spoke.Name) @@ -355,7 +360,7 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke if len(spoke.Status.EnabledAddons) > 0 { // Wait for addon manifestWorks to be fully cleaned up before proceeding with unjoin - if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout); err != nil { + if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout, spoke.IsHubAsSpoke()); err != nil { spoke.SetConditions(true, v1beta1.NewCondition( err.Error(), v1beta1.AddonsConfigured, metav1.ConditionTrue, metav1.ConditionFalse, )) @@ -390,32 +395,30 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke } // at this point, klusterlet-work-agent is uninstalled, so nothing can remove this finalizer. 
all resources are cleaned up by the spoke's controller, so to prevent a dangling mw/namespace, we remove the finalizer manually - mw, err := workC.WorkV1().ManifestWorks(spoke.Name).Get(ctx, "addon-fleetconfig-controller-manager-deploy-0", metav1.GetOptions{}) // TODO @arturshadnik - use a label maybe - if err != nil && !kerrs.IsNotFound(err) { - return err - } - mw.Finalizers = slices.DeleteFunc(mw.Finalizers, func(s string) bool { - return s == "cluster.open-cluster-management.io/manifest-work-cleanup" - }) - - patchBytes, err := json.Marshal(map[string]any{ - "metadata": map[string]any{ - "finalizers": mw.Finalizers, - }, - }) + mwList, err := workC.WorkV1().ManifestWorks(spoke.Name).List(ctx, metav1.ListOptions{LabelSelector: fccAddOnManifestWorkLabel}) if err != nil { return err } + for _, mw := range mwList.Items { + patchBytes, err := json.Marshal(map[string]any{ + "metadata": map[string]any{ + "finalizers": nil, + }, + }) + if err != nil { + return err + } - _, err = workC.WorkV1().ManifestWorks(spoke.Name).Patch( - ctx, - "addon-fleetconfig-controller-manager-deploy-0", - types.MergePatchType, - patchBytes, - metav1.PatchOptions{}, - ) - if err != nil && !kerrs.IsNotFound(err) { - return err + _, err = workC.WorkV1().ManifestWorks(spoke.Name).Patch( + ctx, + mw.Name, + types.MergePatchType, + patchBytes, + metav1.PatchOptions{}, + ) + if err != nil && !kerrs.IsNotFound(err) { + return err + } } // remove ManagedCluster @@ -451,6 +454,7 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo if err != nil { return err } + spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { return s == v1beta1.SpokeCleanupFinalizer }) diff --git a/fleetconfig-controller/test/data/fleetconfig-values.yaml b/fleetconfig-controller/test/data/fleetconfig-values.yaml index e2a61e23..6899c5c0 100644 --- a/fleetconfig-controller/test/data/fleetconfig-values.yaml +++ b/fleetconfig-controller/test/data/fleetconfig-values.yaml @@ -39,6 +39,7 @@ fleetConfig: namespace: fleetconfig-system addOns: - configName: test-addon + - configName: fleetconfig-controller-manager createNamespace: true syncLabels: false kubeconfig: diff --git a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go index 134628f0..cd67ad4c 100644 --- a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go +++ b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go @@ -60,6 +60,9 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { BeforeAll(func() { tc = setupTestEnvironment() + By("loading the fcc image into the spoke cluster") + Expect(utils.DevspaceRunPipeline(tc.ctx, tc.spokeKubeconfig, "load-local", fcNamespace, "v1beta1")).To(Succeed()) + By("deploying fleetconfig") Expect(utils.DevspaceRunPipeline(tc.ctx, tc.hubKubeconfig, "deploy-local", fcNamespace, "v1beta1")).To(Succeed()) }) @@ -248,6 +251,7 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { } else if err != nil { utils.WarnError(err, "failed to check if Hub was deleted") } + fmt.Println(hubClone.Status) return errors.New("hub still exists") }, ) From c2a19c2b63a6d9109c5b1ee41356d2786b58b33d Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 25 Sep 2025 13:40:01 -0700 Subject: [PATCH 03/62] feat: add new pivot condition; update tests Signed-off-by: Artur Shad Nik --- fleetconfig-controller/api/v1beta1/constants.go | 5 ++++- .../controller/v1beta1/spoke_controller.go | 3 +++ 
.../controller/v1beta1/spoke_controller_test.go | 8 +++++--- .../internal/controller/v1beta1/spoke_handler.go | 7 +++++++ .../internal/webhook/v1beta1/validation.go | 5 +---- .../internal/webhook/v1beta1/validation_test.go | 9 +++++---- fleetconfig-controller/test/e2e/helper.go | 1 + .../test/e2e/v1beta1_hub_spoke.go | 14 +++++++------- 8 files changed, 33 insertions(+), 19 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index adde7acb..e4a3463b 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -24,8 +24,11 @@ const ( // CleanupFailed means that a failure occurred during cleanup. CleanupFailed = "CleanupFailed" - // SpokeJoined means that the spoke has successfully joined the Hub. + // SpokeJoined means that the Spoke has successfully joined the Hub. SpokeJoined = "SpokeJoined" + + // PivotComplete means that the spoke cluster has successfully started managing itself. + PivotComplete = "PivotComplete" ) // Hub and Spoke condition reasons diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index 91077180..a736babf 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -148,6 +148,9 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl v1beta1.NewCondition( v1beta1.AddonsConfigured, v1beta1.AddonsConfigured, metav1.ConditionFalse, metav1.ConditionFalse, ), + v1beta1.NewCondition( + v1beta1.PivotComplete, v1beta1.PivotComplete, metav1.ConditionFalse, metav1.ConditionFalse, + ), } spoke.SetConditions(false, initConditions...) 
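For context, a minimal sketch of how the new PivotComplete condition can be gated on, mirroring the GetCondition pattern used elsewhere in the controller; it assumes Spoke exposes the same GetCondition helper as Hub (not shown in this patch):

	// Minimal sketch, assuming Spoke has a GetCondition helper analogous to Hub's.
	if cond := spoke.GetCondition(v1beta1.PivotComplete); cond == nil || cond.Status != metav1.ConditionTrue {
		// The spoke-side agent has not yet reported that it manages the cluster; requeue and retry.
		return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil)
	}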
diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go index 3daf7e68..503bcfed 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go @@ -71,9 +71,10 @@ var _ = Describe("Spoke Controller", Ordered, func() { Namespace: "default", } spokeReconciler = &SpokeReconciler{ - Client: k8sClient, - Log: logr.Logger{}, - Scheme: k8sClient.Scheme(), + Client: k8sClient, + Log: logr.Logger{}, + Scheme: k8sClient.Scheme(), + ClusterType: v1beta1.ClusterTypeHub, } spoke = &v1beta1.Spoke{ ObjectMeta: metav1.ObjectMeta{ @@ -131,6 +132,7 @@ var _ = Describe("Spoke Controller", Ordered, func() { v1beta1.SpokeJoined: metav1.ConditionFalse, v1beta1.CleanupFailed: metav1.ConditionFalse, v1beta1.AddonsConfigured: metav1.ConditionFalse, + v1beta1.PivotComplete: metav1.ConditionFalse, })).To(Succeed()) }) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 05724b02..c65fb91a 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -164,6 +164,10 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h spoke.SetConditions(true, v1beta1.NewCondition( "Joined", v1beta1.SpokeJoined, metav1.ConditionTrue, metav1.ConditionTrue, )) + // do not mark the Spoke "Running" until the spoke fcc agent has begun managing it + spoke.SetConditions(true, v1beta1.NewCondition( + "WaitingForSpokeAgent", v1beta1.PivotComplete, metav1.ConditionFalse, metav1.ConditionTrue, + )) // Label the spoke ManagedCluster if in hub-as-spoke mode. 
// This allows the 'spoke' ManagedClusterSet to omit the hub-as-spoke cluster from its list @@ -307,6 +311,9 @@ func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, } spoke.Status.KlusterletHash = currKlusterletHash + spoke.SetConditions(true, v1beta1.NewCondition( + "WaitingForSpokeAgent", v1beta1.PivotComplete, metav1.ConditionTrue, metav1.ConditionTrue, + )) return nil } diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index 876896b4..133848c0 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -102,7 +102,6 @@ func allowHubUpdate(oldHub, newHub *v1beta1.Hub) error { // - spec.addOns // - spec.timeout // - spec.logVerbosity -// - spec.hubRed func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { if !reflect.DeepEqual(newSpoke.Spec, oldSpoke.Spec) { oldSpokeCopy := oldSpoke.Spec.DeepCopy() @@ -119,11 +118,9 @@ func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { newSpokeCopy.LogVerbosity = 0 oldSpokeCopy.Timeout = 0 newSpokeCopy.Timeout = 0 - oldSpokeCopy.HubRef = v1beta1.HubRef{} - newSpokeCopy.HubRef = v1beta1.HubRef{} if !reflect.DeepEqual(oldSpokeCopy, newSpokeCopy) { - return errors.New("spoke contains changes which are not allowed; only changes to spec.hubRef, spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke") + return errors.New("spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke") } } diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go index 5391c536..e9c2b037 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go @@ -426,7 +426,8 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, }, - wantErr: false, + wantErr: true, + errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", }, { name: "disallowed - CreateNamespace change", @@ -441,7 +442,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.hubRef, spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", }, { name: "disallowed - klusterlet mode change", @@ -460,7 +461,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.hubRef, spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: "spoke contains changes which are not allowed; only changes to 
spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", }, { name: "disallowed - klusterlet feature gates change", @@ -479,7 +480,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.hubRef, spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", }, { name: "multiple allowed changes", diff --git a/fleetconfig-controller/test/e2e/helper.go b/fleetconfig-controller/test/e2e/helper.go index 84892685..03bdfc52 100644 --- a/fleetconfig-controller/test/e2e/helper.go +++ b/fleetconfig-controller/test/e2e/helper.go @@ -485,6 +485,7 @@ func ensureHubAndSpokesProvisioned(tc *E2EContext, hub *v1beta1.Hub, spokes []*v "SpokeJoined": metav1.ConditionTrue, "CleanupFailed": metav1.ConditionFalse, "AddonsConfigured": metav1.ConditionTrue, + "PivotComplete": metav1.ConditionTrue, } for k, v := range extraExpectedConditions { hubExpectedConditions[k] = v diff --git a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go index cd67ad4c..cc4ca5db 100644 --- a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go +++ b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go @@ -63,7 +63,7 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { By("loading the fcc image into the spoke cluster") Expect(utils.DevspaceRunPipeline(tc.ctx, tc.spokeKubeconfig, "load-local", fcNamespace, "v1beta1")).To(Succeed()) - By("deploying fleetconfig") + By("deploying fleetconfig-controller") Expect(utils.DevspaceRunPipeline(tc.ctx, tc.hubKubeconfig, "deploy-local", fcNamespace, "v1beta1")).To(Succeed()) }) @@ -71,7 +71,7 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { teardownTestEnvironment(tc) }) - // Tests FleetConfig operations with ResourceCleanup feature gate enabled, verifying: + // Tests Hub and Spoke operations with ResourceCleanup feature gate enabled, verifying: // 1. Cluster joining (spoke and hub-as-spoke) to the hub // 2. Addon configuration on hub and installation on spoke // 3. ManifestWork creation in hub-as-spoke namespace and namespace creation validation @@ -83,11 +83,11 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { Context("deploy and teardown Hub and Spokes with ResourceCleanup feature gate enabled", func() { It("should join the spoke and hub-as-spoke clusters to the hub", func() { - // NOTE: The FleetConfig CR is created by devspace when the fleetconfig-controller chart is installed. + // NOTE: The Hub and Spoke CRs are created by devspace when the fleetconfig-controller chart is installed. // Its configuration is defined via the fleetConfig values. 
ensureHubAndSpokesProvisioned(tc, hub, []*v1beta1.Spoke{spoke, hubAsSpoke}, nil) - By("cloning the FleetConfig resources for further scenarios") + By("cloning the Hub and Spoke resources for further scenarios") err := utils.CloneHub(hub, hubClone) Expect(err).NotTo(HaveOccurred()) err = utils.CloneSpoke(spoke, spokeClone) @@ -129,7 +129,7 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { }, 2*time.Minute, 10*time.Second).Should(Succeed()) }) - It("should not allow changes to the FleetConfig resource", func() { + It("should not allow changes to the Hub resource", func() { By("failing to patch the Hub's feature gates") hub, err := utils.GetHub(tc.ctx, tc.kClient, v1beta1hubNN) @@ -203,12 +203,12 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { ExpectWithOffset(1, tc.kClient.Delete(tc.ctx, hubClone)).To(Succeed()) EventuallyWithOffset(1, func() error { if err := tc.kClient.Get(tc.ctx, v1beta1hubNN, hubClone); err != nil { - utils.WarnError(err, "failed to get FleetConfig") + utils.WarnError(err, "failed to get Hub") return err } if hubClone.Status.Phase != v1beta1.Deleting { err := fmt.Errorf("expected %s, got %s", v1beta1.Deleting, hubClone.Status.Phase) - utils.WarnError(err, "FleetConfig deletion not started") + utils.WarnError(err, "Hub deletion not started") return err } conditions := make([]metav1.Condition, len(hubClone.Status.Conditions)) From da104f7cbcb49b04acdb31a32f61f5302eaeb65e Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 25 Sep 2025 14:29:37 -0700 Subject: [PATCH 04/62] test: expand e2e coverage to valdiate full spoke cleanup Signed-off-by: Artur Shad Nik --- fleetconfig-controller/README.md | 2 +- fleetconfig-controller/cmd/manager/manager.go | 2 +- .../controller/v1beta1/spoke_controller.go | 2 +- .../controller/v1beta1/spoke_handler.go | 9 ++-- fleetconfig-controller/test/e2e/helper.go | 2 + .../test/e2e/v1beta1_hub_spoke.go | 41 ++++++++++++++++++- 6 files changed, 48 insertions(+), 10 deletions(-) diff --git a/fleetconfig-controller/README.md b/fleetconfig-controller/README.md index 2d555a1c..ce815076 100644 --- a/fleetconfig-controller/README.md +++ b/fleetconfig-controller/README.md @@ -40,7 +40,7 @@ Support for orchestration of OCM multi-clusters varies based on the Kubernetes d ### Onboarding -To familiarize yourself with the `FleetConfig` API and the `fleetconfig-controller`, we recommend doing one or more of the following onboarding steps. +To familiarize yourself with the `Hub` and `Spoke` APIs and the `fleetconfig-controller`, we recommend doing one or more of the following onboarding steps. 1. Step through a [smoke test](./docs/smoketests.md) 1. 
Invoke the [end-to-end tests](./test/e2e/fleetconfig.go) and inspect the content of the kind clusters that the E2E suite automatically creates diff --git a/fleetconfig-controller/cmd/manager/manager.go b/fleetconfig-controller/cmd/manager/manager.go index c5b41c2e..5cddf6a5 100644 --- a/fleetconfig-controller/cmd/manager/manager.go +++ b/fleetconfig-controller/cmd/manager/manager.go @@ -251,7 +251,7 @@ func ForSpoke(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { } if err := (&controllerv1beta1.SpokeReconciler{ - Client: mgr.GetClient(), // Uses the manager's client which has the correct scheme and hub config + Client: mgr.GetClient(), Log: ctrl.Log.WithName("controllers").WithName("Spoke"), ConcurrentReconciles: opts.SpokeConcurrentReconciles, Scheme: mgr.GetScheme(), diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index a736babf..916caa8b 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -149,7 +149,7 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl v1beta1.AddonsConfigured, v1beta1.AddonsConfigured, metav1.ConditionFalse, metav1.ConditionFalse, ), v1beta1.NewCondition( - v1beta1.PivotComplete, v1beta1.PivotComplete, metav1.ConditionFalse, metav1.ConditionFalse, + v1beta1.PivotComplete, v1beta1.PivotComplete, metav1.ConditionFalse, metav1.ConditionTrue, ), } spoke.SetConditions(false, initConditions...) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index c65fb91a..564bb401 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -164,10 +164,6 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h spoke.SetConditions(true, v1beta1.NewCondition( "Joined", v1beta1.SpokeJoined, metav1.ConditionTrue, metav1.ConditionTrue, )) - // do not mark the Spoke "Running" until the spoke fcc agent has begun managing it - spoke.SetConditions(true, v1beta1.NewCondition( - "WaitingForSpokeAgent", v1beta1.PivotComplete, metav1.ConditionFalse, metav1.ConditionTrue, - )) // Label the spoke ManagedCluster if in hub-as-spoke mode. // This allows the 'spoke' ManagedClusterSet to omit the hub-as-spoke cluster from its list @@ -312,7 +308,7 @@ func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, spoke.Status.KlusterletHash = currKlusterletHash spoke.SetConditions(true, v1beta1.NewCondition( - "WaitingForSpokeAgent", v1beta1.PivotComplete, metav1.ConditionTrue, metav1.ConditionTrue, + v1beta1.PivotComplete, v1beta1.PivotComplete, metav1.ConditionTrue, metav1.ConditionTrue, )) return nil } @@ -472,6 +468,9 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo } // "self-destruct" any remaining namespaces/resources + // TODO - instead of deleting the namespace etc, can we delete the appliedManifestWork? 
+ // name: 43967cf4fa7b6c9c1f4014eb104077ac73d86aabc3d2c4ae6c51babdf5898540-addon-fleetconfig-controller-manager-deploy-0 + // no labels, no owner ref operatorClient, err := common.OperatorClient(spokeKubeconfig) if err != nil { return err diff --git a/fleetconfig-controller/test/e2e/helper.go b/fleetconfig-controller/test/e2e/helper.go index 03bdfc52..f0321e05 100644 --- a/fleetconfig-controller/test/e2e/helper.go +++ b/fleetconfig-controller/test/e2e/helper.go @@ -54,6 +54,8 @@ var ( v1beta1spokeNN = ktypes.NamespacedName{Name: "spoke", Namespace: fcNamespace} v1beta1hubAsSpokeNN = ktypes.NamespacedName{Name: "hub-as-spoke", Namespace: fcNamespace} + v1beta1fccAddOnAgentNN = ktypes.NamespacedName{Name: "fleetconfig-controller-manager", Namespace: fcNamespace} + // global test variables klusterletNN = ktypes.NamespacedName{Name: "klusterlet"} diff --git a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go index cc4ca5db..6dd59171 100644 --- a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go +++ b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go @@ -24,6 +24,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" kerrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -149,7 +150,6 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { }) It("should clean up the hub cluster", func() { - By("ensuring the spoke is deregistered properly") EventuallyWithOffset(1, func() error { By("ensuring the Spoke resource is deleted") @@ -196,8 +196,45 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { utils.WarnError(err, "ManagedCluster namespace still exists") return err } + + By("ensuring the spoke agent is uninstalled and ocm resources are cleaned up") + deploy := &appsv1.Deployment{} + err = tc.kClientSpoke.Get(tc.ctx, v1beta1fccAddOnAgentNN, deploy) + if err != nil { + if !kerrs.IsNotFound(err) { + return err + } + utils.Info("fleetconfig-controller addon agent deleted successfully") + } else { + err := errors.New("fleetconfig-controller addon agent not deleted yet") + utils.WarnError(err, "fleetconfig-controller addon agent still exists") + return err + } + + namespacesToDelete := []string{ + "open-cluster-management-agent", + "open-cluster-management-agent-addon", + "open-cluster-management", + fcNamespace, + } + for _, n := range namespacesToDelete { + spokeNs := &corev1.Namespace{} + err = tc.kClientSpoke.Get(tc.ctx, ktypes.NamespacedName{Name: n}, spokeNs) + if err != nil { + if !kerrs.IsNotFound(err) { + return err + } + utils.Info(fmt.Sprintf("namespace %s deleted successfully", n)) + } else { + err := fmt.Errorf("namespace %s not deleted yet", n) + utils.WarnError(err, "namespace still exists") + return err + } + + } + return nil - }) + }, 5*time.Minute, 10*time.Second).Should(Succeed()) By("deleting the Hub and ensuring that it isn't deleted until the ManifestWork is deleted") ExpectWithOffset(1, tc.kClient.Delete(tc.ctx, hubClone)).To(Succeed()) From 282d2f6206eb20cb3510609dd018ff7fa3e7c9a1 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 25 Sep 2025 15:36:40 -0700 Subject: [PATCH 05/62] chore: rabbit Signed-off-by: Artur Shad Nik --- .../templates/deployment.yaml | 2 +- .../ocm/fcc-addon/addon-template.yaml | 3 +-- .../fcc-addon/cluster-management-addon.yaml | 2 +- fleetconfig-controller/cmd/main.go | 4 ++-- fleetconfig-controller/cmd/manager/manager.go | 
13 ++++++++-- .../devspace-start-spoke.sh | 4 ++-- .../internal/controller/v1beta1/addon.go | 24 +++++++++++-------- .../controller/v1beta1/spoke_controller.go | 2 +- .../internal/webhook/v1beta1/validation.go | 5 ++-- 9 files changed, 35 insertions(+), 24 deletions(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml index ae0ad0d0..b53671f1 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml @@ -36,9 +36,9 @@ spec: - "--use-webhook=true" - "--webhook-port={{ .Values.webhookService.port }}" - "--webhook-cert-dir={{ .Values.admissionWebhooks.certificate.mountPath }}" + {{- end }} - "--spoke-concurrent-reconciles={{ .Values.spokeConcurrentReconciles }}" - "--cluster-type=hub" - {{- end }} command: - /manager env: diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml index 1eaefff8..816542da 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -47,6 +47,7 @@ spec: - args: - "--leader-elect" - "--health-probe-bind-address=:{{ .Values.healthCheck.port }}" + - "--use-webhook=false" - "--spoke-concurrent-reconciles=1" - "--cluster-type=spoke" command: @@ -84,7 +85,6 @@ spec: apiVersion: rbac.authorization.k8s.io/v1 metadata: name: {{ include "chart.fullname" . }}-manager-spoke-role - namespace: {{ .Release.Namespace }} rules: - apiGroups: [""] resources: ["namespaces"] @@ -150,7 +150,6 @@ spec: apiGroup: rbac.authorization.k8s.io kind: Role name: {{ include "chart.fullname" . }}-leader-election-role - namespace: {{ .Release.Namespace }} subjects: - kind: ServiceAccount name: {{ include "chart.fullname" . }}-manager diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml index 1c1403cd..4ff2ef10 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml @@ -9,7 +9,7 @@ spec: displayName: FleetConfig Controller Addon description: | fleetconfig-controller-manager is an addon to deploy fleetconfig-controller manager on the managed cluster. - It is used to enable decentalized management of spoke clusters. + It is used to enable decentralized management of spoke clusters. 
supportedConfigs: - group: addon.open-cluster-management.io resource: addontemplates diff --git a/fleetconfig-controller/cmd/main.go b/fleetconfig-controller/cmd/main.go index db0288de..3e3d6f62 100644 --- a/fleetconfig-controller/cmd/main.go +++ b/fleetconfig-controller/cmd/main.go @@ -64,8 +64,8 @@ func main() { flag.BoolVar(&mOpts.EnableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers.") flag.BoolVar(&mOpts.UseWebhook, "use-webhook", mOpts.UseWebhook, "Enable admission webhooks") - flag.StringVar(&mOpts.CertDir, "webhook-cert-dir", mOpts.CertDir, "Admission webhook cert/key dir") - flag.IntVar(&mOpts.WebhookPort, "webhook-port", mOpts.WebhookPort, "Admission webhook port") + flag.StringVar(&mOpts.CertDir, "webhook-cert-dir", "/etc/k8s-webhook-certs", "Admission webhook cert/key dir") + flag.IntVar(&mOpts.WebhookPort, "webhook-port", 9443, "Admission webhook port") flag.IntVar(&mOpts.SpokeConcurrentReconciles, "spoke-concurrent-reconciles", apiv1beta1.SpokeDefaultMaxConcurrentReconciles, fmt.Sprintf("Maximum number of Spoke resources that may be reconciled in parallel. Defaults to %d.", apiv1beta1.SpokeDefaultMaxConcurrentReconciles)) flag.StringVar(&mOpts.ClusterType, "cluster-type", apiv1beta1.ClusterTypeHub, "The type of cluster that this controller instance is installed in.") diff --git a/fleetconfig-controller/cmd/manager/manager.go b/fleetconfig-controller/cmd/manager/manager.go index 5cddf6a5..1bf6a0e5 100644 --- a/fleetconfig-controller/cmd/manager/manager.go +++ b/fleetconfig-controller/cmd/manager/manager.go @@ -7,7 +7,6 @@ import ( "fmt" "os" "path/filepath" - "strings" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" @@ -181,7 +180,17 @@ func ForSpoke(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { } spokeNamespace := os.Getenv(apiv1beta1.SpokeNamespaceEnvVar) + if spokeNamespace == "" { + err = fmt.Errorf("CLUSTER_NAMESPACE environment variable must be set") + setupLog.Error(err, "unable to create controller", "controller", "Spoke") + return nil, err + } hubNamespace := os.Getenv(apiv1beta1.HubNamespaceEnvVar) + if hubNamespace == "" { + err = fmt.Errorf("HUB_NAMESPACE environment variable must be set") + setupLog.Error(err, "unable to create controller", "controller", "Spoke") + return nil, err + } mgr, err := ctrl.NewManager(hubRestCfg, ctrl.Options{ Scheme: opts.Scheme, @@ -269,7 +278,7 @@ func getHubRestConfig() (*rest.Config, error) { hubKubeconfigPath = apiv1beta1.DefaultHubKubeconfigPath } - basePath := strings.TrimSuffix(hubKubeconfigPath, "kubeconfig") + basePath := filepath.Dir(hubKubeconfigPath) certPath := filepath.Join(basePath, "tls.crt") keyPath := filepath.Join(basePath, "tls.key") diff --git a/fleetconfig-controller/devspace-start-spoke.sh b/fleetconfig-controller/devspace-start-spoke.sh index ffb35f16..e2059cd8 100755 --- a/fleetconfig-controller/devspace-start-spoke.sh +++ b/fleetconfig-controller/devspace-start-spoke.sh @@ -24,8 +24,8 @@ This is how you can work with it: If you wish to run the fleetconfig controller manager in debug mode with delve, run: \`${COLOR_CYAN}${DEBUG_CMD}${COLOR_RESET}\` - Wait until the \`${COLOR_CYAN}API server listening at: [::]:2344${COLOR_RESET}\` message appears - Start the \"Debug (localhost:2344)\" configuration in VSCode to connect your debugger session. + Wait until the \`${COLOR_CYAN}API server listening at: [::]:2345${COLOR_RESET}\` message appears + Start the \"Debug (localhost:2345)\" configuration in VSCode to connect your debugger session. 
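A note on the getHubRestConfig change above: a minimal, self-contained sketch (paths are illustrative only, not taken from the deployment) of why filepath.Dir is the safer way to derive the directory holding the tls.crt/tls.key siblings of the mounted hub kubeconfig, compared with trimming the literal "kubeconfig" suffix.

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

func main() {
	// Typical mount path: both forms agree here, apart from the trailing slash.
	p := "/managed/hub-kubeconfig/kubeconfig"
	fmt.Println(strings.TrimSuffix(p, "kubeconfig")) // "/managed/hub-kubeconfig/"
	fmt.Println(filepath.Dir(p))                     // "/managed/hub-kubeconfig"

	// Hypothetical mount whose file is not literally named "kubeconfig":
	// TrimSuffix is a no-op, so tls.crt/tls.key would be joined onto the file
	// itself, while filepath.Dir still returns the containing directory.
	q := "/managed/hub-kubeconfig/admin.conf"
	fmt.Println(strings.TrimSuffix(q, "kubeconfig")) // "/managed/hub-kubeconfig/admin.conf"
	fmt.Println(filepath.Dir(q))                     // "/managed/hub-kubeconfig"
}
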
${COLOR_CYAN}Note:${COLOR_RESET} fleetconfig controller manager won't start until you connect with the debugger. ${COLOR_CYAN}Note:${COLOR_RESET} fleetconfig controller manager will be stopped once you detach your debugger session. diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 0888aa9c..7ca0f188 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -410,7 +410,7 @@ func patchFCCMca(ctx context.Context, spokeName string, addonC *addonapi.Clients if err != nil { return fmt.Errorf("failed to configure fleetconfig-controller-manager: %v", err) } - mca.Spec.Configs = append(mca.Spec.Configs, addonv1alpha1.AddOnConfig{ + desired := addonv1alpha1.AddOnConfig{ ConfigGroupResource: addonv1alpha1.ConfigGroupResource{ Group: addonv1alpha1.GroupName, Resource: AddOnDeploymentConfigsKind, @@ -419,26 +419,30 @@ func patchFCCMca(ctx context.Context, spokeName string, addonC *addonapi.Clients Name: v1beta1.FCCAddOnName, Namespace: spokeName, }, - }) - + } + if slices.ContainsFunc(mca.Spec.Configs, func(c addonv1alpha1.AddOnConfig) bool { + return c.Group == desired.Group && + c.Resource == desired.Resource && + c.Name == desired.Name && + c.Namespace == desired.Namespace + }) { + return nil + } + mca.Spec.Configs = append(mca.Spec.Configs, desired) patchBytes, err := json.Marshal(map[string]any{ - "spec": map[string]any{ - "configs": mca.Spec.Configs, - }, + "spec": map[string]any{"configs": mca.Spec.Configs}, }) if err != nil { return fmt.Errorf("failed to marshal patch for fleetconfig-controller-manager: %v", err) } - _, err = addonC.AddonV1alpha1().ManagedClusterAddOns(spokeName).Patch( + if _, err = addonC.AddonV1alpha1().ManagedClusterAddOns(spokeName).Patch( ctx, v1beta1.FCCAddOnName, types.MergePatchType, patchBytes, metav1.PatchOptions{}, - ) - if err != nil { + ); err != nil { return fmt.Errorf("failed to patch fleetconfig-controller-manager: %v", err) - } return nil } diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index 916caa8b..58d8886b 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -246,7 +246,7 @@ func (r *SpokeReconciler) SetupWithManagerForHub(mgr ctrl.Manager) error { // SetupWithManagerForSpoke sets up the controller with the Manager to run on a Spoke cluster. func (r *SpokeReconciler) SetupWithManagerForSpoke(mgr ctrl.Manager) error { - spokeName := os.Getenv(v1beta1.SpokeNameEnvVar) + spokeName := os.Getenv(v1beta1.SpokeNameEnvVar) // we know this is set, because the mgr setup would have failed otherwise return ctrl.NewControllerManagedBy(mgr). 
For(&v1beta1.Spoke{}, builder.WithPredicates(predicate.NewPredicateFuncs( diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index 133848c0..15c87549 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -23,7 +23,6 @@ import ( const ( warnHubNotFound = "hub not found, cannot validate spoke addons" - fccAddOnName = "fleetconfig-controller-manager" ) func isKubeconfigValid(kubeconfig v1beta1.Kubeconfig) (bool, string) { @@ -348,13 +347,13 @@ func validateAddons(ctx context.Context, cli client.Client, newObject *v1beta1.S if newObject.IsHubAsSpoke() { if slices.ContainsFunc(newObject.Spec.AddOns, func(a v1beta1.AddOn) bool { - return a.ConfigName == fccAddOnName + return a.ConfigName == v1beta1.FCCAddOnName }) { errs = append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "hub-as-spoke Spoke cannot enable fleetconfig-controller-manager addon")) } } else { if !slices.ContainsFunc(newObject.Spec.AddOns, func(a v1beta1.AddOn) bool { - return a.ConfigName == fccAddOnName + return a.ConfigName == v1beta1.FCCAddOnName }) { errs = append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "Spoke must enable fleetconfig-controller-manager addon")) } From d863fed59415cef14b19ae77c8a14a25e3f63f0a Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 25 Sep 2025 18:37:32 -0700 Subject: [PATCH 06/62] chore: rabbit nits and DRY Signed-off-by: Artur Shad Nik --- .../api/v1beta1/constants.go | 2 +- .../templates/deployment.yaml | 2 +- .../ocm/fcc-addon/addon-template.yaml | 1 - fleetconfig-controller/cmd/main.go | 2 +- fleetconfig-controller/cmd/manager/manager.go | 32 +++++-------------- .../config/devspace/hub/manager.yaml | 2 ++ fleetconfig-controller/devspace.yaml | 4 +-- .../internal/controller/v1beta1/addon.go | 2 +- .../internal/controller/v1beta1/constants.go | 4 +-- .../internal/webhook/v1beta1/validation.go | 14 ++++---- .../webhook/v1beta1/validation_test.go | 6 ++-- 11 files changed, 28 insertions(+), 43 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index e4a3463b..7f3d8b1a 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -3,7 +3,7 @@ package v1beta1 import "k8s.io/apimachinery/pkg/labels" const ( - // HubCleanupPreflightFinalizer is the finalizer for cleanup preflight checks hub cluster's controller instance. Used to signal to the spoke's controller than unjoin can proceed. + // HubCleanupPreflightFinalizer is the finalizer for cleanup preflight checks hub cluster's controller instance. Used to signal to the spoke's controller that unjoin can proceed. HubCleanupPreflightFinalizer = "fleetconfig.open-cluster-management.io/hub-cleanup-preflight" // HubCleanupFinalizer is the finalizer for cleanup by the hub cluster's controller instance. 
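The two cleanup finalizers touched above coordinate deletion between the controller instance on the hub and the addon agent on the spoke. Below is a condensed, self-contained sketch of that handshake; the spoke type is a stand-in for v1beta1.Spoke and the reconciler bodies are reduced to just the waiting checks (the full logic lives in doHubCleanup and doSpokeCleanup in spoke_handler.go later in this series).

package sketch

import "slices"

// spoke is a stand-in for v1beta1.Spoke; only the finalizer list matters here.
type spoke struct{ Finalizers []string }

const (
	hubCleanupPreflightFinalizer = "fleetconfig.open-cluster-management.io/hub-cleanup-preflight"
	spokeCleanupFinalizer        = "fleetconfig.open-cluster-management.io/spoke-cleanup"
)

// hubMustWait mirrors the check in doHubCleanup: after its preflight, the hub
// requeues while the spoke's controller still holds its finalizer, i.e. until
// the agent has finished unjoining and deregistering the cluster.
func hubMustWait(s spoke) bool {
	return slices.Contains(s.Finalizers, spokeCleanupFinalizer)
}

// spokeMustWait mirrors the check in doSpokeCleanup: the agent requeues until
// the hub's preflight finalizer is gone, signalling that unjoin may proceed.
func spokeMustWait(s spoke) bool {
	return slices.Contains(s.Finalizers, hubCleanupPreflightFinalizer)
}
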
diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml index b53671f1..ee0673b6 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml @@ -57,8 +57,8 @@ spec: {{- toYaml .Values.resources | nindent 10 }} securityContext: {{- toYaml .Values.containerSecurityContext | nindent 10 }} - {{- if .Values.admissionWebhooks.enabled }} ports: + {{- if .Values.admissionWebhooks.enabled }} - containerPort: {{ .Values.webhookService.port }} name: webhook-server protocol: TCP diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml index 816542da..37c720f5 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -161,4 +161,3 @@ spec: - type: CurrentCluster currentCluster: clusterRoleName: {{ include "chart.fullname" . }}-manager-role - diff --git a/fleetconfig-controller/cmd/main.go b/fleetconfig-controller/cmd/main.go index 3e3d6f62..5ac12b12 100644 --- a/fleetconfig-controller/cmd/main.go +++ b/fleetconfig-controller/cmd/main.go @@ -97,7 +97,7 @@ func main() { os.Exit(1) } default: - setupLog.Info("unable to create controller for unknown cluster type", "controller", "Spoke", "clusterType", mOpts.ClusterType, "allowed", apiv1beta1.SupportedClusterTypes) + setupLog.Info("unable to create controller for unknown cluster type", "clusterType", mOpts.ClusterType, "allowed", apiv1beta1.SupportedClusterTypes) os.Exit(1) } diff --git a/fleetconfig-controller/cmd/manager/manager.go b/fleetconfig-controller/cmd/manager/manager.go index 1bf6a0e5..14eeb914 100644 --- a/fleetconfig-controller/cmd/manager/manager.go +++ b/fleetconfig-controller/cmd/manager/manager.go @@ -44,8 +44,7 @@ type Options struct { Scheme *runtime.Scheme } -// ForHub configures a manager instance for a Hub cluster. -func ForHub(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { +func setupServer(opts Options, setupLog logr.Logger) (webhook.Server, []func(*tls.Config)) { // if the EnableHTTP2 flag is false (the default), http/2 should be disabled // due to its vulnerabilities. More specifically, disabling http/2 will // prevent from being vulnerable to the HTTP/2 Stream Cancellation and @@ -67,7 +66,12 @@ func ForHub(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { Port: opts.WebhookPort, TLSOpts: tlsOpts, }) + return webhookServer, tlsOpts +} +// ForHub configures a manager instance for a Hub cluster. +func ForHub(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { + webhookServer, tlsOpts := setupServer(opts, setupLog) mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: opts.Scheme, Metrics: metricsserver.Options{ @@ -143,27 +147,7 @@ func ForHub(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { // ForSpoke configures a manager instance for a Spoke cluster. func ForSpoke(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { - // if the EnableHTTP2 flag is false (the default), http/2 should be disabled - // due to its vulnerabilities. 
More specifically, disabling http/2 will - // prevent from being vulnerable to the HTTP/2 Stream Cancellation and - // Rapid Reset CVEs. For more information see: - // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 - // - https://github.com/advisories/GHSA-4374-p667-p6c8 - disableHTTP2 := func(c *tls.Config) { - setupLog.Info("disabling http/2") - c.NextProtos = []string{"http/1.1"} - } - - tlsOpts := []func(*tls.Config){} - if !opts.EnableHTTP2 { - tlsOpts = append(tlsOpts, disableHTTP2) - } - - webhookServer := webhook.NewServer(webhook.Options{ - CertDir: opts.CertDir, - Port: opts.WebhookPort, - TLSOpts: tlsOpts, - }) + _, tlsOpts := setupServer(opts, setupLog) // enables watching resources in the hub cluster hubRestCfg, err := getHubRestConfig() @@ -238,7 +222,7 @@ func ForSpoke(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { }, }, - WebhookServer: webhookServer, + WebhookServer: nil, HealthProbeBindAddress: opts.ProbeAddr, LeaderElection: opts.EnableLeaderElection, LeaderElectionConfig: localRestCfg, // use local restConfig. alternatively, we can disable leader election if HA is not a concern. diff --git a/fleetconfig-controller/config/devspace/hub/manager.yaml b/fleetconfig-controller/config/devspace/hub/manager.yaml index 022c18ca..5400d89c 100644 --- a/fleetconfig-controller/config/devspace/hub/manager.yaml +++ b/fleetconfig-controller/config/devspace/hub/manager.yaml @@ -28,6 +28,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: ROLE_NAME + value: fleetconfig-controller-manager-role image: quay.io/open-cluster-management/fleetconfig-controller:dev imagePullPolicy: IfNotPresent ports: diff --git a/fleetconfig-controller/devspace.yaml b/fleetconfig-controller/devspace.yaml index 307a143f..2b7b4f2e 100644 --- a/fleetconfig-controller/devspace.yaml +++ b/fleetconfig-controller/devspace.yaml @@ -55,13 +55,13 @@ pipelines: debug: |- run_dependencies --all build_images fleetconfig-controller-dev - kubectl -n fleetconfig-system delete deployment fleetconfig-controller-manager + kubectl -n fleetconfig-system delete deployment fleetconfig-controller-manager --ignore-not-found create_deployments debug start_dev fleetconfig-controller-dev-hub debug-spoke: |- run_dependencies --all build_images fleetconfig-controller-dev - kubectl -n fleetconfig-system delete deployment fleetconfig-controller-manager + kubectl -n fleetconfig-system delete deployment fleetconfig-controller-manager --ignore-not-found create_deployments debug-spoke start_dev fleetconfig-controller-dev-spoke diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 7ca0f188..d44e6c32 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -413,7 +413,7 @@ func patchFCCMca(ctx context.Context, spokeName string, addonC *addonapi.Clients desired := addonv1alpha1.AddOnConfig{ ConfigGroupResource: addonv1alpha1.ConfigGroupResource{ Group: addonv1alpha1.GroupName, - Resource: AddOnDeploymentConfigsKind, + Resource: AddOnDeploymentConfigResource, }, ConfigReferent: addonv1alpha1.ConfigReferent{ Name: v1beta1.FCCAddOnName, diff --git a/fleetconfig-controller/internal/controller/v1beta1/constants.go b/fleetconfig-controller/internal/controller/v1beta1/constants.go index 2252233a..94f95013 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/constants.go +++ b/fleetconfig-controller/internal/controller/v1beta1/constants.go 
@@ -29,8 +29,8 @@ const ( addonArgoCD = "argocd" addonGPF = "governance-policy-framework" - managedClusterAddOn = "ManagedClusterAddOn" - AddOnDeploymentConfigsKind = "addondeploymentconfigs" + managedClusterAddOn = "ManagedClusterAddOn" + AddOnDeploymentConfigResource = "addondeploymentconfigs" addonCleanupTimeout = 1 * time.Minute addonCleanupPollInterval = 2 * time.Second diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index 15c87549..44da91a8 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -87,7 +87,7 @@ func allowHubUpdate(oldHub, newHub *v1beta1.Hub) error { newHubCopy.Kubeconfig = v1beta1.Kubeconfig{} if !reflect.DeepEqual(oldHubCopy, newHubCopy) { - return errors.New("only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, and spec.registrationAuth are allowed when updating the hub") + return errors.New("only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub") } } return nil @@ -364,21 +364,21 @@ func validateAddons(ctx context.Context, cli client.Client, newObject *v1beta1.S err := cli.Get(ctx, types.NamespacedName{Name: newObject.Spec.HubRef.Name, Namespace: newObject.Spec.HubRef.Namespace}, hub) if err != nil { if !kerrs.IsNotFound(err) { - return nil, field.ErrorList{field.InternalError(field.NewPath("spec").Child("addOns"), err)} + errs = append(errs, field.InternalError(field.NewPath("spec").Child("addOns"), err)) + return nil, errs } - // warn instead of an error, so we don't block creating spokes and hub at the same time - return admission.Warnings{warnHubNotFound}, nil + return admission.Warnings{warnHubNotFound}, errs } initCond := hub.GetCondition(v1beta1.HubInitialized) if initCond == nil || initCond.Status != metav1.ConditionTrue { - // warn instead of an error, so we don't block creating spokes and hub at the same time - return admission.Warnings{warnHubNotFound}, nil + return admission.Warnings{warnHubNotFound}, errs } cmaList, err := addonC.AddonV1alpha1().ClusterManagementAddOns().List(ctx, metav1.ListOptions{}) if err != nil { - return nil, field.ErrorList{field.InternalError(field.NewPath("spec").Child("addOns"), err)} + errs = append(errs, field.InternalError(field.NewPath("spec").Child("addOns"), err)) + return nil, errs } cmaNames := make([]string, len(cmaList.Items)) for i, cma := range cmaList.Items { diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go index e9c2b037..595f804b 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go @@ -219,7 +219,7 @@ func TestAllowHubUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, and spec.registrationAuth are allowed when updating the hub", + errMsg: "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub", }, { name: "disallowed - Force 
change", @@ -234,7 +234,7 @@ func TestAllowHubUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, and spec.registrationAuth are allowed when updating the hub", + errMsg: "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub", }, { name: "disallowed - ClusterManager non-source change", @@ -253,7 +253,7 @@ func TestAllowHubUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, and spec.registrationAuth are allowed when updating the hub", + errMsg: "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub", }, { name: "multiple allowed changes", From aaedc87d934b85f9711fbd29ab331e1433996c28 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 25 Sep 2025 20:55:50 -0700 Subject: [PATCH 07/62] feat: improve addon validation, add tests Signed-off-by: Artur Shad Nik --- .../api/v1beta1/constants.go | 12 +- .../internal/controller/v1beta1/addon.go | 4 +- .../internal/controller/v1beta1/constants.go | 8 - .../internal/webhook/v1beta1/validation.go | 24 +- .../webhook/v1beta1/validation_test.go | 241 +++++++++++++++++- 5 files changed, 274 insertions(+), 15 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index 7f3d8b1a..6a48d859 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -90,7 +90,7 @@ const ( // SpokeNamespaceEnvVar is the environment variable containing the namespace of the Spoke resource. SpokeNamespaceEnvVar = "CLUSTER_NAMESPACE" - // HubNamespaceEnvVar is the environment variable containing the namespace of the Spoke resource. + // HubNamespaceEnvVar is the environment variable containing the namespace of the Hub resource. HubNamespaceEnvVar = "HUB_NAMESPACE" // ControllerNamespaceEnvVar is the environment variable containing the namespace that the controller is deployed to. @@ -159,3 +159,13 @@ var ( // ManagedBySelector is a label selector for filtering add-on resources managed fleetconfig-controller. 
ManagedBySelector = labels.SelectorFromSet(labels.Set(ManagedByLabels)) ) + +const ( + AddonArgoCD = "argocd" + AddonGPF = "governance-policy-framework" +) + +var SupportedHubAddons = []string{ + AddonArgoCD, + AddonGPF, +} diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index d44e6c32..5eb26bf7 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -55,7 +55,7 @@ func getHubAddOns(ctx context.Context, addonC *addonapi.Clientset) ([]string, er var hubAddons []string for _, addon := range allClusterManagementAddOns.Items { - if slices.Contains(supportedHubAddons, addon.Name) && addon.Labels[v1beta1.LabelAddOnManagedBy] == "" { + if slices.Contains(v1beta1.SupportedHubAddons, addon.Name) && addon.Labels[v1beta1.LabelAddOnManagedBy] == "" { hubAddons = append(hubAddons, addon.Name) } } @@ -650,7 +650,7 @@ func handleHubAddonInstall(ctx context.Context, addonC *addonapi.Clientset, addo } // the argocd pull integration addon logs the entire helm template output including CRDs to stdout. // to prevent flooding the logs, overwrite it. - if addon.Name == addonArgoCD { + if addon.Name == v1beta1.AddonArgoCD { stdout = []byte("ArgoCD hub addon successfully installed") } logger.V(1).Info("installed hubAddon", "name", addon.Name, "output", string(stdout)) diff --git a/fleetconfig-controller/internal/controller/v1beta1/constants.go b/fleetconfig-controller/internal/controller/v1beta1/constants.go index 94f95013..a61ad4d2 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/constants.go +++ b/fleetconfig-controller/internal/controller/v1beta1/constants.go @@ -26,9 +26,6 @@ const ( uninstall = "uninstall" hubAddon = "hub-addon" - addonArgoCD = "argocd" - addonGPF = "governance-policy-framework" - managedClusterAddOn = "ManagedClusterAddOn" AddOnDeploymentConfigResource = "addondeploymentconfigs" @@ -37,8 +34,3 @@ const ( fccAddOnManifestWorkLabel = "open-cluster-management.io/addon-name=fleetconfig-controller-manager" ) - -var supportedHubAddons = []string{ - addonArgoCD, - addonGPF, -} diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index 44da91a8..22314b5e 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -149,15 +149,33 @@ func validateHubAddons(ctx context.Context, cli client.Client, oldObject, newObj func validateAddonUniqueness(newObject *v1beta1.Hub) field.ErrorList { errs := field.ErrorList{} + for i, ha := range newObject.Spec.HubAddOns { + if !slices.ContainsFunc(v1beta1.SupportedHubAddons, func(a string) bool { + return ha.Name == a + }) { + errs = append(errs, field.Invalid(field.NewPath("hubAddOns").Index(i), ha.Name, fmt.Sprintf("invalid hubAddOn name. 
must be one of %v", v1beta1.SupportedHubAddons))) + } + } + // Validate that AddOnConfig names are unique within the AddOnConfigs list - addOnConfigNames := make(map[string]int) + addOnConfigVersionedNames := make(map[string]int) for i, a := range newObject.Spec.AddOnConfigs { key := fmt.Sprintf("%s-%s", a.Name, a.Version) - if existingIndex, found := addOnConfigNames[key]; found { + if existingIndex, found := addOnConfigVersionedNames[key]; found { errs = append(errs, field.Invalid(field.NewPath("addOnConfigs").Index(i), key, fmt.Sprintf("duplicate addOnConfig %s (name-version) found at indices %d and %d", key, existingIndex, i))) } else { - addOnConfigNames[key] = i + addOnConfigVersionedNames[key] = i + } + } + + // Validate that AddOnConfig names are unique within the AddOnConfigs list + addOnConfigNames := make(map[string]int) + for i, a := range newObject.Spec.AddOnConfigs { + if _, found := addOnConfigNames[a.Name]; found { + continue + } else { + addOnConfigNames[a.Name] = i } } diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go index 595f804b..5663aaf9 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go @@ -409,7 +409,7 @@ func TestAllowSpokeUpdate(t *testing.T) { wantErr: false, }, { - name: "allowed - HubRef change", + name: "disallowed - HubRef change", oldSpoke: &v1beta1.Spoke{ Spec: v1beta1.SpokeSpec{ HubRef: v1beta1.HubRef{ @@ -536,3 +536,242 @@ func TestAllowSpokeUpdate(t *testing.T) { }) } } + +func TestValidateAddonUniqueness(t *testing.T) { + tests := []struct { + name string + hub *v1beta1.Hub + wantErrs int + errMsgs []string + }{ + { + name: "valid - no addons", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{}, + }, + wantErrs: 0, + }, + { + name: "valid - unique AddOnConfigs with different names", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + AddOnConfigs: []v1beta1.AddOnConfig{ + {Name: "addon1", Version: "v1.0.0"}, + {Name: "addon2", Version: "v1.0.0"}, + {Name: "addon3", Version: "v2.0.0"}, + }, + }, + }, + wantErrs: 0, + }, + { + name: "valid - same AddOnConfig name with different versions", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + AddOnConfigs: []v1beta1.AddOnConfig{ + {Name: "addon1", Version: "v1.0.0"}, + {Name: "addon1", Version: "v2.0.0"}, + {Name: "addon1", Version: "v3.0.0"}, + }, + }, + }, + wantErrs: 0, + }, + { + name: "valid - unique HubAddOns", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + HubAddOns: []v1beta1.HubAddOn{ + {Name: "argocd"}, + {Name: "governance-policy-framework"}, + }, + }, + }, + wantErrs: 0, + }, + { + name: "valid - no name conflicts between HubAddOns and AddOnConfigs", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + AddOnConfigs: []v1beta1.AddOnConfig{ + {Name: "addon1", Version: "v1.0.0"}, + {Name: "addon2", Version: "v2.0.0"}, + }, + HubAddOns: []v1beta1.HubAddOn{ + {Name: "argocd"}, + {Name: "governance-policy-framework"}, + }, + }, + }, + wantErrs: 0, + }, + { + name: "invalid - duplicate AddOnConfig name-version pairs", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + AddOnConfigs: []v1beta1.AddOnConfig{ + {Name: "addon1", Version: "v1.0.0"}, + {Name: "addon2", Version: "v2.0.0"}, + {Name: "addon1", Version: "v1.0.0"}, // duplicate + }, + }, + }, + wantErrs: 1, + errMsgs: []string{"duplicate addOnConfig addon1-v1.0.0 (name-version) found at indices"}, + }, + { + name: "invalid - multiple duplicate AddOnConfig name-version 
pairs", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + AddOnConfigs: []v1beta1.AddOnConfig{ + {Name: "addon1", Version: "v1.0.0"}, + {Name: "addon2", Version: "v2.0.0"}, + {Name: "addon1", Version: "v1.0.0"}, // duplicate + {Name: "addon2", Version: "v2.0.0"}, // duplicate + }, + }, + }, + wantErrs: 2, + errMsgs: []string{ + "duplicate addOnConfig addon1-v1.0.0 (name-version) found at indices", + "duplicate addOnConfig addon2-v2.0.0 (name-version) found at indices", + }, + }, + { + name: "invalid - duplicate HubAddOn names", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + HubAddOns: []v1beta1.HubAddOn{ + {Name: "argocd"}, + {Name: "governance-policy-framework"}, + {Name: "argocd"}, // duplicate + }, + }, + }, + wantErrs: 1, + errMsgs: []string{"duplicate hubAddOn name argocd found at indices"}, + }, + { + name: "invalid - multiple duplicate HubAddOn names", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + HubAddOns: []v1beta1.HubAddOn{ + {Name: "argocd"}, + {Name: "governance-policy-framework"}, + {Name: "argocd"}, // duplicate + {Name: "governance-policy-framework"}, // duplicate + }, + }, + }, + wantErrs: 2, + errMsgs: []string{ + "duplicate hubAddOn name argocd found at indices", + "duplicate hubAddOn name governance-policy-framework found at indices", + }, + }, + { + name: "invalid - name conflict between HubAddOn and AddOnConfig", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + AddOnConfigs: []v1beta1.AddOnConfig{ + {Name: "argocd", Version: "v1.0.0"}, + {Name: "addon2", Version: "v2.0.0"}, + }, + HubAddOns: []v1beta1.HubAddOn{ + {Name: "argocd"}, // conflicts with AddOnConfig + {Name: "governance-policy-framework"}, + }, + }, + }, + wantErrs: 1, + errMsgs: []string{"hubAddOn name argocd clashes with an existing addOnConfig name"}, + }, + { + name: "invalid - unsupported HubAddOn", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + AddOnConfigs: []v1beta1.AddOnConfig{}, + HubAddOns: []v1beta1.HubAddOn{ + {Name: "custom-addon"}, + }, + }, + }, + wantErrs: 1, + errMsgs: []string{"hubAddOn name argocd clashes with an existing addOnConfig name"}, + }, + { + name: "invalid - all types of conflicts combined", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + AddOnConfigs: []v1beta1.AddOnConfig{ + {Name: "addon1", Version: "v1.0.0"}, + {Name: "addon2", Version: "v2.0.0"}, + {Name: "addon1", Version: "v1.0.0"}, // duplicate name-version + {Name: "argocd", Version: "v1.0.0"}, + }, + HubAddOns: []v1beta1.HubAddOn{ + {Name: "argocd"}, + {Name: "argocd"}, // duplicate name, conflicts with AddOnConfig + }, + }, + }, + wantErrs: 4, + errMsgs: []string{ + "duplicate addOnConfig addon1-v1.0.0 (name-version) found at indices", + "duplicate hubAddOn name argocd found at indices", + "hubAddOn name shared-addon clashes with an existing addOnConfig name", + "hubAddOn name shared-addon clashes with an existing addOnConfig name", + }, + }, + { + name: "edge case - empty version defaults", + hub: &v1beta1.Hub{ + Spec: v1beta1.HubSpec{ + AddOnConfigs: []v1beta1.AddOnConfig{ + {Name: "addon1"}, // empty version + {Name: "addon1"}, // empty version, should be duplicate + }, + }, + }, + wantErrs: 1, + errMsgs: []string{"duplicate addOnConfig addon1- (name-version) found at indices"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + errs := validateAddonUniqueness(tt.hub) + + if len(errs) != tt.wantErrs { + t.Errorf("validateAddonUniqueness() returned %d errors, want %d", len(errs), tt.wantErrs) + for i, err := range errs { + t.Errorf(" Error %d: %s", i, err.Error()) + } + return + } + + 
// Check that each expected error message is present + for _, expectedMsg := range tt.errMsgs { + found := false + for _, err := range errs { + if err.Error() != "" && err.Error() != expectedMsg { + // For partial matching since error messages include indices + if len(expectedMsg) > 0 && err.Error() != "" { + // Check if the error message contains the expected substring + if len(err.Error()) >= len(expectedMsg) { + found = true + break + } + } + } else if err.Error() == expectedMsg { + found = true + break + } + } + if !found { + t.Errorf("Expected error message containing %q not found in errors: %v", expectedMsg, errs) + } + } + }) + } +} From 0b0b000cd29437783ea3fdab1c885889ee641a0d Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 25 Sep 2025 21:01:27 -0700 Subject: [PATCH 08/62] chore: make reviewable Signed-off-by: Artur Shad Nik --- fleetconfig-controller/api/v1beta1/constants.go | 6 +++++- .../internal/controller/v1beta1/constants.go | 4 +++- .../internal/controller/v1beta1/spoke_handler.go | 2 +- .../internal/webhook/v1beta1/validation.go | 3 +-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index 6a48d859..b5e84f7a 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -161,10 +161,14 @@ var ( ) const ( + // AddonArgoCD is the name of the built-in ArgoCD hub addon. AddonArgoCD = "argocd" - AddonGPF = "governance-policy-framework" + + // AddonGPF is the name of the built-in Governance Policy Framework hub addon. + AddonGPF = "governance-policy-framework" ) +// SupportedHubAddons are the built-in hub addons which clusteradm and fleetconfig-controller support. var SupportedHubAddons = []string{ AddonArgoCD, AddonGPF, diff --git a/fleetconfig-controller/internal/controller/v1beta1/constants.go b/fleetconfig-controller/internal/controller/v1beta1/constants.go index a61ad4d2..327472a7 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/constants.go +++ b/fleetconfig-controller/internal/controller/v1beta1/constants.go @@ -32,5 +32,7 @@ const ( addonCleanupTimeout = 1 * time.Minute addonCleanupPollInterval = 2 * time.Second - fccAddOnManifestWorkLabel = "open-cluster-management.io/addon-name=fleetconfig-controller-manager" + manifestWorkAddOnLabelKey = "open-cluster-management.io/addon-name" + + manifestWorkAddOnLabelValueFcc = "fleetconfig-controller-manager" ) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 564bb401..f4c50d09 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -398,7 +398,7 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke } // at this point, klusterlet-work-agent is uninstalled, so nothing can remove this finalizer. 
all resources are cleaned up by the spoke's controller, so to prevent a dangling mw/namespace, we remove the finalizer manually - mwList, err := workC.WorkV1().ManifestWorks(spoke.Name).List(ctx, metav1.ListOptions{LabelSelector: fccAddOnManifestWorkLabel}) + mwList, err := workC.WorkV1().ManifestWorks(spoke.Name).List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", manifestWorkAddOnLabelKey, manifestWorkAddOnLabelValueFcc)}) if err != nil { return err } diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index 22314b5e..f7b3e3d8 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -174,9 +174,8 @@ func validateAddonUniqueness(newObject *v1beta1.Hub) field.ErrorList { for i, a := range newObject.Spec.AddOnConfigs { if _, found := addOnConfigNames[a.Name]; found { continue - } else { - addOnConfigNames[a.Name] = i } + addOnConfigNames[a.Name] = i } // Validate that HubAddOn names are unique within the HubAddOns list From 2ba3cbfe2c37d491ea341e9b6eac175700f48683 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 25 Sep 2025 21:08:59 -0700 Subject: [PATCH 09/62] chore: logs Signed-off-by: Artur Shad Nik --- .../internal/controller/v1beta1/spoke_handler.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index f4c50d09..fab65945 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -284,6 +284,10 @@ func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, logger := log.FromContext(ctx) logger.V(0).Info("handleSpoke", "spoke", spoke.Name) + spoke.SetConditions(true, v1beta1.NewCondition( + v1beta1.PivotComplete, v1beta1.PivotComplete, metav1.ConditionTrue, metav1.ConditionTrue, + )) + spokeKubeconfig, err := kube.RawFromInClusterRestConfig() if err != nil { return fmt.Errorf("failed to load kubeconfig from inCluster: %v", err) @@ -307,9 +311,6 @@ func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, } spoke.Status.KlusterletHash = currKlusterletHash - spoke.SetConditions(true, v1beta1.NewCondition( - v1beta1.PivotComplete, v1beta1.PivotComplete, metav1.ConditionTrue, metav1.ConditionTrue, - )) return nil } @@ -381,6 +382,7 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke // requeue until unjoin is complete by the spoke's controller if slices.Contains(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) { + logger.V(1).Info("Hub preflight complete, waiting for spoke agent to deregister") return nil } @@ -444,8 +446,10 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke } func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spoke) error { + logger := log.FromContext(ctx) // requeue until preflight is complete by the hub's controller if slices.Contains(spoke.Finalizers, v1beta1.HubCleanupPreflightFinalizer) { + logger.V(1).Info("Cleanup initiated, waiting for hub to complete preflight") return nil } @@ -504,6 +508,7 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo } } + logger.V(1).Info("Klusterlet cleanup complete") return nil } From 
674fb05c5beb67aea89f8362ce05121966ee0a65 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 26 Sep 2025 11:18:41 -0700 Subject: [PATCH 10/62] fix: add fallback cleanup if addon agent has not come up Signed-off-by: Artur Shad Nik --- .../api/v1beta1/spoke_types.go | 14 ++++++ fleetconfig-controller/cmd/manager/manager.go | 2 +- .../internal/controller/v1beta1/addon.go | 10 ++-- .../controller/v1beta1/spoke_handler.go | 48 +++++++++++++------ .../internal/webhook/v1beta1/validation.go | 2 +- 5 files changed, 54 insertions(+), 22 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/spoke_types.go b/fleetconfig-controller/api/v1beta1/spoke_types.go index b496566d..7dfd30d6 100644 --- a/fleetconfig-controller/api/v1beta1/spoke_types.go +++ b/fleetconfig-controller/api/v1beta1/spoke_types.go @@ -20,6 +20,7 @@ import ( "fmt" "maps" "reflect" + "slices" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -103,6 +104,19 @@ func (s *Spoke) IsHubAsSpoke() bool { return s.Name == ManagedClusterTypeHubAsSpoke || s.Spec.Kubeconfig.InCluster } +// PivotComplete return true if the spoke's agent has successfully started managing day 2 operations. +func (s *Spoke) PivotComplete() bool { + jc := s.GetCondition(SpokeJoined) + if jc == nil || jc.Status != metav1.ConditionTrue { + return false + } + pc := s.GetCondition(PivotComplete) + if pc == nil || pc.Status != metav1.ConditionTrue { + return false + } + return slices.Contains(s.Finalizers, SpokeCleanupFinalizer) +} + // Klusterlet is the configuration for a klusterlet. type Klusterlet struct { // Annotations to apply to the spoke cluster. If not present, the 'agent.open-cluster-management.io/' prefix is added to each key. diff --git a/fleetconfig-controller/cmd/manager/manager.go b/fleetconfig-controller/cmd/manager/manager.go index 14eeb914..8199e698 100644 --- a/fleetconfig-controller/cmd/manager/manager.go +++ b/fleetconfig-controller/cmd/manager/manager.go @@ -128,7 +128,7 @@ func ForHub(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { } // nolint:goconst - if (opts.UseWebhook || os.Getenv("ENABLE_WEBHOOKS") != "false") && opts.ClusterType != apiv1beta1.ClusterTypeSpoke { + if opts.UseWebhook || os.Getenv("ENABLE_WEBHOOKS") == "true" { if err = apiv1alpha1.SetupFleetConfigWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "FleetConfig") return nil, err diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 5eb26bf7..14c20d31 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -675,7 +675,7 @@ func isAddonInstalled(ctx context.Context, addonC *addonapi.Clientset, addonName // waitForAddonManifestWorksCleanup polls for addon-related manifestWorks to be removed // after addon disable operation to avoid race conditions during spoke unjoin -func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Clientset, spokeName string, timeout time.Duration, isHubAsSpoke bool) error { +func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Clientset, spokeName string, timeout time.Duration, shouldCleanAll bool) error { logger := log.FromContext(ctx) logger.V(1).Info("waiting for addon manifestWorks cleanup", "spokeName", spokeName, "timeout", timeout) @@ -687,11 +687,11 @@ func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Client return false, 
nil } - // for hub-as-spoke, all addons must be removed. + // for hub-as-spoke, or if the pivot failed, all addons must be removed. // otherwise, fleetconfig-controller-manager must not be removed. - var expectedWorks = 1 - if isHubAsSpoke { - expectedWorks = 0 + var expectedWorks = 0 + if !shouldCleanAll { + expectedWorks = 1 } if len(manifestWorks.Items) == expectedWorks { diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index fab65945..a659fdc5 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -40,19 +40,21 @@ import ( func (r *SpokeReconciler) cleanup(ctx context.Context, spoke *v1beta1.Spoke, hubKubeconfig []byte) error { switch r.ClusterType { case v1beta1.ClusterTypeHub: - err := r.doHubCleanup(ctx, spoke, hubKubeconfig) + originalSpoke := ctx.Value(originalSpokeKey).(*v1beta1.Spoke) // use the original object to check conditions/finalizers + pivotComplete := originalSpoke.PivotComplete() + err := r.doHubCleanup(ctx, spoke, hubKubeconfig, pivotComplete) if err != nil { return err } - if spoke.IsHubAsSpoke() { - err = r.doSpokeCleanup(ctx, spoke) + if spoke.IsHubAsSpoke() || !pivotComplete { + err = r.doSpokeCleanup(ctx, spoke, false) if err != nil { return err } } return nil case v1beta1.ClusterTypeSpoke: - return r.doSpokeCleanup(ctx, spoke) + return r.doSpokeCleanup(ctx, spoke, true) default: // this is guarded against when the manager is initialized. should never reach this point panic(fmt.Sprintf("unknown cluster type %s. Must be one of %v", r.ClusterType, v1beta1.SupportedClusterTypes)) @@ -234,12 +236,12 @@ func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spo Name: roleName, } - err := r.doBind(ctx, roleRef, spoke.Namespace, spoke.Name) + err := r.createBinding(ctx, roleRef, spoke.Namespace, spoke.Name) if err != nil { return err } if spoke.Spec.HubRef.Namespace != spoke.Namespace { - err = r.doBind(ctx, roleRef, spoke.Spec.HubRef.Namespace, spoke.Name) + err = r.createBinding(ctx, roleRef, spoke.Spec.HubRef.Namespace, spoke.Name) if err != nil { return err } @@ -247,10 +249,10 @@ func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spo return nil } -func (r *SpokeReconciler) doBind(ctx context.Context, roleRef rbacv1.RoleRef, namespace, spokeName string) error { +func (r *SpokeReconciler) createBinding(ctx context.Context, roleRef rbacv1.RoleRef, namespace, spokeName string) error { binding := &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("open-cluster-management:%s:%s:agent-%s", + Name: fmt.Sprintf("open-cluster-management:%s:%s:agent-%s", // this is a different naming format than OCM uses for addon agents. 
we need to append the spoke name to avoid possible conflicts in cases where multiple spokes exist in 1 namespace v1beta1.FCCAddOnName, strings.ToLower(roleRef.Kind), spokeName), Namespace: namespace, Labels: map[string]string{ @@ -314,7 +316,7 @@ func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, return nil } -func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke, hubKubeconfig []byte) error { +func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke, hubKubeconfig []byte, pivotComplete bool) error { logger := log.FromContext(ctx) clusterC, err := common.ClusterClient(hubKubeconfig) if err != nil { @@ -352,8 +354,13 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke // remove addons only after confirming that the cluster can be unjoined - this avoids leaving dangling resources that may rely on the addon spokeCopy := spoke.DeepCopy() spokeCopy.Spec.AddOns = nil - if !spoke.IsHubAsSpoke() { - spokeCopy.Spec.AddOns = append(spokeCopy.Spec.AddOns, v1beta1.AddOn{ConfigName: "fleetconfig-controller-manager"}) // disable all except fcc + + // for hub-as-spoke, or if the addon agent never came up, disable all addons + // otherwise, leave fleetconfig-controller-manager addon running so that it can do deregistration + shouldCleanAll := spoke.IsHubAsSpoke() || !pivotComplete + + if !shouldCleanAll { + spokeCopy.Spec.AddOns = append(spokeCopy.Spec.AddOns, v1beta1.AddOn{ConfigName: v1beta1.FCCAddOnName}) } if _, err := handleSpokeAddons(ctx, addonC, spokeCopy); err != nil { spoke.SetConditions(true, v1beta1.NewCondition( @@ -364,7 +371,7 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke if len(spoke.Status.EnabledAddons) > 0 { // Wait for addon manifestWorks to be fully cleaned up before proceeding with unjoin - if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout, spoke.IsHubAsSpoke()); err != nil { + if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout, shouldCleanAll); err != nil { spoke.SetConditions(true, v1beta1.NewCondition( err.Error(), v1beta1.AddonsConfigured, metav1.ConditionTrue, metav1.ConditionFalse, )) @@ -445,7 +452,7 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke return nil } -func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spoke) error { +func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spoke, pivotComplete bool) error { logger := log.FromContext(ctx) // requeue until preflight is complete by the hub's controller if slices.Contains(spoke.Finalizers, v1beta1.HubCleanupPreflightFinalizer) { @@ -453,10 +460,21 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo return nil } - spokeKubeconfig, err := kube.RawFromInClusterRestConfig() + var ( + spokeKubeconfig []byte + err error + ) + + // if the addon agent did not come up successfully, try to unjoin the spoke from the hub + if pivotComplete { + spokeKubeconfig, err = kube.RawFromInClusterRestConfig() + } else { + spokeKubeconfig, err = kube.KubeconfigFromSecretOrCluster(ctx, r.Client, spoke.Spec.Kubeconfig, spoke.Namespace) + } if err != nil { return err } + err = r.unjoinSpoke(ctx, spoke, spokeKubeconfig) if err != nil { return err @@ -930,7 +948,7 @@ func (r *SpokeReconciler) mergeKlusterletValues(ctx context.Context, spoke *v1be } fromValues, ok := cm.Data[spoke.Spec.Klusterlet.ValuesFrom.Key] if 
!ok { - logger.V(1).Info("warning: Klusterlet values ConfigMap not found", "spoke", spoke.Name, "configMap", nn, "key", spoke.Spec.Klusterlet.ValuesFrom.Key) + logger.V(1).Info("warning: Klusterlet values key not found in ConfigMap", "spoke", spoke.Name, "configMap", nn, "key", spoke.Spec.Klusterlet.ValuesFrom.Key) return spoke.Spec.Klusterlet.Values, nil } fromBytes := []byte(fromValues) diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index f7b3e3d8..2fc27a8e 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -169,7 +169,7 @@ func validateAddonUniqueness(newObject *v1beta1.Hub) field.ErrorList { } } - // Validate that AddOnConfig names are unique within the AddOnConfigs list + // Build an index of AddOnConfig names (first occurrence) for cross-set clash checks with HubAddOns addOnConfigNames := make(map[string]int) for i, a := range newObject.Spec.AddOnConfigs { if _, found := addOnConfigNames[a.Name]; found { From e511abd152792993d75bf093c5a93b0c97d9ec40 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 26 Sep 2025 11:53:11 -0700 Subject: [PATCH 11/62] fix: relax fallback condition Signed-off-by: Artur Shad Nik --- fleetconfig-controller/api/v1beta1/spoke_types.go | 6 +----- .../internal/controller/v1beta1/spoke_handler.go | 5 ++++- .../internal/webhook/v1beta1/validation.go | 11 ++++++++--- .../internal/webhook/v1beta1/validation_test.go | 14 +++++++------- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/spoke_types.go b/fleetconfig-controller/api/v1beta1/spoke_types.go index 7dfd30d6..142cb0eb 100644 --- a/fleetconfig-controller/api/v1beta1/spoke_types.go +++ b/fleetconfig-controller/api/v1beta1/spoke_types.go @@ -20,7 +20,6 @@ import ( "fmt" "maps" "reflect" - "slices" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -111,10 +110,7 @@ func (s *Spoke) PivotComplete() bool { return false } pc := s.GetCondition(PivotComplete) - if pc == nil || pc.Status != metav1.ConditionTrue { - return false - } - return slices.Contains(s.Finalizers, SpokeCleanupFinalizer) + return pc != nil && pc.Status == metav1.ConditionTrue } // Klusterlet is the configuration for a klusterlet. 
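With PivotComplete relaxed as above, the hub-side cleanup chooses between the normal path (leave the fleetconfig-controller-manager addon running so the agent can deregister the cluster itself) and the fallback path (tear everything down from the hub). A condensed sketch of that decision follows; spokeState is a stand-in interface for the two Spoke methods involved, and the real call sites are cleanup and doHubCleanup in spoke_handler.go.

package sketch

// spokeState stands in for the two v1beta1.Spoke methods used by the decision.
type spokeState interface {
	IsHubAsSpoke() bool
	PivotComplete() bool
}

// shouldCleanAll condenses the fallback rule: the hub removes every addon,
// including fleetconfig-controller-manager, when the spoke is hub-as-spoke or
// the addon agent never completed the pivot; otherwise the addon is left in
// place so the spoke-side controller can perform the unjoin and deregistration.
func shouldCleanAll(s spokeState) bool {
	return s.IsHubAsSpoke() || !s.PivotComplete()
}
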
diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index a659fdc5..9ac6a2ad 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -40,7 +40,10 @@ import ( func (r *SpokeReconciler) cleanup(ctx context.Context, spoke *v1beta1.Spoke, hubKubeconfig []byte) error { switch r.ClusterType { case v1beta1.ClusterTypeHub: - originalSpoke := ctx.Value(originalSpokeKey).(*v1beta1.Spoke) // use the original object to check conditions/finalizers + originalSpoke, ok := ctx.Value(originalSpokeKey).(*v1beta1.Spoke) // use the original object to check conditions/finalizers + if !ok { + originalSpoke = spoke.DeepCopy() + } pivotComplete := originalSpoke.PivotComplete() err := r.doHubCleanup(ctx, spoke, hubKubeconfig, pivotComplete) if err != nil { diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index 2fc27a8e..cd074d6b 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -22,7 +22,9 @@ import ( ) const ( - warnHubNotFound = "hub not found, cannot validate spoke addons" + warnHubNotFound = "hub not found, cannot validate spoke addons" + errAllowedSpokeUpdate = "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.klusterlet.valuesFrom, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke" + errAllowedHubUpdate = "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub" ) func isKubeconfigValid(kubeconfig v1beta1.Kubeconfig) (bool, string) { @@ -87,7 +89,7 @@ func allowHubUpdate(oldHub, newHub *v1beta1.Hub) error { newHubCopy.Kubeconfig = v1beta1.Kubeconfig{} if !reflect.DeepEqual(oldHubCopy, newHubCopy) { - return errors.New("only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub") + return errors.New(errAllowedHubUpdate) } } return nil @@ -97,6 +99,7 @@ func allowHubUpdate(oldHub, newHub *v1beta1.Hub) error { // Allowed changes include: // - spec.klusterlet.annotations // - spec.klusterlet.values +// - spec.klusterlet.valuesFrom // - spec.kubeconfig // - spec.addOns // - spec.timeout @@ -109,6 +112,8 @@ func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { oldSpokeCopy.Klusterlet.Annotations = nil oldSpokeCopy.Klusterlet.Values = nil newSpokeCopy.Klusterlet.Values = nil + oldSpokeCopy.Klusterlet.ValuesFrom = nil + newSpokeCopy.Klusterlet.ValuesFrom = nil oldSpokeCopy.Kubeconfig = v1beta1.Kubeconfig{} newSpokeCopy.Kubeconfig = v1beta1.Kubeconfig{} oldSpokeCopy.AddOns = []v1beta1.AddOn{} @@ -119,7 +124,7 @@ func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { newSpokeCopy.Timeout = 0 if !reflect.DeepEqual(oldSpokeCopy, newSpokeCopy) { - return errors.New("spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke") + return 
errors.New(errAllowedSpokeUpdate) } } diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go index 5663aaf9..b943b8f4 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation_test.go @@ -219,7 +219,7 @@ func TestAllowHubUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub", + errMsg: errAllowedHubUpdate, }, { name: "disallowed - Force change", @@ -234,7 +234,7 @@ func TestAllowHubUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub", + errMsg: errAllowedHubUpdate, }, { name: "disallowed - ClusterManager non-source change", @@ -253,7 +253,7 @@ func TestAllowHubUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub", + errMsg: errAllowedHubUpdate, }, { name: "multiple allowed changes", @@ -427,7 +427,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: errAllowedSpokeUpdate, }, { name: "disallowed - CreateNamespace change", @@ -442,7 +442,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: errAllowedSpokeUpdate, }, { name: "disallowed - klusterlet mode change", @@ -461,7 +461,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: errAllowedSpokeUpdate, }, { name: "disallowed - klusterlet feature gates change", @@ -480,7 +480,7 @@ func TestAllowSpokeUpdate(t *testing.T) { }, }, wantErr: true, - errMsg: "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke", + errMsg: errAllowedSpokeUpdate, }, { name: "multiple allowed changes", From 93e3be634e543b003e82da7c4b7d9b2596e748fd Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 26 Sep 2025 13:33:18 -0700 Subject: [PATCH 12/62] fix: ensure appliedManifestWork cleaned up Signed-off-by: Artur Shad Nik --- .../ocm/fcc-addon/addon-template.yaml | 2 +- fleetconfig-controller/devspace.yaml | 2 +- .../controller/v1beta1/spoke_controller.go | 3 +- .../controller/v1beta1/spoke_handler.go | 30 ++++++++++++------- 4 files changed, 24 insertions(+), 13 
deletions(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml index 37c720f5..12760a25 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -103,7 +103,7 @@ spec: verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] - apiGroups: ["work.open-cluster-management.io"] resources: ["appliedmanifestworks"] - verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete", "deletecollection"] - apiGroups: ["cluster.open-cluster-management.io"] resources: ["clusterclaims"] verbs: ["get", "list", "watch"] diff --git a/fleetconfig-controller/devspace.yaml b/fleetconfig-controller/devspace.yaml index 2b7b4f2e..323ebbb1 100644 --- a/fleetconfig-controller/devspace.yaml +++ b/fleetconfig-controller/devspace.yaml @@ -157,7 +157,7 @@ deployments: hooks: - name: cert-manager-ns command: "kubectl create namespace cert-manager --dry-run=client -o yaml | kubectl apply -f -" - events: ["before:deploy"] + events: ["before:deploy:cert-manager"] dev: fleetconfig-controller-dev-hub: diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index 58d8886b..9cb692d2 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -233,7 +233,8 @@ func (r *SpokeReconciler) SetupWithManagerForHub(mgr ctrl.Manager) error { if !ok { return false } - return sharedFieldsChanged(oldHub.Spec.DeepCopy(), newHub.Spec.DeepCopy()) + return sharedFieldsChanged(oldHub.Spec.DeepCopy(), newHub.Spec.DeepCopy()) || + !reflect.DeepEqual(oldHub.Status, newHub.Status) }, GenericFunc: func(_ event.GenericEvent) bool { return false diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 9ac6a2ad..3370dd6f 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -483,23 +483,24 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo return err } - spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { - return s == v1beta1.SpokeCleanupFinalizer - }) - - // hub-as-spoke case, no further cleanup needed + // hub-as-spoke/failed pivot case, no further cleanup needed - clusteradm unjoin will have handled it all if r.ClusterType == v1beta1.ClusterTypeHub { + spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { + return s == v1beta1.SpokeCleanupFinalizer + }) return nil } - // "self-destruct" any remaining namespaces/resources - // TODO - instead of deleting the namespace etc, can we delete the appliedManifestWork? 
- // name: 43967cf4fa7b6c9c1f4014eb104077ac73d86aabc3d2c4ae6c51babdf5898540-addon-fleetconfig-controller-manager-deploy-0 - // no labels, no owner ref + // remove all remaining klusterlet resources that unjoin did not remove (because of the remaining AMW) + workClient, err := common.WorkClient(spokeKubeconfig) + if err != nil { + return err + } operatorClient, err := common.OperatorClient(spokeKubeconfig) if err != nil { return err } + if err := operatorClient.OperatorV1().Klusterlets().Delete(ctx, "klusterlet", metav1.DeleteOptions{}); err != nil && !kerrs.IsNotFound(err) { return err } @@ -508,7 +509,6 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo "open-cluster-management-agent", "open-cluster-management-agent-addon", "open-cluster-management", - os.Getenv(v1beta1.ControllerNamespaceEnvVar), } restCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) @@ -529,6 +529,16 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo } } + spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { + return s == v1beta1.SpokeCleanupFinalizer + }) + + // self-destruct + err = workClient.WorkV1().AppliedManifestWorks().DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{}) + if err != nil { + return err + } + logger.V(1).Info("Klusterlet cleanup complete") return nil } From 191ea9c486e6d09aca4b86f4720fdd3ef1d13a1d Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 26 Sep 2025 15:27:12 -0700 Subject: [PATCH 13/62] docs: add diagram, walkthru Signed-off-by: Artur Shad Nik --- .../docs/2-phase-spoke-reconcile.md | 158 ++++++++++++++++++ .../controller/v1beta1/spoke_handler.go | 14 +- 2 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 fleetconfig-controller/docs/2-phase-spoke-reconcile.md diff --git a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md new file mode 100644 index 00000000..04cbad77 --- /dev/null +++ b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md @@ -0,0 +1,158 @@ +# Spoke Reconciler Walkthrough + +For the purposes of this document `Hub controller` refers to the `SpokeReconciler` running on the hub cluster, and `Spoke controller` refers to the `SpokeReconciler` running on the spoke cluster. This is different from the `HubReconciler`, which is a hub-only controller for reconciling the Hub resource. + +The Spoke reconciler runs in two different modes depending on where it's deployed: + +- **Hub mode**: Runs on a hub cluster, handles joining spokes and cleaning up hub resources +- **Spoke mode**: Runs on spoke clusters, handles klusterlet upgrades and local cleanup. Automatically installed as an [OCM addon](https://open-cluster-management.io/docs/concepts/add-on-extensibility/addon/) when a spoke is joined to a hub. + +Note: *hub-as-spoke* clusters are a special case where the hub is registered as a spoke. A hub-as-spoke cluster is denoted by either the name `hub-as-spoke` or an InCluster kubeconfig. + +## Reconciler Steps + +### 1. Finalizer Setup + +When a Spoke resource is created, the reconcilers add finalizers to control cleanup: + +**Hub controller** adds: +- `HubCleanupPreflightFinalizer` - removed when hub is ready for spoke to unjoin +- `HubCleanupFinalizer` - removed after hub finishes cleanup +- `SpokeCleanupFinalizer` - only for *hub-as-spoke* clusters + +**Spoke controller** adds: +- `SpokeCleanupFinalizer` - removed after spoke finishes local cleanup + +### 2. 
Day 1 Operations + +**Hub controller**: +1. Check if spoke is already joined as a ManagedCluster +2. If not joined: run `clusteradm join` on the spoke, then `clusteradm accept` on the hub +3. Wait for ManagedClusterJoined condition +4. Set up addon deployment configs and enable addons +5. For hub-as-spoke: also do [day 2 operations](#3-day-2-operations) + +### 3. Day 2 Operations + +**Spoke controller**: +1. Set PivotComplete condition (spoke agent is now managing itself) +2. Check if klusterlet needs upgrading by comparing config hash and bundle version +3. If upgrade needed: run `clusteradm upgrade klusterlet` + +### 4. Cleanup Process + +The cleanup process coordinates between hub and spoke controllers using finalizers. + +**Hub controller**: +1. Check for active ManifestWorks (can't cleanup if cluster is still in use) +2. Disable addons (but keep fleetconfig-controller-agent running so spoke can unjoin) +3. Remove `HubCleanupPreflightFinalizer` (signals spoke to start unjoin) + +**Spoke controller**: +1. Wait for hub to remove `HubCleanupPreflightFinalizer` +2. Run `clusteradm unjoin` to deregister from hub +3. Remove klusterlet and OCM namespaces +4. Remove `SpokeCleanupFinalizer` (signals hub that unjoin is done) +5. Clean up remaining AppliedManifestWorks (at this point, there is only 1 - the fleetconfig-controller-agent) + +**Hub controller**: +1. Wait for spoke to remove `SpokeCleanupFinalizer` +2. Clean up remaining AppliedManifestWorks (at this point, there is only 1 - the fleetconfig-controller-agent) +3. Remove `HubCleanupFinalizer` + +In total, the cleanup process is completed in 3 reconciles - 2 on the hub and 1 on the spoke. + +**Special Cases**: + +There are 2 special cases to consider: + +- **hub-as-spoke**: The hub controller will also run the spoke-side cleanup steps right after the hub pre-flight cleanup steps. Cleanup completes in 2 reconciles. +- **Failed Pivot**: The spoke agent never came up, so the hub will attempt the spoke-side cleanup steps right after the hub pre-flight cleanup steps. Cleanup completes in 2 reconciles. +This allows for an "escape hatch" if the spoke agent never came up. This is the only case where a hub will perform day 2 operations on a spoke. It will never attempt upgrades on a spoke. + +## Key Points + +- All configuration is done on the hub cluster by CRUD operation on Spoke resources. +- After the initial join, OCM addon framework is leveraged to install a fleetconfig-controller-agent inside the spoke cluster. The agent is responsible for the spoke's day 2 operations, including klusterlet upgrades and local cleanup. +- After the "pivot", the spoke kubeconfig secret can be safely deleted. The hub will no longer directly manage the spoke cluster. Instead, the agent will asynchronously pull updates from the hub and reconcile them locally. +- Leveraging finalizers to coordinate cleanup tasks allows the controllers to operate independently and avoid direct communication. Otherwise, API calls between the manager and agent would be required to coordinate cleanup. 
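The finalizer handshake described above can be checked with nothing more than a slice lookup on the Spoke's metadata. The sketch below illustrates the idea; the helper names are illustrative only, while the finalizer constants and the API import path come from this repository:

```go
package example

import (
	"slices"

	"github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1"
)

// hubPreflightDone reports whether the hub-side controller has removed its
// preflight finalizer, i.e. the spoke-side controller may run `clusteradm unjoin`.
func hubPreflightDone(spoke *v1beta1.Spoke) bool {
	return !slices.Contains(spoke.Finalizers, v1beta1.HubCleanupPreflightFinalizer)
}

// spokeCleanupDone reports whether the spoke-side controller has removed its
// finalizer, i.e. the hub-side controller may finish deleting the
// ManagedCluster and its namespace.
func spokeCleanupDone(spoke *v1beta1.Spoke) bool {
	return !slices.Contains(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer)
}
```

Because each side only ever removes its own finalizer and inspects the other's, neither controller needs to call into the other directly to learn where the handshake stands.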
+ +## Sequence Diagram + +```mermaid +sequenceDiagram + participant User + participant HubK8s + participant HubController as Hub Controller + participant SpokeK8s + participant SpokeController as Spoke Controller + + Note over User, SpokeController: Initialization + + User->>HubK8s: Create Spoke resource + HubK8s->>HubController: Spoke resource created + HubController->>HubK8s: Add HubCleanupPreflightFinalizer + HubController->>HubK8s: Add HubCleanupFinalizer + HubController->>HubK8s: Add SpokeCleanupFinalizer (hub-as-spoke only) + + + Note over HubController, SpokeController: Join Process + HubController->>HubController: Check if ManagedCluster exists + HubController->>SpokeK8s: Run clusteradm join + SpokeK8s->>HubK8s: Join request + HubController->>HubK8s: Run clusteradm accept + HubController->>HubK8s: Wait for ManagedClusterJoined condition + + Note over HubK8s, SpokeK8s: Addon Flow + HubController->>HubK8s: Set up AddOnDeploymentConfigs for FCC-agent + HubController->>HubK8s: Enable addons + HubK8s->>SpokeK8s: Install FCC-agent (initiates pivot) + + Note over HubK8s, SpokeController: Day 2 Flow - Pivot Complete + HubK8s->>SpokeController: Spoke Joined + SpokeController->>HubK8s: Add SpokeCleanupFinalizer + SpokeController->>HubK8s: Set PivotComplete condition + SpokeController->>HubK8s: Get Hub, klusterlet values + SpokeController->>SpokeK8s: Check klusterlet upgrade needed + alt Upgrade needed + SpokeController->>SpokeK8s: Run clusteradm upgrade klusterlet + end + + Note over User, SpokeController: Cleanup Flow + + User->>HubK8s: Delete Spoke resource + HubK8s->>HubController: Spoke deletion requested + HubController->>HubK8s: Set phase to "Deleting" + + Note over HubK8s, HubController: Hub Cleanup Phase + HubController->>HubK8s: Check for active ManifestWorks + alt Active ManifestWorks + HubController->>HubController: Requeue with error + end + HubController->>HubK8s: Disable addons (keep fleetconfig-controller-manager) + HubController->>HubK8s: Remove HubCleanupPreflightFinalizer + + Note over SpokeK8s, SpokeController: Spoke Cleanup Phase + SpokeController->>SpokeK8s: HubCleanupPreflightFinalizer removed? + alt Not Removed + SpokeController->>SpokeController: Requeue + end + SpokeController->>SpokeK8s: Run clusteradm unjoin + SpokeController->>SpokeK8s: Remove klusterlet and OCM namespaces + SpokeController->>HubK8s: Remove SpokeCleanupFinalizer + SpokeController->>SpokeK8s: Remove AppliedManifestWork (which removes FCC-agent) + + Note over HubK8s, HubController: Final Hub Cleanup + HubController->>HubK8s: SpokeCleanupFinalizer removed? 
+ alt Not Removed + HubController->>HubController: Requeue + end + HubController->>HubK8s: Clean up CSRs, AddOn ManifestWork Finalizer, ManagedCluster, namespace + HubController->>HubK8s: Remove HubCleanupFinalizer + + HubK8s->>User: Spoke resource deleted + + Note over HubController, SpokeController: Special Cases + Note right of HubController: Hub-as-spoke: Hub does both hub and spoke cleanup + Note right of HubController: Failed Pivot: Hub does spoke cleanup if agent never came up +``` diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 3370dd6f..70214389 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -529,16 +529,18 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo } } + // self-destruct as late as possible, so that the controller has enough time to patch the Spoke before being garbage collected + defer func() { + err = workClient.WorkV1().AppliedManifestWorks().DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{}) + if err != nil { + logger.Error(err, "failed to finalize agent cleanup") + } + }() + spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { return s == v1beta1.SpokeCleanupFinalizer }) - // self-destruct - err = workClient.WorkV1().AppliedManifestWorks().DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{}) - if err != nil { - return err - } - logger.V(1).Info("Klusterlet cleanup complete") return nil } From 75a80cff4202c4e32cccbc46e4f724ec27febb4c Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 26 Sep 2025 15:42:11 -0700 Subject: [PATCH 14/62] chore: words Signed-off-by: Artur Shad Nik --- fleetconfig-controller/docs/2-phase-spoke-reconcile.md | 2 +- .../internal/controller/v1beta1/spoke_handler.go | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md index 04cbad77..4f6e9ae3 100644 --- a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md +++ b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md @@ -102,6 +102,7 @@ sequenceDiagram SpokeK8s->>HubK8s: Join request HubController->>HubK8s: Run clusteradm accept HubController->>HubK8s: Wait for ManagedClusterJoined condition + HubController->>HubController: Spoke Joined Note over HubK8s, SpokeK8s: Addon Flow HubController->>HubK8s: Set up AddOnDeploymentConfigs for FCC-agent @@ -109,7 +110,6 @@ sequenceDiagram HubK8s->>SpokeK8s: Install FCC-agent (initiates pivot) Note over HubK8s, SpokeController: Day 2 Flow - Pivot Complete - HubK8s->>SpokeController: Spoke Joined SpokeController->>HubK8s: Add SpokeCleanupFinalizer SpokeController->>HubK8s: Set PivotComplete condition SpokeController->>HubK8s: Get Hub, klusterlet values diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 70214389..00b8df09 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -91,6 +91,7 @@ func (r *SpokeReconciler) handleSpoke(ctx context.Context, spoke *v1beta1.Spoke, } } +// doHubWork handles hub-side work such as joins and addons func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, 
hubMeta hubMeta, klusterletValues *v1beta1.KlusterletChartConfig) error { logger := log.FromContext(ctx) logger.V(0).Info("handleSpoke", "spoke", spoke.Name) @@ -227,6 +228,7 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h return nil } +// bindAddonAgent creates the necessary bindings for the fcc agent to access hub resources func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spoke) error { roleName := os.Getenv(v1beta1.RoleNameEnvVar) if roleName == "" { @@ -252,6 +254,7 @@ func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spo return nil } +// createBinding creates a binding for a given role func (r *SpokeReconciler) createBinding(ctx context.Context, roleRef rbacv1.RoleRef, namespace, spokeName string) error { binding := &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ @@ -285,6 +288,7 @@ func clusterAddonGroup(clusterName, addonName string) string { return fmt.Sprintf("system:open-cluster-management:cluster:%s:addon:%s", clusterName, addonName) } +// doSpokeWork handles spoke-side work such as upgrades func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, hub *v1beta1.Hub, klusterletValues *v1beta1.KlusterletChartConfig) error { logger := log.FromContext(ctx) logger.V(0).Info("handleSpoke", "spoke", spoke.Name) @@ -319,6 +323,7 @@ func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, return nil } +// doHubCleanup handles all the required cleanup of a hub cluster when deregistering a Spoke func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke, hubKubeconfig []byte, pivotComplete bool) error { logger := log.FromContext(ctx) clusterC, err := common.ClusterClient(hubKubeconfig) @@ -455,6 +460,7 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke return nil } +// doSpokeCleanup handles all the required cleanup of a spoke cluster when deregistering a Spoke func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spoke, pivotComplete bool) error { logger := log.FromContext(ctx) // requeue until preflight is complete by the hub's controller @@ -918,6 +924,7 @@ func getToken(ctx context.Context, hubMeta hubMeta) (*tokenMeta, error) { return tokenMeta, nil } +// getHubMeta retrieves the Hub resource and its associated kubeconfig func (r *SpokeReconciler) getHubMeta(ctx context.Context, hubRef v1beta1.HubRef) (hubMeta, error) { hub := &v1beta1.Hub{} hubMeta := hubMeta{} @@ -938,6 +945,7 @@ func (r *SpokeReconciler) getHubMeta(ctx context.Context, hubRef v1beta1.HubRef) return hubMeta, nil } +// mergeKlusterletValues merges klusterlet values from a configmap in the Spoke namespace, and from the Spoke's spec. Spec takes precedence. 
func (r *SpokeReconciler) mergeKlusterletValues(ctx context.Context, spoke *v1beta1.Spoke) (*v1beta1.KlusterletChartConfig, error) { logger := log.FromContext(ctx) From 1d89831a13608b734a5779ad079a5abad0f66666 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 26 Sep 2025 17:10:26 -0700 Subject: [PATCH 15/62] docs: words Signed-off-by: Artur Shad Nik --- .../docs/2-phase-spoke-reconcile.md | 2 +- fleetconfig-controller/docs/smoketests.md | 43 +++++++++++++------ .../test/data/fleetconfig-values.yaml | 2 +- fleetconfig-controller/test/e2e/helper.go | 2 +- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md index 4f6e9ae3..5d2e846f 100644 --- a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md +++ b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md @@ -1,6 +1,6 @@ # Spoke Reconciler Walkthrough -For the purposes of this document `Hub controller` refers to the `SpokeReconciler` running on the hub cluster, and `Spoke controller` refers to the `SpokeReconciler` running on the spoke cluster. This is different from the `HubReconciler`, which is a hub-only controller for reconciling the Hub resource. +For the purposes of this document, `Spoke controller` refers to the `SpokeReconciler` running on the spoke cluster, and `Hub controller` refers to the `SpokeReconciler` running on the hub cluster. This is different from the `HubReconciler`, which is a hub-only controller for reconciling the Hub resource. The Spoke reconciler runs in two different modes depending on where it's deployed: diff --git a/fleetconfig-controller/docs/smoketests.md b/fleetconfig-controller/docs/smoketests.md index 8bef427c..a3008f30 100644 --- a/fleetconfig-controller/docs/smoketests.md +++ b/fleetconfig-controller/docs/smoketests.md @@ -16,24 +16,28 @@ mkdir -p $TARGET_DIR export KUBECONFIG=$TARGET_DIR/ocm-hub-as-spoke.kubeconfig ``` -1. Build & load the `fleetconfig-controller:latest` image +2. Build & load the `fleetconfig-controller:latest` image ```bash - IMAGE_FLAVOURS="fleetconfig-controller:./build/Dockerfile.base" make images && \ + IMAGE_FLAVOURS="base:./build/Dockerfile.base" make images && \ kind load docker-image quay.io/open-cluster-management/fleetconfig-controller:latest \ --name ocm-hub-as-spoke ``` -1. Install the `fleetconfig-controller` +3. Install the `fleetconfig-controller` ```bash devspace deploy -n fleetconfig-system ``` -1. Verify that the `FleetConfig` is reconciled successfully +4. Verify that the `Hub` and `Spoke` are reconciled successfully ```bash - kubectl wait --for=jsonpath='{.status.phase}'=Running fleetconfig/fleetconfig \ + kubectl wait --for=jsonpath='{.status.phase}'=Running hub/hub \ + -n fleetconfig-system \ + --timeout=10m + + kubectl wait --for=jsonpath='{.status.phase}'=Running spoke/hub-as-spoke \ -n fleetconfig-system \ --timeout=10m ``` @@ -48,32 +52,43 @@ mkdir -p $TARGET_DIR export KUBECONFIG=$TARGET_DIR/ocm-hub.kubeconfig ``` -1. Generate an internal kubeconfig for the `ocm-spoke` cluster and upload it to the `ocm-hub` cluster +2. 
Generate an internal kubeconfig for the `ocm-spoke` cluster and upload it to the `ocm-hub` cluster ```bash kind get kubeconfig --name ocm-spoke --internal > $TARGET_DIR/ocm-spoke-internal.kubeconfig - kubectl create secret generic test-fleetconfig-kubeconfig \ + kubectl create namespace fleetconfig-system + kubectl -n fleetconfig-system create secret generic test-spoke-kubeconfig \ --from-file=value=$TARGET_DIR/ocm-spoke-internal.kubeconfig ``` -1. Build & load the `fleetconfig-controller:local` image +3. Build & load the `fleetconfig-controller:local` image ```bash - IMAGE_FLAVOURS="fleetconfig-controller:./build/Dockerfile.base" IMAGE_TAG=local make images && \ - kind load docker-image quay.io/open-cluster-management/fleetconfig-controller:local \ - --name ocm-hub + IMAGE_FLAVOURS="base:./build/Dockerfile.base" IMAGE_REPO=fleetconfig-controller-local IMAGE_TAG=local make images && \ + kind load docker-image quay.io/open-cluster-management/fleetconfig-controller-local:local \ + --name ocm-hub && \ + kind load docker-image quay.io/open-cluster-management/fleetconfig-controller-local:local \ + --name ocm-spoke ``` -1. Install the `fleetconfig-controller` on the hub using the `deploy-local` pipeline +4. Install the `fleetconfig-controller` on the hub using the `deploy-local` pipeline ```bash devspace run-pipeline deploy-local -n fleetconfig-system --skip-build ``` -1. Verify that the `FleetConfig` is reconciled successfully +5. Verify that the `Hub` and `Spoke` are reconciled successfully ```bash - kubectl wait --for=jsonpath='{.status.phase}'=Running fleetconfig/fleetconfig \ + kubectl wait --for=jsonpath='{.status.phase}'=Running hub/hub \ + -n fleetconfig-system \ + --timeout=10m + + kubectl wait --for=jsonpath='{.status.phase}'=Running spoke/hub-as-spoke \ + -n fleetconfig-system \ + --timeout=10m + + kubectl wait --for=jsonpath='{.status.phase}'=Running spoke/spoke \ -n fleetconfig-system \ --timeout=10m ``` diff --git a/fleetconfig-controller/test/data/fleetconfig-values.yaml b/fleetconfig-controller/test/data/fleetconfig-values.yaml index 6899c5c0..f303f6e0 100644 --- a/fleetconfig-controller/test/data/fleetconfig-values.yaml +++ b/fleetconfig-controller/test/data/fleetconfig-values.yaml @@ -45,7 +45,7 @@ fleetConfig: kubeconfig: # secret is provisioned during E2E test setup secretReference: - name: "test-fleetconfig-kubeconfig" + name: "test-spoke-kubeconfig" kubeconfigKey: "value" klusterlet: annotations: diff --git a/fleetconfig-controller/test/e2e/helper.go b/fleetconfig-controller/test/e2e/helper.go index f0321e05..b0d6118c 100644 --- a/fleetconfig-controller/test/e2e/helper.go +++ b/fleetconfig-controller/test/e2e/helper.go @@ -35,7 +35,7 @@ import ( const ( fcNamespace = "fleetconfig-system" - spokeSecretName = "test-fleetconfig-kubeconfig" + spokeSecretName = "test-spoke-kubeconfig" klusterletAnnotationPrefix = "agent.open-cluster-management.io" kubeconfigSecretKey = "value" hubAsSpokeName = v1alpha1.ManagedClusterTypeHubAsSpoke From 7f07b1bc5a9bef698284ef7e76d4c9619b04a176 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Mon, 29 Sep 2025 11:37:37 -0700 Subject: [PATCH 16/62] docs: update diagrams Signed-off-by: Artur Shad Nik --- fleetconfig-controller/devspace.yaml | 2 + .../docs/2-phase-spoke-reconcile.md | 135 +++++++++++++----- 2 files changed, 104 insertions(+), 33 deletions(-) diff --git a/fleetconfig-controller/devspace.yaml b/fleetconfig-controller/devspace.yaml index 323ebbb1..47daa208 100644 --- a/fleetconfig-controller/devspace.yaml +++ 
b/fleetconfig-controller/devspace.yaml @@ -101,6 +101,8 @@ deployments: image: repository: ${IMAGE_REPOSITORY} tag: ${IMAGE_TAG} + fleetConfig: + enabled: ${FLEETCONFIG_ENABLED} valuesFiles: - ${CONTEXT}/charts/fleetconfig-controller/values.yaml updateImageTags: false diff --git a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md index 5d2e846f..fdcb5bdc 100644 --- a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md +++ b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md @@ -1,6 +1,6 @@ # Spoke Reconciler Walkthrough -For the purposes of this document, `Spoke controller` refers to the `SpokeReconciler` running on the spoke cluster, and `Hub controller` refers to the `SpokeReconciler` running on the hub cluster. This is different from the `HubReconciler`, which is a hub-only controller for reconciling the Hub resource. +For the purposes of this document, `Spoke Reconciler (Spoke)` refers to the `SpokeReconciler` running on the spoke cluster, and `Spoke Reconciler (Hub)` refers to the `SpokeReconciler` running on the hub cluster. This is different from the `HubReconciler`, which is a hub-only controller for reconciling the Hub resource. The Spoke reconciler runs in two different modes depending on where it's deployed: @@ -15,17 +15,17 @@ Note: *hub-as-spoke* clusters are a special case where the hub is registered as When a Spoke resource is created, the reconcilers add finalizers to control cleanup: -**Hub controller** adds: +**Spoke Reconciler (Hub)** adds: - `HubCleanupPreflightFinalizer` - removed when hub is ready for spoke to unjoin - `HubCleanupFinalizer` - removed after hub finishes cleanup - `SpokeCleanupFinalizer` - only for *hub-as-spoke* clusters -**Spoke controller** adds: +**Spoke Reconciler (Spoke)** adds: - `SpokeCleanupFinalizer` - removed after spoke finishes local cleanup ### 2. Day 1 Operations -**Hub controller**: +**Spoke Reconciler (Hub)**: 1. Check if spoke is already joined as a ManagedCluster 2. If not joined: run `clusteradm join` on the spoke, then `clusteradm accept` on the hub 3. Wait for ManagedClusterJoined condition @@ -34,28 +34,28 @@ When a Spoke resource is created, the reconcilers add finalizers to control clea ### 3. Day 2 Operations -**Spoke controller**: +**Spoke Reconciler (Spoke)**: 1. Set PivotComplete condition (spoke agent is now managing itself) 2. Check if klusterlet needs upgrading by comparing config hash and bundle version 3. If upgrade needed: run `clusteradm upgrade klusterlet` ### 4. Cleanup Process -The cleanup process coordinates between hub and spoke controllers using finalizers. +The cleanup process coordinates between hub and Spoke Reconciler (Spoke)s using finalizers. -**Hub controller**: +**Spoke Reconciler (Hub)**: 1. Check for active ManifestWorks (can't cleanup if cluster is still in use) 2. Disable addons (but keep fleetconfig-controller-agent running so spoke can unjoin) 3. Remove `HubCleanupPreflightFinalizer` (signals spoke to start unjoin) -**Spoke controller**: +**Spoke Reconciler (Spoke)**: 1. Wait for hub to remove `HubCleanupPreflightFinalizer` 2. Run `clusteradm unjoin` to deregister from hub 3. Remove klusterlet and OCM namespaces 4. Remove `SpokeCleanupFinalizer` (signals hub that unjoin is done) 5. Clean up remaining AppliedManifestWorks (at this point, there is only 1 - the fleetconfig-controller-agent) -**Hub controller**: +**Spoke Reconciler (Hub)**: 1. Wait for spoke to remove `SpokeCleanupFinalizer` 2. 
Clean up remaining AppliedManifestWorks (at this point, there is only 1 - the fleetconfig-controller-agent) 3. Remove `HubCleanupFinalizer` @@ -66,7 +66,7 @@ In total, the cleanup process is completed in 3 reconciles - 2 on the hub and 1 There are 2 special cases to consider: -- **hub-as-spoke**: The hub controller will also run the spoke-side cleanup steps right after the hub pre-flight cleanup steps. Cleanup completes in 2 reconciles. +- **hub-as-spoke**: The Spoke Reconciler (Hub) will also run the spoke-side cleanup steps right after the hub pre-flight cleanup steps. Cleanup completes in 2 reconciles. - **Failed Pivot**: The spoke agent never came up, so the hub will attempt the spoke-side cleanup steps right after the hub pre-flight cleanup steps. Cleanup completes in 2 reconciles. This allows for an "escape hatch" if the spoke agent never came up. This is the only case where a hub will perform day 2 operations on a spoke. It will never attempt upgrades on a spoke. @@ -77,15 +77,28 @@ This allows for an "escape hatch" if the spoke agent never came up. This is the - After the "pivot", the spoke kubeconfig secret can be safely deleted. The hub will no longer directly manage the spoke cluster. Instead, the agent will asynchronously pull updates from the hub and reconcile them locally. - Leveraging finalizers to coordinate cleanup tasks allows the controllers to operate independently and avoid direct communication. Otherwise, API calls between the manager and agent would be required to coordinate cleanup. -## Sequence Diagram +## Sequence Diagrams + +### Spoke Reconciles + +#### Actors: +- User: End user +- HubK8s: Hub cluster Kubernetes API server +- Spoke Reconciler (Hub): Hub-side instance of the fleetconfig-controller-manager SpokeReconciler +- SpokeK8s: Spoke cluster Kubernetes API server +- Spoke Reconciler (Spoke): Spoke-side instance of the fleetconfig-controller-agent SpokeReconciler +- Klusterlet: Klusterlet CR and related controllers installed on the spoke cluster + +#### Day 1 - Join ```mermaid sequenceDiagram participant User participant HubK8s - participant HubController as Hub Controller + participant HubController as Spoke Reconciler (Hub) participant SpokeK8s - participant SpokeController as Spoke Controller + participant SpokeController as Spoke Reconciler (Spoke) + participant Klusterlet Note over User, SpokeController: Initialization @@ -95,50 +108,70 @@ sequenceDiagram HubController->>HubK8s: Add HubCleanupFinalizer HubController->>HubK8s: Add SpokeCleanupFinalizer (hub-as-spoke only) - Note over HubController, SpokeController: Join Process HubController->>HubController: Check if ManagedCluster exists - HubController->>SpokeK8s: Run clusteradm join - SpokeK8s->>HubK8s: Join request - HubController->>HubK8s: Run clusteradm accept - HubController->>HubK8s: Wait for ManagedClusterJoined condition - HubController->>HubController: Spoke Joined + alt ManagedCluster does not exist + HubController->>Klusterlet: Create Klusterlet controllers (clusteradm join) + Klusterlet->>HubK8s: Create CSR + HubController->>HubK8s: Accept CSR (clusteradm accept) + HubController->>HubK8s: Wait for ManagedClusterJoined condition + HubController->>HubController: Spoke Joined + end Note over HubK8s, SpokeK8s: Addon Flow HubController->>HubK8s: Set up AddOnDeploymentConfigs for FCC-agent HubController->>HubK8s: Enable addons - HubK8s->>SpokeK8s: Install FCC-agent (initiates pivot) + HubK8s->>SpokeK8s: ClusterManager installs FCC-agent (initiates pivot) + +``` + +#### Day 2 - Maintanance 
+```mermaid +sequenceDiagram + participant HubK8s + participant SpokeController as Spoke Reconciler (Spoke) + participant Klusterlet - Note over HubK8s, SpokeController: Day 2 Flow - Pivot Complete + Note over HubK8s, Klusterlet: Mainenance Flow SpokeController->>HubK8s: Add SpokeCleanupFinalizer SpokeController->>HubK8s: Set PivotComplete condition - SpokeController->>HubK8s: Get Hub, klusterlet values - SpokeController->>SpokeK8s: Check klusterlet upgrade needed + SpokeController->>HubK8s: Get Hub, klusterlet helm values + SpokeController->>Klusterlet: Check klusterlet upgrade needed alt Upgrade needed - SpokeController->>SpokeK8s: Run clusteradm upgrade klusterlet + SpokeController->>Klusterlet: Run clusteradm upgrade klusterlet end +``` +#### Day 2 - Cleanup - Note over User, SpokeController: Cleanup Flow +```mermaid +sequenceDiagram + participant User + participant HubK8s + participant HubController as Spoke Reconciler (Hub) + participant SpokeK8s + participant SpokeController as Spoke Reconciler (Spoke) + participant Klusterlet + + Note over User, Klusterlet: Cleanup Flow User->>HubK8s: Delete Spoke resource HubK8s->>HubController: Spoke deletion requested HubController->>HubK8s: Set phase to "Deleting" - Note over HubK8s, HubController: Hub Cleanup Phase - HubController->>HubK8s: Check for active ManifestWorks + Note over HubK8s, HubController: Hub Pre-Flight Cleanup Phase + HubController->>HubK8s: Check for active, non-addon ManifestWorks alt Active ManifestWorks HubController->>HubController: Requeue with error end - HubController->>HubK8s: Disable addons (keep fleetconfig-controller-manager) + HubController->>HubK8s: Disable addons (keep fleetconfig-controller-agent) HubController->>HubK8s: Remove HubCleanupPreflightFinalizer Note over SpokeK8s, SpokeController: Spoke Cleanup Phase - SpokeController->>SpokeK8s: HubCleanupPreflightFinalizer removed? + SpokeController->>HubK8s: HubCleanupPreflightFinalizer removed? 
alt Not Removed SpokeController->>SpokeController: Requeue end - SpokeController->>SpokeK8s: Run clusteradm unjoin - SpokeController->>SpokeK8s: Remove klusterlet and OCM namespaces + SpokeController->>Klusterlet: Remove Klusterlet and OCM namespaces (clusteradm unjoin) SpokeController->>HubK8s: Remove SpokeCleanupFinalizer SpokeController->>SpokeK8s: Remove AppliedManifestWork (which removes FCC-agent) @@ -153,6 +186,42 @@ sequenceDiagram HubK8s->>User: Spoke resource deleted Note over HubController, SpokeController: Special Cases - Note right of HubController: Hub-as-spoke: Hub does both hub and spoke cleanup - Note right of HubController: Failed Pivot: Hub does spoke cleanup if agent never came up + Note right of HubK8s: Hub-as-spoke: Hub does both hub and spoke cleanup + Note right of HubK8s: Failed Pivot: Hub does spoke cleanup if agent never came up ``` + +### Hub Deletion + +#### Actors: +- User: End user +- HubK8s: Hub cluster Kubernetes API server +- Hub Reconciler: Controller responsible for reconciling the Hub resource +- ClusterManager: ClusterManager CR and related controllers installed on the hub cluster + +#### Cleanup Flow +```mermaid +sequenceDiagram + participant User + participant HubK8s + participant HubReconciler as Hub Reconciler + participant ClusterManager + + Note over User, ClusterManager: Hub Deletion + + User->>HubK8s: Delete Hub resource + HubK8s->>HubReconciler: Hub deletion requested + HubReconciler->>HubK8s: Set phase to "Deleting" + + Note over HubK8s, HubReconciler: Hub Pre-Flight Cleanup Phase + HubReconciler->>HubK8s: Check for Spoke resources with HubRef.Name/Namespace == Hub.Name/Namespace + alt Joined Spokes found + HubReconciler->>HubK8s: Mark Spokes for deletion (if not already marked) + HubReconciler->>HubReconciler: Requeue until all Spokes are deleted + end + + Note over HubK8s, ClusterManager: Hub Cleanup Phase + HubReconciler->>HubK8s: Delete all AddOns managed by FCC + HubReconciler->>ClusterManager: Delete ClusterManager, OCM namespaces (clusteradm clean) + HubReconciler->>HubK8s: Remove HubCleanupFinalizer + +``` \ No newline at end of file From 4008691be4bf8e1cf10d152ad529072444092abf Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Mon, 29 Sep 2025 12:31:32 -0700 Subject: [PATCH 17/62] docs: clarify wording, actions Signed-off-by: Artur Shad Nik --- .../docs/2-phase-spoke-reconcile.md | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md index fdcb5bdc..07710e05 100644 --- a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md +++ b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md @@ -87,20 +87,20 @@ This allows for an "escape hatch" if the spoke agent never came up. 
This is the - Spoke Reconciler (Hub): Hub-side instance of the fleetconfig-controller-manager SpokeReconciler - SpokeK8s: Spoke cluster Kubernetes API server - Spoke Reconciler (Spoke): Spoke-side instance of the fleetconfig-controller-agent SpokeReconciler -- Klusterlet: Klusterlet CR and related controllers installed on the spoke cluster +- Klusterlet CR: Klusterlet resource in the spoke cluster +- Klusterlet Controllers: Klusterlet Controllers installed on the spoke cluster #### Day 1 - Join ```mermaid sequenceDiagram participant User - participant HubK8s + participant HubK8s as Hub Cluster API Server participant HubController as Spoke Reconciler (Hub) - participant SpokeK8s - participant SpokeController as Spoke Reconciler (Spoke) - participant Klusterlet + participant SpokeK8s as Spoke Cluster API Server + participant KlusterletCtrl as Klusterlet Controllers - Note over User, SpokeController: Initialization + Note over User, HubController: Initialization User->>HubK8s: Create Spoke resource HubK8s->>HubController: Spoke resource created @@ -108,11 +108,11 @@ sequenceDiagram HubController->>HubK8s: Add HubCleanupFinalizer HubController->>HubK8s: Add SpokeCleanupFinalizer (hub-as-spoke only) - Note over HubController, SpokeController: Join Process + Note over HubController, KlusterletCtrl: Join Process HubController->>HubController: Check if ManagedCluster exists alt ManagedCluster does not exist - HubController->>Klusterlet: Create Klusterlet controllers (clusteradm join) - Klusterlet->>HubK8s: Create CSR + HubController->>SpokeK8s: Create Klusterlet and controllers (clusteradm join) + KlusterletCtrl->>HubK8s: Create CSR HubController->>HubK8s: Accept CSR (clusteradm accept) HubController->>HubK8s: Wait for ManagedClusterJoined condition HubController->>HubController: Spoke Joined @@ -121,16 +121,18 @@ sequenceDiagram Note over HubK8s, SpokeK8s: Addon Flow HubController->>HubK8s: Set up AddOnDeploymentConfigs for FCC-agent HubController->>HubK8s: Enable addons - HubK8s->>SpokeK8s: ClusterManager installs FCC-agent (initiates pivot) + KlusterletCtrl->>HubK8s: Work agent pulls addon ManifestWork + KlusterletCtrl->>SpokeK8s: Deploy FCC-agent addon (initiates pivot) ``` #### Day 2 - Maintanance ```mermaid sequenceDiagram - participant HubK8s + participant HubK8s as Hub Cluster API Server participant SpokeController as Spoke Reconciler (Spoke) - participant Klusterlet + participant SpokeK8s as Spoke Cluster API Server + participant Klusterlet as Klusterlet CR Note over HubK8s, Klusterlet: Mainenance Flow SpokeController->>HubK8s: Add SpokeCleanupFinalizer @@ -138,7 +140,7 @@ sequenceDiagram SpokeController->>HubK8s: Get Hub, klusterlet helm values SpokeController->>Klusterlet: Check klusterlet upgrade needed alt Upgrade needed - SpokeController->>Klusterlet: Run clusteradm upgrade klusterlet + SpokeController->>SpokeK8s: Run clusteradm upgrade klusterlet end ``` #### Day 2 - Cleanup @@ -146,13 +148,12 @@ sequenceDiagram ```mermaid sequenceDiagram participant User - participant HubK8s + participant HubK8s as Hub Cluster API Server participant HubController as Spoke Reconciler (Hub) - participant SpokeK8s + participant SpokeK8s as Spoke Cluster API Server participant SpokeController as Spoke Reconciler (Spoke) - participant Klusterlet - Note over User, Klusterlet: Cleanup Flow + Note over User, SpokeController: Cleanup Flow User->>HubK8s: Delete Spoke resource HubK8s->>HubController: Spoke deletion requested @@ -163,15 +164,15 @@ sequenceDiagram alt Active ManifestWorks 
HubController->>HubController: Requeue with error end - HubController->>HubK8s: Disable addons (keep fleetconfig-controller-agent) + HubController->>HubK8s: Disable addons (except fleetconfig-controller-agent) HubController->>HubK8s: Remove HubCleanupPreflightFinalizer - Note over SpokeK8s, SpokeController: Spoke Cleanup Phase + Note over HubK8s, SpokeController: Spoke Cleanup Phase SpokeController->>HubK8s: HubCleanupPreflightFinalizer removed? alt Not Removed SpokeController->>SpokeController: Requeue end - SpokeController->>Klusterlet: Remove Klusterlet and OCM namespaces (clusteradm unjoin) + SpokeController->>SpokeK8s: Remove Klusterlet and OCM namespaces (clusteradm unjoin) SpokeController->>HubK8s: Remove SpokeCleanupFinalizer SpokeController->>SpokeK8s: Remove AppliedManifestWork (which removes FCC-agent) @@ -180,12 +181,12 @@ sequenceDiagram alt Not Removed HubController->>HubController: Requeue end - HubController->>HubK8s: Clean up CSRs, AddOn ManifestWork Finalizer, ManagedCluster, namespace + HubController->>HubK8s: Clean up CSR, FCC-agent AddOn ManifestWork Finalizer, ManagedCluster, namespace HubController->>HubK8s: Remove HubCleanupFinalizer HubK8s->>User: Spoke resource deleted - Note over HubController, SpokeController: Special Cases + Note over HubK8s, HubController: Special Cases Note right of HubK8s: Hub-as-spoke: Hub does both hub and spoke cleanup Note right of HubK8s: Failed Pivot: Hub does spoke cleanup if agent never came up ``` @@ -202,11 +203,10 @@ sequenceDiagram ```mermaid sequenceDiagram participant User - participant HubK8s + participant HubK8s as Hub Cluster API Server participant HubReconciler as Hub Reconciler - participant ClusterManager - Note over User, ClusterManager: Hub Deletion + Note over User, HubReconciler: Hub Deletion User->>HubK8s: Delete Hub resource HubK8s->>HubReconciler: Hub deletion requested @@ -221,7 +221,7 @@ sequenceDiagram Note over HubK8s, ClusterManager: Hub Cleanup Phase HubReconciler->>HubK8s: Delete all AddOns managed by FCC - HubReconciler->>ClusterManager: Delete ClusterManager, OCM namespaces (clusteradm clean) + HubReconciler->>HubK8s: Delete ClusterManager, OCM namespaces (clusteradm clean) HubReconciler->>HubK8s: Remove HubCleanupFinalizer ``` \ No newline at end of file From 488f42116854b9f6889f61514d8e6caedf868577 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Mon, 29 Sep 2025 14:00:27 -0700 Subject: [PATCH 18/62] feat: add upgrade conditions Signed-off-by: Artur Shad Nik --- .../api/v1beta1/constants.go | 6 ++++++ .../docs/2-phase-spoke-reconcile.md | 2 +- .../controller/v1beta1/hub_controller.go | 17 +++++++++++++++- .../controller/v1beta1/spoke_controller.go | 3 +++ .../controller/v1beta1/spoke_handler.go | 20 ++++++++++++++++++- 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index b5e84f7a..e0215127 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -29,6 +29,12 @@ const ( // PivotComplete means that the spoke cluster has successfully started managing itself. PivotComplete = "PivotComplete" + + // KlusterletSynced means that Klusterlet's OCM bundle version and values are up to date. + KlusterletSynced = "KlusterletSynced" + + // HubUpgradeFailed means that the ClusterManager version upgrade failed. 
+ HubUpgradeFailed = "HubUpgradeFailed" ) // Hub and Spoke condition reasons diff --git a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md index 07710e05..17381698 100644 --- a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md +++ b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md @@ -126,7 +126,7 @@ sequenceDiagram ``` -#### Day 2 - Maintanance +#### Day 2 - Maintenance ```mermaid sequenceDiagram participant HubK8s as Hub Cluster API Server diff --git a/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go b/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go index fbc5a764..68d63848 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go @@ -137,6 +137,9 @@ func (r *HubReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R v1beta1.NewCondition( v1beta1.AddonsConfigured, v1beta1.AddonsConfigured, metav1.ConditionFalse, metav1.ConditionFalse, ), + v1beta1.NewCondition( + v1beta1.HubUpgradeFailed, v1beta1.HubUpgradeFailed, metav1.ConditionFalse, metav1.ConditionFalse, + ), } hub.SetConditions(false, initConditions...) @@ -336,11 +339,23 @@ func (r *HubReconciler) handleHub(ctx context.Context, hub *v1beta1.Hub, hubKube if hub.Spec.ClusterManager != nil { upgrade, err := r.hubNeedsUpgrade(ctx, hub, operatorC) if err != nil { + hub.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.HubUpgradeFailed, metav1.ConditionTrue, metav1.ConditionFalse, + )) return fmt.Errorf("failed to check if hub needs upgrade: %w", err) } if upgrade { - return r.upgradeHub(ctx, hub) + err = r.upgradeHub(ctx, hub) + if err != nil { + hub.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.HubUpgradeFailed, metav1.ConditionTrue, metav1.ConditionFalse, + )) + return fmt.Errorf("failed to upgrade hub: %w", err) + } } + hub.SetConditions(true, v1beta1.NewCondition( + v1beta1.HubUpgradeFailed, v1beta1.HubUpgradeFailed, metav1.ConditionFalse, metav1.ConditionFalse, + )) } return nil diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index 9cb692d2..97456c9d 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -151,6 +151,9 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl v1beta1.NewCondition( v1beta1.PivotComplete, v1beta1.PivotComplete, metav1.ConditionFalse, metav1.ConditionTrue, ), + v1beta1.NewCondition( + v1beta1.KlusterletSynced, v1beta1.KlusterletSynced, metav1.ConditionFalse, metav1.ConditionFalse, + ), } spoke.SetConditions(false, initConditions...) 
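Before the spoke_handler.go changes below, a brief aside on how the new conditions might be consumed, for example by tests or other controllers. This is a rough sketch only: the Spoke lookup mirrors the nil-safe pattern already used by PivotComplete, while the presence of an identical GetCondition helper on Hub is an assumption made here for illustration, not something this patch shows.

```go
package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1"
)

// klusterletInSync mirrors how PivotComplete reads a condition: nil-safe
// lookup, then a status comparison.
func klusterletInSync(spoke *v1beta1.Spoke) bool {
	c := spoke.GetCondition(v1beta1.KlusterletSynced)
	return c != nil && c.Status == metav1.ConditionTrue
}

// hubUpgradeHealthy assumes Hub exposes the same GetCondition helper; that is
// an illustration-only assumption.
func hubUpgradeHealthy(hub *v1beta1.Hub) bool {
	c := hub.GetCondition(v1beta1.HubUpgradeFailed)
	return c == nil || c.Status != metav1.ConditionTrue
}
```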
diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 00b8df09..802c8407 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -68,8 +68,12 @@ func (r *SpokeReconciler) cleanup(ctx context.Context, spoke *v1beta1.Spoke, hub func (r *SpokeReconciler) handleSpoke(ctx context.Context, spoke *v1beta1.Spoke, hubMeta hubMeta) error { klusterletValues, err := r.mergeKlusterletValues(ctx, spoke) if err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.KlusterletSynced, metav1.ConditionFalse, metav1.ConditionTrue, + )) return err } + switch r.ClusterType { case v1beta1.ClusterTypeHub: err = r.doHubWork(ctx, spoke, hubMeta, klusterletValues) @@ -79,12 +83,22 @@ func (r *SpokeReconciler) handleSpoke(ctx context.Context, spoke *v1beta1.Spoke, if spoke.IsHubAsSpoke() { // hub-as-spoke err = r.doSpokeWork(ctx, spoke, hubMeta.hub, klusterletValues) if err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.KlusterletSynced, metav1.ConditionFalse, metav1.ConditionTrue, + )) return err } } return nil case v1beta1.ClusterTypeSpoke: - return r.doSpokeWork(ctx, spoke, hubMeta.hub, klusterletValues) + err = r.doSpokeWork(ctx, spoke, hubMeta.hub, klusterletValues) + if err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.KlusterletSynced, metav1.ConditionFalse, metav1.ConditionTrue, + )) + return err + } + return nil default: // this is guarded against when the manager is initialized. should never reach this point panic(fmt.Sprintf("unknown cluster type %s. Must be one of %v", r.ClusterType, v1beta1.SupportedClusterTypes)) @@ -320,6 +334,10 @@ func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, } spoke.Status.KlusterletHash = currKlusterletHash + spoke.SetConditions(true, v1beta1.NewCondition( + v1beta1.KlusterletSynced, v1beta1.KlusterletSynced, metav1.ConditionTrue, metav1.ConditionTrue, + )) + return nil } From ea4da0cb065d452e8b521f3296d4b0d7708bffa2 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Mon, 29 Sep 2025 14:11:20 -0700 Subject: [PATCH 19/62] docs: break out work/reg controllers into separate actors Signed-off-by: Artur Shad Nik --- .../docs/2-phase-spoke-reconcile.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md index 17381698..77b98ecf 100644 --- a/fleetconfig-controller/docs/2-phase-spoke-reconcile.md +++ b/fleetconfig-controller/docs/2-phase-spoke-reconcile.md @@ -88,7 +88,8 @@ This allows for an "escape hatch" if the spoke agent never came up. 
This is the - SpokeK8s: Spoke cluster Kubernetes API server - Spoke Reconciler (Spoke): Spoke-side instance of the fleetconfig-controller-agent SpokeReconciler - Klusterlet CR: Klusterlet resource in the spoke cluster -- Klusterlet Controllers: Klusterlet Controllers installed on the spoke cluster +- RegAgent: OCM Klusterlet Registration Agent +- WorkAgent: OCM Klusterlet Work Agent #### Day 1 - Join @@ -99,6 +100,8 @@ sequenceDiagram participant HubController as Spoke Reconciler (Hub) participant SpokeK8s as Spoke Cluster API Server participant KlusterletCtrl as Klusterlet Controllers + participant RegAgent as Registration Agent + participant WorkAgent as Work Agent Note over User, HubController: Initialization @@ -112,7 +115,7 @@ sequenceDiagram HubController->>HubController: Check if ManagedCluster exists alt ManagedCluster does not exist HubController->>SpokeK8s: Create Klusterlet and controllers (clusteradm join) - KlusterletCtrl->>HubK8s: Create CSR + RegAgent->>HubK8s: Create CSR HubController->>HubK8s: Accept CSR (clusteradm accept) HubController->>HubK8s: Wait for ManagedClusterJoined condition HubController->>HubController: Spoke Joined @@ -121,8 +124,8 @@ sequenceDiagram Note over HubK8s, SpokeK8s: Addon Flow HubController->>HubK8s: Set up AddOnDeploymentConfigs for FCC-agent HubController->>HubK8s: Enable addons - KlusterletCtrl->>HubK8s: Work agent pulls addon ManifestWork - KlusterletCtrl->>SpokeK8s: Deploy FCC-agent addon (initiates pivot) + WorkAgent->>HubK8s: Pull FCC-agent addon ManifestWork + WorkAgent->>SpokeK8s: Deploy FCC-agent addon (initiates pivot) ``` From 3dd932e067d1a21f9562940075dfb845940a3ef3 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Mon, 29 Sep 2025 14:13:48 -0700 Subject: [PATCH 20/62] fix: clustermanager nil check before upgrade Signed-off-by: Artur Shad Nik --- .../internal/controller/v1beta1/spoke_handler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 802c8407..5d425a7f 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -320,7 +320,7 @@ func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, if err != nil { return fmt.Errorf("failed to compute hash of spoke %s klusterlet values: %w", spoke.Name, err) } - if hub != nil && hub.Spec.ClusterManager.Source.BundleVersion != "" { + if hub != nil && hub.Spec.ClusterManager != nil && hub.Spec.ClusterManager.Source.BundleVersion != "" { upgrade, err := r.spokeNeedsUpgrade(ctx, spoke, currKlusterletHash, hub.Spec.ClusterManager.Source, spokeKubeconfig) if err != nil { return fmt.Errorf("failed to check if spoke cluster needs upgrade: %w", err) From 157d0cb462f9145cbb3e57137e158bb42f65330c Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Mon, 29 Sep 2025 14:27:38 -0700 Subject: [PATCH 21/62] test: add new conditions to tests Signed-off-by: Artur Shad Nik --- .../internal/controller/v1beta1/hub_controller_test.go | 3 ++- .../internal/controller/v1beta1/spoke_controller_test.go | 3 ++- fleetconfig-controller/test/e2e/helper.go | 2 ++ fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/hub_controller_test.go b/fleetconfig-controller/internal/controller/v1beta1/hub_controller_test.go index 
b0f5ff0c..24ed8bec 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/hub_controller_test.go +++ b/fleetconfig-controller/internal/controller/v1beta1/hub_controller_test.go @@ -29,7 +29,7 @@ import ( "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/reconcile" - v1beta1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" ) var ( @@ -94,6 +94,7 @@ var _ = Describe("Hub Controller", Ordered, func() { v1beta1.HubInitialized: metav1.ConditionFalse, v1beta1.CleanupFailed: metav1.ConditionFalse, v1beta1.AddonsConfigured: metav1.ConditionFalse, + v1beta1.HubUpgradeFailed: metav1.ConditionFalse, })).To(Succeed()) }) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go index 503bcfed..c889c97a 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go @@ -29,7 +29,7 @@ import ( "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/reconcile" - v1beta1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" ) var ( @@ -133,6 +133,7 @@ var _ = Describe("Spoke Controller", Ordered, func() { v1beta1.CleanupFailed: metav1.ConditionFalse, v1beta1.AddonsConfigured: metav1.ConditionFalse, v1beta1.PivotComplete: metav1.ConditionFalse, + v1beta1.KlusterletSynced: metav1.ConditionFalse, })).To(Succeed()) }) diff --git a/fleetconfig-controller/test/e2e/helper.go b/fleetconfig-controller/test/e2e/helper.go index b0d6118c..02b17f82 100644 --- a/fleetconfig-controller/test/e2e/helper.go +++ b/fleetconfig-controller/test/e2e/helper.go @@ -482,12 +482,14 @@ func ensureHubAndSpokesProvisioned(tc *E2EContext, hub *v1beta1.Hub, spokes []*v "HubInitialized": metav1.ConditionTrue, "CleanupFailed": metav1.ConditionFalse, "AddonsConfigured": metav1.ConditionTrue, + "HubUpgradeFailed": metav1.ConditionFalse, } spokeExpectedConditions := map[string]metav1.ConditionStatus{ "SpokeJoined": metav1.ConditionTrue, "CleanupFailed": metav1.ConditionFalse, "AddonsConfigured": metav1.ConditionTrue, "PivotComplete": metav1.ConditionTrue, + "KlusterletSynced": metav1.ConditionTrue, } for k, v := range extraExpectedConditions { hubExpectedConditions[k] = v diff --git a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go index 6dd59171..2ad306e3 100644 --- a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go +++ b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go @@ -256,6 +256,7 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { v1beta1.HubInitialized: metav1.ConditionTrue, v1beta1.CleanupFailed: metav1.ConditionTrue, v1beta1.AddonsConfigured: metav1.ConditionTrue, + v1beta1.HubUpgradeFailed: metav1.ConditionFalse, }); err != nil { utils.WarnError(err, "Hub deletion not blocked") return err From 7028df8624805e789ddfd395a92673831e1115b8 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Tue, 30 Sep 2025 16:35:02 -0700 Subject: [PATCH 22/62] test: add upgrades to test; remove kconf secret during test Signed-off-by: Artur Shad Nik --- .../controller/v1beta1/spoke_handler.go | 12 ++-- .../test/e2e/v1beta1_hub_spoke.go | 61 +++++++++++++++++++ 2 files changed, 68 insertions(+), 
5 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 5d425a7f..e280c5f1 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -954,12 +954,14 @@ func (r *SpokeReconciler) getHubMeta(ctx context.Context, hubRef v1beta1.HubRef) return hubMeta, client.IgnoreNotFound(err) } hubMeta.hub = hub - // if found, load the hub's kubeconfig - hubKubeconfig, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, hub.Spec.Kubeconfig, hub.Namespace) - if err != nil { - return hubMeta, err + // load the hub's kubeconfig. only needed on the hub's reconciler instance - the spoke's instance can access the hub using its default client + if r.ClusterType != v1beta1.ClusterTypeSpoke { + hubKubeconfig, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, hub.Spec.Kubeconfig, hub.Namespace) + if err != nil { + return hubMeta, err + } + hubMeta.kubeconfig = hubKubeconfig } - hubMeta.kubeconfig = hubKubeconfig return hubMeta, nil } diff --git a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go index 2ad306e3..916135eb 100644 --- a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go +++ b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go @@ -30,6 +30,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ktypes "k8s.io/apimachinery/pkg/types" operatorv1 "open-cluster-management.io/api/operator/v1" + "open-cluster-management.io/ocm/pkg/operator/helpers/chart" + "sigs.k8s.io/controller-runtime/pkg/client" "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" "github.com/open-cluster-management-io/lab/fleetconfig-controller/pkg/common" @@ -117,6 +119,65 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { }, 1*time.Minute, 1*time.Second).Should(Succeed()) }) + It("should successfully upgrade spoke Klusterlet, with no kubeconfig secret", func() { + By("deleting the secret") + EventuallyWithOffset(1, func() error { + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: spokeSecretName, + Namespace: fcNamespace, + }, + } + err := tc.kClient.Delete(tc.ctx, secret) + if err != nil { + return client.IgnoreNotFound(err) + } + return nil + }, 1*time.Minute, 1*time.Second).Should(Succeed()) + + By("updating the klusterlet values and verifying that the upgrade is successful") + EventuallyWithOffset(1, func() error { + err := tc.kClient.Get(tc.ctx, v1beta1spokeNN, spokeClone) + if err != nil { + utils.WarnError(err, "failed to get spoke") + return err + } + newDuration := 5 * time.Second + spokeClone.Spec.Klusterlet.Values = &v1beta1.KlusterletChartConfig{ + KlusterletChartConfig: chart.KlusterletChartConfig{ + Klusterlet: chart.KlusterletConfig{ + WorkConfiguration: operatorv1.WorkAgentConfiguration{ + StatusSyncInterval: &metav1.Duration{ + Duration: newDuration, + }, + }, + }, + }, + } + err = tc.kClient.Update(tc.ctx, spokeClone) + if err != nil { + utils.WarnError(err, "failed to patch spoke") + return err + } + klusterlet := &operatorv1.Klusterlet{} + if err := tc.kClientSpoke.Get(tc.ctx, klusterletNN, klusterlet); err != nil { + utils.WarnError(err, "failed to get klusterlet") + return err + } + if klusterlet.Spec.WorkConfiguration == nil || klusterlet.Spec.WorkConfiguration.StatusSyncInterval == nil { + err = errors.New("klusterlet status sync interval is nil") + 
utils.WarnError(err, "klusterlet not upgraded") + return err + } + if klusterlet.Spec.WorkConfiguration.StatusSyncInterval.Duration != newDuration { + err = fmt.Errorf("wrong status sync interval found on Klusterlet. want: %s, got: %s", newDuration, klusterlet.Spec.WorkConfiguration.StatusSyncInterval.Duration) + utils.WarnError(err, "failed to upgrade klusterlet") + return err + } + return nil + }, 3*time.Minute, 5*time.Second).Should(Succeed()) + }) + It("should successfully create a namespace in the hub-as-spoke cluster", func() { By("creating a ManifestWork in the hub-as-spoke cluster namespace") From 67a83c4a1690d7553337ec03404adc64e657d278 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Wed, 1 Oct 2025 13:00:14 -0700 Subject: [PATCH 23/62] feat: add a 3rd instance type to enable fallback non-addon mode for EKS Signed-off-by: Artur Shad Nik --- .../api/v1beta1/constants.go | 20 ++-- .../charts/fleetconfig-controller/README.md | 41 +++---- .../templates/_helpers.tpl | 32 +++++- .../templates/clusterissuer.yaml | 1 - .../templates/deployment.yaml | 6 +- .../ocm/fcc-addon/addon-template.yaml | 25 +++-- .../fcc-addon/cluster-management-addon.yaml | 11 +- .../ocm/fcc-addon/cluster-role-binding.yaml | 2 + .../charts/fleetconfig-controller/values.yaml | 4 + fleetconfig-controller/cmd/main.go | 10 +- fleetconfig-controller/cmd/manager/manager.go | 6 +- .../internal/controller/v1beta1/addon.go | 62 +++++++++++ .../controller/v1beta1/spoke_controller.go | 18 ++- .../v1beta1/spoke_controller_test.go | 8 +- .../controller/v1beta1/spoke_handler.go | 103 +++++------------- 15 files changed, 206 insertions(+), 143 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index e0215127..2d8a0cd7 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -78,11 +78,14 @@ const ( ) const ( - // ClusterTypeHub indicates that the controller is running in a Hub cluster. - ClusterTypeHub = "hub" + // InstanceTypeManager indicates that the controller is running in a Hub cluster and only handles day 1 Spoke operations. + InstanceTypeManager = "manager" - // ClusterTypeSpoke indicates that the controller is running in a Spoke cluster. - ClusterTypeSpoke = "spoke" + // InstanceTypeAgent indicates that the controller is running in a Spoke cluster and only handles day 2 Spoke operations. + InstanceTypeAgent = "agent" + + // InstanceTypeUnified indicates that the controller is running in a Hub cluster and handles the entire lifecycle of Spoke resources. + InstanceTypeUnified = "unified" // HubKubeconfigEnvVar is the environment variable containing the path to the mounted Hub kubeconfig. HubKubeconfigEnvVar = "HUB_KUBECONFIG" @@ -112,10 +115,11 @@ const ( DefaultFCCManagerRole = "fleetconfig-controller-manager-role" ) -// SupportedClusterTypes are the valid cluster types that the controller can be installed in. -var SupportedClusterTypes = []string{ - ClusterTypeHub, - ClusterTypeSpoke, +// SupportedInstanceTypes are the valid cluster types that the controller can be installed in. 
+var SupportedInstanceTypes = []string{ + InstanceTypeManager, + InstanceTypeAgent, + InstanceTypeUnified, } // FleetConfig labels diff --git a/fleetconfig-controller/charts/fleetconfig-controller/README.md b/fleetconfig-controller/charts/fleetconfig-controller/README.md index 58993875..2a82b62d 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/README.md +++ b/fleetconfig-controller/charts/fleetconfig-controller/README.md @@ -149,26 +149,27 @@ Resource specifications for all klusterlet-managed containers. ### fleetconfig-controller parameters -| Name | Description | Value | -| --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------- | -| `spokeConcurrentReconciles` | Maximum number of Spoke resources that will be reconciled at the same time. | `5` | -| `kubernetesProvider` | Kubernetes provider of the cluster that fleetconfig-controller will be installed on. Valid values are "Generic", "EKS", "GKE-Ingress". | `Generic` | -| `replicas` | fleetconfig-controller replica count | `1` | -| `imageRegistry` | Image registry | `""` | -| `image.repository` | Image repository | `quay.io/open-cluster-management/fleetconfig-controller` | -| `image.tag` | Image tag | `v0.0.14` | -| `image.pullPolicy` | Image pull policy | `IfNotPresent` | -| `imagePullSecrets` | Image pull secrets | `[]` | -| `serviceAccount.annotations` | Annotations to add to the service account | `{}` | -| `containerSecurityContext.allowPrivilegeEscalation` | allowPrivilegeEscalation | `false` | -| `containerSecurityContext.capabilities.drop` | capabilities to drop | `["ALL"]` | -| `containerSecurityContext.runAsNonRoot` | runAsNonRoot | `true` | -| `resources.limits.cpu` | fleetconfig controller's cpu limit | `500m` | -| `resources.limits.memory` | fleetconfig controller's memory limit | `512Mi` | -| `resources.requests.cpu` | fleetconfig controller's cpu request | `200m` | -| `resources.requests.memory` | fleetconfig controller's memory request | `256Mi` | -| `healthCheck.port` | port the liveness & readiness probes are bound to | `9440` | -| `kubernetesClusterDomain` | kubernetes cluster domain | `cluster.local` | +| Name | Description | Value | +| --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------- | +| `spokeConcurrentReconciles` | Maximum number of Spoke resources that will be reconciled at the same time. | `5` | +| `addonMode` | Whether to run fleetconfig-controller in addon mode. Addon mode allows for decentralized day 2 management of spoke clusters. Not supported when kubernetesProvider is EKS. | `true` | +| `kubernetesProvider` | Kubernetes provider of the cluster that fleetconfig-controller will be installed on. Valid values are "Generic", "EKS", "GKE-Ingress". 
| `Generic` | +| `replicas` | fleetconfig-controller replica count | `1` | +| `imageRegistry` | Image registry | `""` | +| `image.repository` | Image repository | `quay.io/open-cluster-management/fleetconfig-controller` | +| `image.tag` | Image tag | `v0.0.14` | +| `image.pullPolicy` | Image pull policy | `IfNotPresent` | +| `imagePullSecrets` | Image pull secrets | `[]` | +| `serviceAccount.annotations` | Annotations to add to the service account | `{}` | +| `containerSecurityContext.allowPrivilegeEscalation` | allowPrivilegeEscalation | `false` | +| `containerSecurityContext.capabilities.drop` | capabilities to drop | `["ALL"]` | +| `containerSecurityContext.runAsNonRoot` | runAsNonRoot | `true` | +| `resources.limits.cpu` | fleetconfig controller's cpu limit | `500m` | +| `resources.limits.memory` | fleetconfig controller's memory limit | `512Mi` | +| `resources.requests.cpu` | fleetconfig controller's cpu request | `200m` | +| `resources.requests.memory` | fleetconfig controller's memory request | `256Mi` | +| `healthCheck.port` | port the liveness & readiness probes are bound to | `9440` | +| `kubernetesClusterDomain` | kubernetes cluster domain | `cluster.local` | ### cert-manager diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl b/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl index 758722bc..48cf038e 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl @@ -82,6 +82,19 @@ Generate feature gates string {{- end }} {{- end }} +{{/* +Get the Kubernetes provider +*/}} +{{- define "kubernetesProvider" -}} +{{- if and .Values.global .Values.global.kubernetesProvider -}} +{{- .Values.global.kubernetesProvider | lower -}} +{{- else if .Values.kubernetesProvider -}} +{{- .Values.kubernetesProvider | lower -}} +{{- else -}} +{{- "generic" -}} +{{- end -}} +{{- end -}} + {{/* Format the image name and tag for the given provider. For managed kubernetes providers, the image tag is suffixed with the provider name. @@ -91,12 +104,7 @@ This image has no additional binaries bundled, other than clusteradm. */}} {{- define "controller.image" -}} {{- $baseImage := printf "%s%s:%s" .Values.imageRegistry .Values.image.repository .Values.image.tag -}} -{{- $provider := "" -}} -{{- if and .Values.global .Values.global.kubernetesProvider -}} -{{- $provider = .Values.global.kubernetesProvider | lower -}} -{{- else if .Values.kubernetesProvider -}} -{{- $provider = .Values.kubernetesProvider | lower -}} -{{- end -}} +{{- $provider := include "kubernetesProvider" . -}} {{- if eq $provider "eks" -}} {{- printf "%s-%s" $baseImage $provider -}} {{- else if hasPrefix "gke" $provider -}} @@ -154,4 +162,16 @@ Works with arbitrary depth and handles maps, slices, and scalar values. {{- else -}} {} {{- end -}} +{{- end -}} + +{{/* +Check whether to run fleetconfig-controller in addon mode +*/}} +{{- define "addonMode" -}} +{{- $provider := include "kubernetesProvider" . 
-}} +{{- if eq $provider "eks" -}} +{{- false -}} +{{- else -}} +{{- .Values.addonMode -}} +{{- end -}} {{- end -}} \ No newline at end of file diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/clusterissuer.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/clusterissuer.yaml index 8da1e529..86895302 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/clusterissuer.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/clusterissuer.yaml @@ -1,6 +1,5 @@ {{- if .Values.certificates.clusterIssuer.enabled -}} # yaml-language-server: $schema=https://datreeio.github.io/CRDs-catalog/cert-manager.io/clusterissuer_v1.json - apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml index ee0673b6..6fd5981e 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml @@ -38,7 +38,11 @@ spec: - "--webhook-cert-dir={{ .Values.admissionWebhooks.certificate.mountPath }}" {{- end }} - "--spoke-concurrent-reconciles={{ .Values.spokeConcurrentReconciles }}" - - "--cluster-type=hub" + {{- if eq (include "addonMode" .) "true" }} + - "--instance-type=manager" + {{- else }} + - "--instance-type=unified" + {{- end }} command: - /manager env: diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml index 12760a25..e91e3dac 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -1,11 +1,11 @@ # yaml-language-server: $schema=https://datreeio.github.io/CRDs-catalog/addon.open-cluster-management.io/addontemplate_v1alpha1.json - +{{- if eq (include "addonMode" .) "true" }} apiVersion: addon.open-cluster-management.io/v1alpha1 kind: AddOnTemplate metadata: - name: fleetconfig-controller-manager + name: fleetconfig-controller-agent spec: - addonName: fleetconfig-controller-manager + addonName: fleetconfig-controller-agent agentSpec: workload: manifests: @@ -16,7 +16,7 @@ spec: - kind: Deployment apiVersion: apps/v1 metadata: - name: {{ include "chart.fullname" . }}-manager + name: {{ include "chart.fullname" . }}-agent namespace: {{ .Release.Namespace }} labels: control-plane: controller-manager @@ -41,7 +41,7 @@ spec: {{- end }} securityContext: {{- toYaml .Values.podSecurityContext | nindent 16 }} - serviceAccountName: {{ include "chart.fullname" . }}-manager + serviceAccountName: {{ include "chart.fullname" . }}-agent terminationGracePeriodSeconds: 10 containers: - args: @@ -49,7 +49,7 @@ spec: - "--health-probe-bind-address=:{{ .Values.healthCheck.port }}" - "--use-webhook=false" - "--spoke-concurrent-reconciles=1" - - "--cluster-type=spoke" + - "--instance-type=agent" command: - /manager env: @@ -79,12 +79,12 @@ spec: - kind: ServiceAccount apiVersion: v1 metadata: - name: {{ include "chart.fullname" . }}-manager + name: {{ include "chart.fullname" . }}-agent namespace: {{ .Release.Namespace }} - kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ include "chart.fullname" . 
}}-manager-spoke-role + name: {{ include "chart.fullname" . }}-agent-role rules: - apiGroups: [""] resources: ["namespaces"] @@ -113,14 +113,14 @@ spec: - kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ include "chart.fullname" . }}-manager-spoke-rolebinding + name: {{ include "chart.fullname" . }}-agent-rolebinding roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: {{ include "chart.fullname" . }}-manager-spoke-role + name: {{ include "chart.fullname" . }}-agent-role subjects: - kind: ServiceAccount - name: {{ include "chart.fullname" . }}-manager + name: {{ include "chart.fullname" . }}-agent namespace: {{ .Release.Namespace }} - kind: Role apiVersion: rbac.authorization.k8s.io/v1 @@ -152,7 +152,7 @@ spec: name: {{ include "chart.fullname" . }}-leader-election-role subjects: - kind: ServiceAccount - name: {{ include "chart.fullname" . }}-manager + name: {{ include "chart.fullname" . }}-agent namespace: {{ .Release.Namespace }} registration: - type: KubeClient @@ -161,3 +161,4 @@ spec: - type: CurrentCluster currentCluster: clusterRoleName: {{ include "chart.fullname" . }}-manager-role +{{- end }} diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml index 4ff2ef10..51dd5006 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-management-addon.yaml @@ -1,23 +1,24 @@ # yaml-language-server: $schema=https://datreeio.github.io/CRDs-catalog/addon.open-cluster-management.io/clustermanagementaddon_v1alpha1.json - +{{- if eq (include "addonMode" .) "true" }} apiVersion: addon.open-cluster-management.io/v1alpha1 kind: ClusterManagementAddOn metadata: - name: fleetconfig-controller-manager + name: fleetconfig-controller-agent spec: addOnMeta: - displayName: FleetConfig Controller Addon + displayName: FleetConfig Controller Agent description: | - fleetconfig-controller-manager is an addon to deploy fleetconfig-controller manager on the managed cluster. + fleetconfig-controller-agent is an addon to deploy a fleetconfig-controller agent on the managed cluster. It is used to enable decentralized management of spoke clusters. supportedConfigs: - group: addon.open-cluster-management.io resource: addontemplates defaultConfig: - name: fleetconfig-controller-manager + name: fleetconfig-controller-agent installStrategy: type: Manual # TODO - use `Placements` once ManagedClusters can be labeled immediately during the registration process. 
See https://github.com/open-cluster-management-io/ocm/issues/1195, https://github.com/open-cluster-management-io/ocm/pull/1123 # placements: # - namespace: managed-cluster-set-spokes # name: spokes +{{- end }} \ No newline at end of file diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml index 7cc049d8..7c9e3260 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/cluster-role-binding.yaml @@ -1,4 +1,5 @@ # required to grant addon-manager permissions to bind spoke addon agent's SA to the hub's fleetconfig-controller manager clusterrole +{{- if eq (include "addonMode" .) "true" }} apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: @@ -11,3 +12,4 @@ subjects: - kind: ServiceAccount name: addon-manager-controller-sa namespace: open-cluster-management-hub +{{- end }} \ No newline at end of file diff --git a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml index 617c13b0..05f0d762 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml @@ -273,6 +273,10 @@ topologyResources: ## @param spokeConcurrentReconciles Maximum number of Spoke resources that will be reconciled at the same time. spokeConcurrentReconciles: 5 +## @param addonMode Whether to run fleetconfig-controller in addon mode. Addon mode allows for decentralized day 2 management of spoke clusters. Not supported when kubernetesProvider is EKS. + +addonMode: true + ## @param kubernetesProvider Kubernetes provider of the cluster that fleetconfig-controller will be installed on. Valid values are "Generic", "EKS", "GKE-Ingress". kubernetesProvider: "Generic" diff --git a/fleetconfig-controller/cmd/main.go b/fleetconfig-controller/cmd/main.go index 5ac12b12..52bf9dfc 100644 --- a/fleetconfig-controller/cmd/main.go +++ b/fleetconfig-controller/cmd/main.go @@ -68,7 +68,7 @@ func main() { flag.IntVar(&mOpts.WebhookPort, "webhook-port", 9443, "Admission webhook port") flag.IntVar(&mOpts.SpokeConcurrentReconciles, "spoke-concurrent-reconciles", apiv1beta1.SpokeDefaultMaxConcurrentReconciles, fmt.Sprintf("Maximum number of Spoke resources that may be reconciled in parallel. Defaults to %d.", apiv1beta1.SpokeDefaultMaxConcurrentReconciles)) - flag.StringVar(&mOpts.ClusterType, "cluster-type", apiv1beta1.ClusterTypeHub, "The type of cluster that this controller instance is installed in.") + flag.StringVar(&mOpts.InstanceType, "instance-type", apiv1beta1.InstanceTypeManager, fmt.Sprintf("The type of cluster that this controller instance is installed in. 
Defaults to %s", apiv1beta1.InstanceTypeManager)) zOpts := zap.Options{ Development: true, @@ -83,21 +83,21 @@ func main() { err error ) - switch mOpts.ClusterType { - case apiv1beta1.ClusterTypeHub: + switch mOpts.InstanceType { + case apiv1beta1.InstanceTypeManager, apiv1beta1.InstanceTypeUnified: mgr, err = manager.ForHub(setupLog, mOpts) if err != nil { setupLog.Error(err, "unable to start manager") os.Exit(1) } - case apiv1beta1.ClusterTypeSpoke: + case apiv1beta1.InstanceTypeAgent: mgr, err = manager.ForSpoke(setupLog, mOpts) if err != nil { setupLog.Error(err, "unable to start manager") os.Exit(1) } default: - setupLog.Info("unable to create controller for unknown cluster type", "clusterType", mOpts.ClusterType, "allowed", apiv1beta1.SupportedClusterTypes) + setupLog.Info("unable to create controller for unknown instance type", "instanceType", mOpts.InstanceType, "allowed", apiv1beta1.SupportedInstanceTypes) os.Exit(1) } diff --git a/fleetconfig-controller/cmd/manager/manager.go b/fleetconfig-controller/cmd/manager/manager.go index 8199e698..87b4ffa1 100644 --- a/fleetconfig-controller/cmd/manager/manager.go +++ b/fleetconfig-controller/cmd/manager/manager.go @@ -40,7 +40,7 @@ type Options struct { CertDir string WebhookPort int SpokeConcurrentReconciles int - ClusterType string + InstanceType string Scheme *runtime.Scheme } @@ -121,7 +121,7 @@ func ForHub(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { Log: ctrl.Log.WithName("controllers").WithName("Spoke"), ConcurrentReconciles: opts.SpokeConcurrentReconciles, Scheme: mgr.GetScheme(), - ClusterType: opts.ClusterType, + InstanceType: opts.InstanceType, }).SetupWithManagerForHub(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Spoke") return nil, err @@ -248,7 +248,7 @@ func ForSpoke(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { Log: ctrl.Log.WithName("controllers").WithName("Spoke"), ConcurrentReconciles: opts.SpokeConcurrentReconciles, Scheme: mgr.GetScheme(), - ClusterType: opts.ClusterType, + InstanceType: opts.InstanceType, }).SetupWithManagerForSpoke(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Spoke") return nil, err diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 14c20d31..1dccdfba 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "net/url" + "os" "os/exec" "slices" "strings" @@ -12,6 +13,7 @@ import ( "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" kerrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" @@ -724,3 +726,63 @@ func allOwnersAddOns(mws []workv1.ManifestWork) bool { } return true } + +// bindAddonAgent creates the necessary bindings for fcc agent to access hub resources +func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spoke) error { + roleName := os.Getenv(v1beta1.RoleNameEnvVar) + if roleName == "" { + roleName = v1beta1.DefaultFCCManagerRole + } + + roleRef := rbacv1.RoleRef{ + Kind: "ClusterRole", + APIGroup: rbacv1.GroupName, + Name: roleName, + } + + err := r.createBinding(ctx, roleRef, spoke.Namespace, spoke.Name) + if err != nil { + return err + } + if spoke.Spec.HubRef.Namespace != spoke.Namespace { + err = r.createBinding(ctx, roleRef, 
spoke.Spec.HubRef.Namespace, spoke.Name) + if err != nil { + return err + } + } + return nil +} + +// createBinding creates a binding for a given role +func (r *SpokeReconciler) createBinding(ctx context.Context, roleRef rbacv1.RoleRef, namespace, spokeName string) error { + binding := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("open-cluster-management:%s:%s:agent-%s", // this is a different naming format than OCM uses for addon agents. we need to append the spoke name to avoid possible conflicts in cases where multiple spokes exist in 1 namespace + v1beta1.FCCAddOnName, strings.ToLower(roleRef.Kind), spokeName), + Namespace: namespace, + Labels: map[string]string{ + addonv1alpha1.AddonLabelKey: v1beta1.FCCAddOnName, + }, + }, + RoleRef: roleRef, + Subjects: []rbacv1.Subject{ + { + Kind: rbacv1.GroupKind, + APIGroup: rbacv1.GroupName, + Name: clusterAddonGroup(spokeName, v1beta1.FCCAddOnName), + }, + }, + } + + err := r.Create(ctx, binding, &client.CreateOptions{}) + if err != nil { + return client.IgnoreAlreadyExists(err) + } + return nil +} + +// clusterAddonGroup returns the group that represents the addon for the cluster +// ref: https://github.com/open-cluster-management-io/ocm/blob/main/pkg/addon/templateagent/registration.go#L484 +func clusterAddonGroup(clusterName, addonName string) string { + return fmt.Sprintf("system:open-cluster-management:cluster:%s:addon:%s", clusterName, addonName) +} diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index 97456c9d..2cf978a6 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -48,7 +48,7 @@ type SpokeReconciler struct { Log logr.Logger Scheme *runtime.Scheme ConcurrentReconciles int - ClusterType string + InstanceType string } // +kubebuilder:rbac:groups=fleetconfig.open-cluster-management.io,resources=spokes,verbs=get;list;watch;create;update;patch;delete @@ -91,8 +91,8 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl spoke.Status.Phase = v1beta1.Unhealthy } - switch r.ClusterType { - case v1beta1.ClusterTypeHub: + switch r.InstanceType { + case v1beta1.InstanceTypeManager: if !slices.Contains(spoke.Finalizers, v1beta1.HubCleanupFinalizer) { setDefaults(ctx, spoke, hubMeta) spoke.Finalizers = append( @@ -105,14 +105,22 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl } return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil) } - case v1beta1.ClusterTypeSpoke: + case v1beta1.InstanceTypeUnified: + setDefaults(ctx, spoke, hubMeta) + spoke.Finalizers = append( + spoke.Finalizers, + v1beta1.HubCleanupPreflightFinalizer, // removed by the hub to signal to the spoke that preflight is completed + v1beta1.SpokeCleanupFinalizer, // removed by the hub after successful unjoin + v1beta1.HubCleanupFinalizer, // removed by the hub after post-unjoin cleanup is finished + ) + case v1beta1.InstanceTypeAgent: if !slices.Contains(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) && spoke.DeletionTimestamp.IsZero() { spoke.Finalizers = append(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) // removed by the spoke to signal to the hub that unjoin succeeded return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil) } default: // this is guarded against when the manager is initialized. 
should never reach this point - panic(fmt.Sprintf("unknown cluster type %s. Must be one of %v", r.ClusterType, v1beta1.SupportedClusterTypes)) + panic(fmt.Sprintf("unknown instance type %s. Must be one of %v", r.InstanceType, v1beta1.SupportedInstanceTypes)) } // Handle deletion logic with finalizer diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go index c889c97a..fa8dc455 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller_test.go @@ -71,10 +71,10 @@ var _ = Describe("Spoke Controller", Ordered, func() { Namespace: "default", } spokeReconciler = &SpokeReconciler{ - Client: k8sClient, - Log: logr.Logger{}, - Scheme: k8sClient.Scheme(), - ClusterType: v1beta1.ClusterTypeHub, + Client: k8sClient, + Log: logr.Logger{}, + Scheme: k8sClient.Scheme(), + InstanceType: v1beta1.InstanceTypeManager, } spoke = &v1beta1.Spoke{ ObjectMeta: metav1.ObjectMeta{ diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index e280c5f1..2a607113 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -9,12 +9,10 @@ import ( "os" "os/exec" "slices" - "strings" "dario.cat/mergo" certificatesv1 "k8s.io/api/certificates/v1" corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" kerrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -38,8 +36,8 @@ import ( // cleanup cleans up a Spoke and its associated resources. func (r *SpokeReconciler) cleanup(ctx context.Context, spoke *v1beta1.Spoke, hubKubeconfig []byte) error { - switch r.ClusterType { - case v1beta1.ClusterTypeHub: + switch r.InstanceType { + case v1beta1.InstanceTypeManager: originalSpoke, ok := ctx.Value(originalSpokeKey).(*v1beta1.Spoke) // use the original object to check conditions/finalizers if !ok { originalSpoke = spoke.DeepCopy() @@ -56,11 +54,17 @@ func (r *SpokeReconciler) cleanup(ctx context.Context, spoke *v1beta1.Spoke, hub } } return nil - case v1beta1.ClusterTypeSpoke: + case v1beta1.InstanceTypeUnified: + err := r.doHubCleanup(ctx, spoke, hubKubeconfig, false) + if err != nil { + return err + } + return r.doSpokeCleanup(ctx, spoke, false) + case v1beta1.InstanceTypeAgent: return r.doSpokeCleanup(ctx, spoke, true) default: // this is guarded against when the manager is initialized. should never reach this point - panic(fmt.Sprintf("unknown cluster type %s. Must be one of %v", r.ClusterType, v1beta1.SupportedClusterTypes)) + panic(fmt.Sprintf("unknown cluster type %s. 
Must be one of %v", r.InstanceType, v1beta1.SupportedInstanceTypes)) } } @@ -74,8 +78,8 @@ func (r *SpokeReconciler) handleSpoke(ctx context.Context, spoke *v1beta1.Spoke, return err } - switch r.ClusterType { - case v1beta1.ClusterTypeHub: + switch r.InstanceType { + case v1beta1.InstanceTypeManager: err = r.doHubWork(ctx, spoke, hubMeta, klusterletValues) if err != nil { return err @@ -90,7 +94,20 @@ func (r *SpokeReconciler) handleSpoke(ctx context.Context, spoke *v1beta1.Spoke, } } return nil - case v1beta1.ClusterTypeSpoke: + case v1beta1.InstanceTypeUnified: + err = r.doHubWork(ctx, spoke, hubMeta, klusterletValues) + if err != nil { + return err + } + err = r.doSpokeWork(ctx, spoke, hubMeta.hub, klusterletValues) + if err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.KlusterletSynced, metav1.ConditionFalse, metav1.ConditionTrue, + )) + return err + } + return nil + case v1beta1.InstanceTypeAgent: err = r.doSpokeWork(ctx, spoke, hubMeta.hub, klusterletValues) if err != nil { spoke.SetConditions(true, v1beta1.NewCondition( @@ -101,7 +118,7 @@ func (r *SpokeReconciler) handleSpoke(ctx context.Context, spoke *v1beta1.Spoke, return nil default: // this is guarded against when the manager is initialized. should never reach this point - panic(fmt.Sprintf("unknown cluster type %s. Must be one of %v", r.ClusterType, v1beta1.SupportedClusterTypes)) + panic(fmt.Sprintf("unknown cluster type %s. Must be one of %v", r.InstanceType, v1beta1.SupportedInstanceTypes)) } } @@ -242,66 +259,6 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h return nil } -// bindAddonAgent creates the necessary bindings for fcc agent to access hub resources -func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spoke) error { - roleName := os.Getenv(v1beta1.RoleNameEnvVar) - if roleName == "" { - roleName = v1beta1.DefaultFCCManagerRole - } - - roleRef := rbacv1.RoleRef{ - Kind: "ClusterRole", - APIGroup: rbacv1.GroupName, - Name: roleName, - } - - err := r.createBinding(ctx, roleRef, spoke.Namespace, spoke.Name) - if err != nil { - return err - } - if spoke.Spec.HubRef.Namespace != spoke.Namespace { - err = r.createBinding(ctx, roleRef, spoke.Spec.HubRef.Namespace, spoke.Name) - if err != nil { - return err - } - } - return nil -} - -// createBinding creates a binding for a given role -func (r *SpokeReconciler) createBinding(ctx context.Context, roleRef rbacv1.RoleRef, namespace, spokeName string) error { - binding := &rbacv1.RoleBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("open-cluster-management:%s:%s:agent-%s", // this is a different naming format than OCM uses for addon agents. 
we need to append the spoke name to avoid possible conflicts in cases where multiple spokes exist in 1 namespace - v1beta1.FCCAddOnName, strings.ToLower(roleRef.Kind), spokeName), - Namespace: namespace, - Labels: map[string]string{ - addonv1alpha1.AddonLabelKey: v1beta1.FCCAddOnName, - }, - }, - RoleRef: roleRef, - Subjects: []rbacv1.Subject{ - { - Kind: rbacv1.GroupKind, - APIGroup: rbacv1.GroupName, - Name: clusterAddonGroup(spokeName, v1beta1.FCCAddOnName), - }, - }, - } - - err := r.Create(ctx, binding, &client.CreateOptions{}) - if err != nil { - return client.IgnoreAlreadyExists(err) - } - return nil -} - -// clusterAddonGroup returns the group that represents the addon for the cluster -// ref: https://github.com/open-cluster-management-io/ocm/blob/main/pkg/addon/templateagent/registration.go#L484 -func clusterAddonGroup(clusterName, addonName string) string { - return fmt.Sprintf("system:open-cluster-management:cluster:%s:addon:%s", clusterName, addonName) -} - // doSpokeWork handles spoke-side work such as upgrades func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, hub *v1beta1.Hub, klusterletValues *v1beta1.KlusterletChartConfig) error { logger := log.FromContext(ctx) @@ -507,8 +464,8 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo return err } - // hub-as-spoke/failed pivot case, no further cleanup needed - clusteradm unjoin will have handled it all - if r.ClusterType == v1beta1.ClusterTypeHub { + // unified manager/hub-as-spoke/failed pivot case, no further cleanup needed - clusteradm unjoin will have handled it all + if r.InstanceType != v1beta1.InstanceTypeAgent { spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { return s == v1beta1.SpokeCleanupFinalizer }) @@ -955,7 +912,7 @@ func (r *SpokeReconciler) getHubMeta(ctx context.Context, hubRef v1beta1.HubRef) } hubMeta.hub = hub // load the hub's kubeconfig. only needed on the hub's reconciler instance - the spoke's instance can access the hub using its default client - if r.ClusterType != v1beta1.ClusterTypeSpoke { + if r.InstanceType != v1beta1.InstanceTypeAgent { hubKubeconfig, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, hub.Spec.Kubeconfig, hub.Namespace) if err != nil { return hubMeta, err From c29cc447c6bf2cc05d19a3f3aeda0fb89db21ba5 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Wed, 1 Oct 2025 13:20:07 -0700 Subject: [PATCH 24/62] fix: update all spoke manager references to be agent Signed-off-by: Artur Shad Nik --- fleetconfig-controller/api/v1beta1/constants.go | 4 ++-- .../charts/fleetconfig-controller/values.yaml | 1 - fleetconfig-controller/config/devspace/spoke/manager.yaml | 8 ++++---- fleetconfig-controller/devspace.yaml | 2 +- .../internal/controller/v1beta1/addon.go | 8 ++++---- .../internal/controller/v1beta1/constants.go | 2 +- .../internal/controller/v1beta1/spoke_handler.go | 4 ++-- .../internal/webhook/v1beta1/validation.go | 4 ++-- fleetconfig-controller/test/data/fleetconfig-values.yaml | 2 +- 9 files changed, 17 insertions(+), 18 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index 2d8a0cd7..87d51c36 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -108,8 +108,8 @@ const ( // RoleNameEnvVar containing the name of the ClusterRole for fleetconfig-controller-manager. 
RoleNameEnvVar = "ROLE_NAME" - // FCCAddOnName is the name of the fleetconfig-controller-addon - FCCAddOnName = "fleetconfig-controller-manager" + // FCCAddOnName is the name of the fleetconfig-controller addon + FCCAddOnName = "fleetconfig-controller-agent" // DefaultFCCManagerRole is the default name of the fleetconfig-controller-manager ClusterRole DefaultFCCManagerRole = "fleetconfig-controller-manager-role" diff --git a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml index 05f0d762..70ef64e6 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml @@ -274,7 +274,6 @@ topologyResources: spokeConcurrentReconciles: 5 ## @param addonMode Whether to run fleetconfig-controller in addon mode. Addon mode allows for decentralized day 2 management of spoke clusters. Not supported when kubernetesProvider is EKS. - addonMode: true ## @param kubernetesProvider Kubernetes provider of the cluster that fleetconfig-controller will be installed on. Valid values are "Generic", "EKS", "GKE-Ingress". diff --git a/fleetconfig-controller/config/devspace/spoke/manager.yaml b/fleetconfig-controller/config/devspace/spoke/manager.yaml index aa20e251..2d957d07 100644 --- a/fleetconfig-controller/config/devspace/spoke/manager.yaml +++ b/fleetconfig-controller/config/devspace/spoke/manager.yaml @@ -1,7 +1,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: fleetconfig-controller-manager + name: fleetconfig-controller-agent spec: replicas: 1 selector: @@ -14,9 +14,9 @@ spec: app.kubernetes.io/instance: fleetconfig-controller app.kubernetes.io/name: fleetconfig-controller spec: - serviceAccountName: fleetconfig-controller-manager + serviceAccountName: fleetconfig-controller-agent containers: - - name: fleetconfig-controller-manager + - name: fleetconfig-controller-agent env: - name: KUBERNETES_CLUSTER_DOMAIN value: cluster.local @@ -53,4 +53,4 @@ spec: - name: hub-kubeconfig secret: defaultMode: 420 - secretName: fleetconfig-controller-manager-hub-kubeconfig \ No newline at end of file + secretName: fleetconfig-controller-agent-hub-kubeconfig \ No newline at end of file diff --git a/fleetconfig-controller/devspace.yaml b/fleetconfig-controller/devspace.yaml index 47daa208..fd15d246 100644 --- a/fleetconfig-controller/devspace.yaml +++ b/fleetconfig-controller/devspace.yaml @@ -61,7 +61,7 @@ pipelines: debug-spoke: |- run_dependencies --all build_images fleetconfig-controller-dev - kubectl -n fleetconfig-system delete deployment fleetconfig-controller-manager --ignore-not-found + kubectl -n fleetconfig-system delete deployment fleetconfig-controller-agent --ignore-not-found create_deployments debug-spoke start_dev fleetconfig-controller-dev-spoke diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 1dccdfba..badddd8b 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -410,7 +410,7 @@ func handleAddonEnable(ctx context.Context, spoke *v1beta1.Spoke, addons []v1bet func patchFCCMca(ctx context.Context, spokeName string, addonC *addonapi.Clientset) error { mca, err := addonC.AddonV1alpha1().ManagedClusterAddOns(spokeName).Get(ctx, v1beta1.FCCAddOnName, metav1.GetOptions{}) if err != nil { - return fmt.Errorf("failed to configure fleetconfig-controller-manager: 
%v", err) + return fmt.Errorf("failed to configure %s: %v", v1beta1.FCCAddOnName, err) } desired := addonv1alpha1.AddOnConfig{ ConfigGroupResource: addonv1alpha1.ConfigGroupResource{ @@ -435,7 +435,7 @@ func patchFCCMca(ctx context.Context, spokeName string, addonC *addonapi.Clients "spec": map[string]any{"configs": mca.Spec.Configs}, }) if err != nil { - return fmt.Errorf("failed to marshal patch for fleetconfig-controller-manager: %v", err) + return fmt.Errorf("failed to marshal patch for %s: %v", v1beta1.FCCAddOnName, err) } if _, err = addonC.AddonV1alpha1().ManagedClusterAddOns(spokeName).Patch( ctx, @@ -444,7 +444,7 @@ func patchFCCMca(ctx context.Context, spokeName string, addonC *addonapi.Clients patchBytes, metav1.PatchOptions{}, ); err != nil { - return fmt.Errorf("failed to patch fleetconfig-controller-manager: %v", err) + return fmt.Errorf("failed to patch %S: %v", v1beta1.FCCAddOnName, err) } return nil } @@ -690,7 +690,7 @@ func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Client } // for hub-as-spoke, or if the pivot failed, all addons must be removed. - // otherwise, fleetconfig-controller-manager must not be removed. + // otherwise, fleetconfig-controller-agent must not be removed. var expectedWorks = 0 if !shouldCleanAll { expectedWorks = 1 diff --git a/fleetconfig-controller/internal/controller/v1beta1/constants.go b/fleetconfig-controller/internal/controller/v1beta1/constants.go index 327472a7..4bf7d146 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/constants.go +++ b/fleetconfig-controller/internal/controller/v1beta1/constants.go @@ -34,5 +34,5 @@ const ( manifestWorkAddOnLabelKey = "open-cluster-management.io/addon-name" - manifestWorkAddOnLabelValueFcc = "fleetconfig-controller-manager" + manifestWorkAddOnLabelValueFcc = "fleetconfig-controller-agent" ) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 2a607113..328bab3a 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -219,7 +219,7 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h if !spoke.IsHubAsSpoke() { adc := &addonv1alpha1.AddOnDeploymentConfig{ ObjectMeta: metav1.ObjectMeta{ - Name: "fleetconfig-controller-manager", + Name: v1beta1.FCCAddOnName, Namespace: spoke.Name, }, Spec: addonv1alpha1.AddOnDeploymentConfigSpec{ @@ -339,7 +339,7 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke spokeCopy.Spec.AddOns = nil // for hub-as-spoke, or if the addon agent never came up, disable all addons - // otherwise, leave fleetconfig-controller-manager addon running so that it can do deregistration + // otherwise, leave fleetconfig-controller-agent addon running so that it can do deregistration shouldCleanAll := spoke.IsHubAsSpoke() || !pivotComplete if !shouldCleanAll { diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index cd074d6b..7ea2312d 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -371,13 +371,13 @@ func validateAddons(ctx context.Context, cli client.Client, newObject *v1beta1.S if slices.ContainsFunc(newObject.Spec.AddOns, func(a v1beta1.AddOn) bool { return a.ConfigName == v1beta1.FCCAddOnName }) { - errs = 
append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "hub-as-spoke Spoke cannot enable fleetconfig-controller-manager addon")) + errs = append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "hub-as-spoke Spoke cannot enable fleetconfig-controller-agent addon")) } } else { if !slices.ContainsFunc(newObject.Spec.AddOns, func(a v1beta1.AddOn) bool { return a.ConfigName == v1beta1.FCCAddOnName }) { - errs = append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "Spoke must enable fleetconfig-controller-manager addon")) + errs = append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "Spoke must enable fleetconfig-controller-agent addon")) } } diff --git a/fleetconfig-controller/test/data/fleetconfig-values.yaml b/fleetconfig-controller/test/data/fleetconfig-values.yaml index f303f6e0..90296a1a 100644 --- a/fleetconfig-controller/test/data/fleetconfig-values.yaml +++ b/fleetconfig-controller/test/data/fleetconfig-values.yaml @@ -39,7 +39,7 @@ fleetConfig: namespace: fleetconfig-system addOns: - configName: test-addon - - configName: fleetconfig-controller-manager + - configName: fleetconfig-controller-agent createNamespace: true syncLabels: false kubeconfig: From 2d6f6fbfc4a665f28717cb4cf14ed75ff9315ea9 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Wed, 1 Oct 2025 13:35:37 -0700 Subject: [PATCH 25/62] fix: make reviewable Signed-off-by: Artur Shad Nik --- fleetconfig-controller/internal/controller/v1beta1/addon.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index badddd8b..6bdf3beb 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -444,7 +444,7 @@ func patchFCCMca(ctx context.Context, spokeName string, addonC *addonapi.Clients patchBytes, metav1.PatchOptions{}, ); err != nil { - return fmt.Errorf("failed to patch %S: %v", v1beta1.FCCAddOnName, err) + return fmt.Errorf("failed to patch %s: %v", v1beta1.FCCAddOnName, err) } return nil } From a8ea9f83c31bd6f65f44521fc6505e89a4726888 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Wed, 1 Oct 2025 14:50:41 -0700 Subject: [PATCH 26/62] fix: only set finalizers once; validate addons Signed-off-by: Artur Shad Nik --- fleetconfig-controller/cmd/manager/manager.go | 2 +- fleetconfig-controller/devspace.yaml | 12 +++++++++++- .../controller/v1beta1/spoke_controller.go | 16 +++++++++------- .../internal/webhook/v1beta1/spoke_webhook.go | 13 +++++++------ .../internal/webhook/v1beta1/validation.go | 6 +++--- .../webhook/v1beta1/webhook_suite_test.go | 6 +++--- 6 files changed, 34 insertions(+), 21 deletions(-) diff --git a/fleetconfig-controller/cmd/manager/manager.go b/fleetconfig-controller/cmd/manager/manager.go index 87b4ffa1..33ddedd3 100644 --- a/fleetconfig-controller/cmd/manager/manager.go +++ b/fleetconfig-controller/cmd/manager/manager.go @@ -137,7 +137,7 @@ func ForHub(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { setupLog.Error(err, "unable to create webhook", "webhook", "Hub") return nil, err } - if err := webhookv1beta1.SetupSpokeWebhookWithManager(mgr); err != nil { + if err := webhookv1beta1.SetupSpokeWebhookWithManager(mgr, opts.InstanceType); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Spoke") return nil, err } diff 
--git a/fleetconfig-controller/devspace.yaml b/fleetconfig-controller/devspace.yaml index fd15d246..28ab53e1 100644 --- a/fleetconfig-controller/devspace.yaml +++ b/fleetconfig-controller/devspace.yaml @@ -18,6 +18,8 @@ vars: DEVSPACE_ENV_FILE: './hack/.versions.env' FLEETCONFIG_ENABLED: value: true + ADDON_MODE: + value: true # profiles are used to set the fleetconfig-controller values for the different versions of the fleetconfig-controller API profiles: @@ -25,12 +27,17 @@ profiles: patches: - path: vars.FLEETCONFIG_ENABLED.value op: replace - value: false + value: false - name: v1beta1 patches: - path: vars.FLEETCONFIG_ENABLED.value op: replace value: true + - name: unified + patches: + - path: vars.ADDON_MODE.value + op: replace + value: false pipelines: dev: |- @@ -103,6 +110,7 @@ deployments: tag: ${IMAGE_TAG} fleetConfig: enabled: ${FLEETCONFIG_ENABLED} + addonMode: ${ADDON_MODE} valuesFiles: - ${CONTEXT}/charts/fleetconfig-controller/values.yaml updateImageTags: false @@ -115,6 +123,7 @@ deployments: devspaceEnabled: true fleetConfig: enabled: ${FLEETCONFIG_ENABLED} + addonMode: ${ADDON_MODE} valuesFiles: - ${CONTEXT}/charts/fleetconfig-controller/values.yaml @@ -128,6 +137,7 @@ deployments: image: repository: ${IMAGE_REPOSITORY}-local tag: local + addonMode: ${ADDON_MODE} valuesFiles: - ${CONTEXT}/charts/fleetconfig-controller/values.yaml - ${CONTEXT}/test/data/fleetconfig-values.yaml diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index 2cf978a6..ae1db081 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -106,13 +106,15 @@ func (r *SpokeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil) } case v1beta1.InstanceTypeUnified: - setDefaults(ctx, spoke, hubMeta) - spoke.Finalizers = append( - spoke.Finalizers, - v1beta1.HubCleanupPreflightFinalizer, // removed by the hub to signal to the spoke that preflight is completed - v1beta1.SpokeCleanupFinalizer, // removed by the hub after successful unjoin - v1beta1.HubCleanupFinalizer, // removed by the hub after post-unjoin cleanup is finished - ) + if !slices.Contains(spoke.Finalizers, v1beta1.HubCleanupFinalizer) { + setDefaults(ctx, spoke, hubMeta) + spoke.Finalizers = append( + spoke.Finalizers, + v1beta1.HubCleanupPreflightFinalizer, // removed by the hub to signal to the spoke that preflight is completed + v1beta1.SpokeCleanupFinalizer, // removed by the hub after successful unjoin + v1beta1.HubCleanupFinalizer, // removed by the hub after post-unjoin cleanup is finished + ) + } case v1beta1.InstanceTypeAgent: if !slices.Contains(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) && spoke.DeletionTimestamp.IsZero() { spoke.Finalizers = append(spoke.Finalizers, v1beta1.SpokeCleanupFinalizer) // removed by the spoke to signal to the hub that unjoin succeeded diff --git a/fleetconfig-controller/internal/webhook/v1beta1/spoke_webhook.go b/fleetconfig-controller/internal/webhook/v1beta1/spoke_webhook.go index 61360b5f..c142ab92 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/spoke_webhook.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/spoke_webhook.go @@ -41,7 +41,7 @@ import ( var spokelog = logf.Log.WithName("spoke-resource") // SetupSpokeWebhookWithManager registers the webhook for Spoke in the manager. 
-func SetupSpokeWebhookWithManager(mgr ctrl.Manager) error { +func SetupSpokeWebhookWithManager(mgr ctrl.Manager, instanceType string) error { kubeconfig, err := kube.RawFromInClusterRestConfig() if err != nil { return err @@ -51,7 +51,7 @@ func SetupSpokeWebhookWithManager(mgr ctrl.Manager) error { return err } return ctrl.NewWebhookManagedBy(mgr).For(&v1beta1.Spoke{}). - WithValidator(&SpokeCustomValidator{client: mgr.GetClient(), addonC: addonC}). + WithValidator(&SpokeCustomValidator{client: mgr.GetClient(), addonC: addonC, instanceType: instanceType}). Complete() } @@ -65,8 +65,9 @@ func SetupSpokeWebhookWithManager(mgr ctrl.Manager) error { // NOTE: The +kubebuilder:object:generate=false marker prevents controller-gen from generating DeepCopy methods, // as this struct is used only for temporary operations and does not need to be deeply copied. type SpokeCustomValidator struct { - client client.Client - addonC *versioned.Clientset + client client.Client + addonC *versioned.Clientset + instanceType string } var _ webhook.CustomValidator = &SpokeCustomValidator{} @@ -102,7 +103,7 @@ func (v *SpokeCustomValidator) ValidateCreate(ctx context.Context, obj runtime.O ) } - warn, errs := validateAddons(ctx, v.client, spoke, v.addonC) + warn, errs := v.validateAddons(ctx, v.client, spoke) allErrs = append(allErrs, errs...) if len(allErrs) > 0 { @@ -137,7 +138,7 @@ func (v *SpokeCustomValidator) ValidateUpdate(ctx context.Context, oldObj, newOb ) } - warn, valErrs := validateAddons(ctx, v.client, spoke, v.addonC) + warn, valErrs := v.validateAddons(ctx, v.client, spoke) allErrs = append(allErrs, valErrs...) if len(allErrs) > 0 { diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index 7ea2312d..a23b6ff6 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -364,7 +364,7 @@ func validateAddonNotInUse(ctx context.Context, removedAddons []string, fieldPat } // validates that any addon which is enabled on a spoke is configured -func validateAddons(ctx context.Context, cli client.Client, newObject *v1beta1.Spoke, addonC *versioned.Clientset) (admission.Warnings, field.ErrorList) { +func (v *SpokeCustomValidator) validateAddons(ctx context.Context, cli client.Client, newObject *v1beta1.Spoke) (admission.Warnings, field.ErrorList) { errs := field.ErrorList{} if newObject.IsHubAsSpoke() { @@ -373,7 +373,7 @@ func validateAddons(ctx context.Context, cli client.Client, newObject *v1beta1.S }) { errs = append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "hub-as-spoke Spoke cannot enable fleetconfig-controller-agent addon")) } - } else { + } else if v.instanceType != v1beta1.InstanceTypeUnified { // fcc-agent MUST be enabled when using manager-agent (addon), MUST NOT be enabled when using unified mode if !slices.ContainsFunc(newObject.Spec.AddOns, func(a v1beta1.AddOn) bool { return a.ConfigName == v1beta1.FCCAddOnName }) { @@ -397,7 +397,7 @@ func validateAddons(ctx context.Context, cli client.Client, newObject *v1beta1.S return admission.Warnings{warnHubNotFound}, errs } - cmaList, err := addonC.AddonV1alpha1().ClusterManagementAddOns().List(ctx, metav1.ListOptions{}) + cmaList, err := v.addonC.AddonV1alpha1().ClusterManagementAddOns().List(ctx, metav1.ListOptions{}) if err != nil { errs = append(errs, field.InternalError(field.NewPath("spec").Child("addOns"), err)) return nil, errs diff 
--git a/fleetconfig-controller/internal/webhook/v1beta1/webhook_suite_test.go b/fleetconfig-controller/internal/webhook/v1beta1/webhook_suite_test.go index f659d8d6..5bdcf82a 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/webhook_suite_test.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/webhook_suite_test.go @@ -39,7 +39,7 @@ import ( metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" - fleetconfigopenclustermanagementiov1beta1 "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/kube" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/test" @@ -76,7 +76,7 @@ var _ = BeforeSuite(func() { ctx, cancel = context.WithCancel(context.TODO()) var err error - err = fleetconfigopenclustermanagementiov1beta1.AddToScheme(scheme.Scheme) + err = v1beta1.AddToScheme(scheme.Scheme) Expect(err).NotTo(HaveOccurred()) // +kubebuilder:scaffold:scheme @@ -133,7 +133,7 @@ var _ = BeforeSuite(func() { Expect(os.Setenv("KUBECONFIG", kubeconfigPath)).To(Succeed()) logf.Log.Info("Kubeconfig", "path", kubeconfigPath) - err = SetupSpokeWebhookWithManager(mgr) + err = SetupSpokeWebhookWithManager(mgr, v1beta1.InstanceTypeManager) Expect(err).NotTo(HaveOccurred()) err = SetupHubWebhookWithManager(mgr) From f7bc644ff126aaece0e886000ad8cbf5041bb9d9 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Wed, 1 Oct 2025 20:53:52 -0700 Subject: [PATCH 27/62] chore: always use base image for agent Signed-off-by: Artur Shad Nik --- .../fleetconfig-controller/templates/_helpers.tpl | 14 +++++++++++++- .../templates/ocm/fcc-addon/addon-template.yaml | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl b/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl index 48cf038e..78f1cad3 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl @@ -95,6 +95,18 @@ Get the Kubernetes provider {{- end -}} {{- end -}} +{{/* +Format the image name and tag for the given provider. +For managed kubernetes providers, the image tag is suffixed with the provider name. +These images are bundled with provider-specific auth binaries. +For generic kubernetes providers, the image tag is used as is. +This image has no additional binaries bundled, other than clusteradm. +*/}} +{{- define "controller.baseImage" -}} +{{- printf "%s%s:%s" .Values.imageRegistry .Values.image.repository .Values.image.tag -}} +{{- end -}} + + {{/* Format the image name and tag for the given provider. For managed kubernetes providers, the image tag is suffixed with the provider name. @@ -103,7 +115,7 @@ For generic kubernetes providers, the image tag is used as is. This image has no additional binaries bundled, other than clusteradm. */}} {{- define "controller.image" -}} -{{- $baseImage := printf "%s%s:%s" .Values.imageRegistry .Values.image.repository .Values.image.tag -}} +{{- $baseImage := include "controller.baseImage" . -}} {{- $provider := include "kubernetesProvider" . 
-}} {{- if eq $provider "eks" -}} {{- printf "%s-%s" $baseImage $provider -}} diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml index e91e3dac..04fd77b2 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -63,7 +63,7 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace - image: {{ include "controller.image" . }} + image: {{ include "controller.baseImage" . }} imagePullPolicy: {{ quote .Values.image.pullPolicy }} name: manager resources: From ec08d1ee36aae340c29cb79b46eeb8cf23e4f936 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 11:56:05 -0700 Subject: [PATCH 28/62] docs: update dev guide Signed-off-by: Artur Shad Nik --- fleetconfig-controller/README.md | 85 ++++++++++++++++--- .../templates/deployment-dev.yaml | 6 ++ .../config/devspace/spoke/manager.yaml | 4 +- fleetconfig-controller/devspace-start-hub.sh | 2 +- .../devspace-start-spoke.sh | 2 +- fleetconfig-controller/hack/dev/spoke.yaml | 30 +++++++ 6 files changed, 114 insertions(+), 15 deletions(-) create mode 100644 fleetconfig-controller/hack/dev/spoke.yaml diff --git a/fleetconfig-controller/README.md b/fleetconfig-controller/README.md index ce815076..986a9673 100644 --- a/fleetconfig-controller/README.md +++ b/fleetconfig-controller/README.md @@ -2,10 +2,16 @@ ## 🌱 Project Overview -The `fleetconfig-controller` introduces a new `FleetConfig` custom resource to the OCM ecosystem. It reconciles `FleetConfig` resources to declaratively manage the lifecycle of Open Cluster Management (OCM) multi-clusters. The `fleetconfig-controller` will initialize an OCM hub and one or more spoke clusters; add, remove, and upgrade clustermanagers and klusterlets when their bundle versions change, manage their feature gates, and uninstall all OCM components properly whenever a `FleetConfig` is deleted. +The `fleetconfig-controller` introduces 2 new custom resource to the OCM ecosystem: `Hub` and `Spoke` . It reconciles `Hub` and `Spoke` resources to declaratively manage the lifecycle of Open Cluster Management (OCM) multi-clusters. The `fleetconfig-controller` will initialize an OCM hub and one or more spoke clusters; add, remove, and upgrade clustermanagers and klusterlets when their bundle versions change, manage their feature gates, and uninstall all OCM components properly whenever a `Hub` or `Spoke`s are deleted. The controller is a lightweight wrapper around [clusteradm](https://github.com/open-cluster-management-io/clusteradm). Anything you can accomplish imperatively via a series of `clusteradm` commands can now be accomplished declaratively using the `fleetconfig-controller`. +`fleetconfig-controller` supports 2 modes of operation: +- `addonMode: true` (recommended): After the initial join, a `fleetconfig-controller-agent` will be installed on the spoke cluster as an OCM addon. Once installed, the agent will manage all day 2 operations for the spoke cluster asyncronously. For more information about addon mode, see [2-phase-spoke-reconcile.md](./docs/2-phase-spoke-reconcile.md). +- `addonMode: false`: All management of all spokes is done from the hub cluster. No agent is installed on the spoke cluster. Currently, this is the only mode supported for EKS. 
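A minimal values override that selects the mode might look like the following; it assumes the chart value is `fleetConfig.addonMode`, matching the `addonMode: ${ADDON_MODE}` wiring in `devspace.yaml`, and is illustrative rather than a complete values file.

```yaml
# values-addon-mode.yaml (illustrative override, not shipped with the chart)
fleetConfig:
  enabled: true
  addonMode: true  # set to false to manage all spokes directly from the hub (currently the only option for EKS)
```

Passing such a file with `-f values-addon-mode.yaml` on the `helm install` command in the Installation section below would select the recommended addon mode.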
+ +For the deprecated `v1alpha1` `FleetConfig` API, addon mode is not supported. + ## 🔧 Installation The controller is installed via Helm. @@ -16,18 +22,18 @@ helm repo update ocm helm install fleetconfig-controller ocm/fleetconfig-controller -n fleetconfig-system --create-namespace ``` -By default the Helm chart will also produce a `FleetConfig` to orchestrate, however that behaviour can be disabled. Refer to the chart [README](./charts/fleetconfig-controller/README.md) for full documentation. +By default the Helm chart will also produce a `Hub` and 1 `Spoke` (`hub-as-spoke`) to orchestrate, however that behaviour can be disabled. Refer to the chart [README](./charts/fleetconfig-controller/README.md) for full documentation. ## 🏗️ Support Matrix Support for orchestration of OCM multi-clusters varies based on the Kubernetes distribution and/or cloud provider. -| Kubernetes Distribution | Support Level | -|-------------------------|--------------------| -| Vanilla Kubernetes | ✅ Fully Supported | -| Amazon EKS | ✅ Fully Supported | -| Google GKE | ✅ Fully Supported | -| Azure AKS | 🚧 On Roadmap | +| Kubernetes Distribution | Support Level | +|-------------------------|---------------------------------------| +| Vanilla Kubernetes | ✅ Fully Supported | +| Amazon EKS | ✅ Fully Supported (addonMode: false) | +| Google GKE | ✅ Fully Supported | +| Azure AKS | 🚧 On Roadmap | ## 🏃🏼‍♂️ Quick Start @@ -43,7 +49,7 @@ Support for orchestration of OCM multi-clusters varies based on the Kubernetes d To familiarize yourself with the `Hub` and `Spoke` APIs and the `fleetconfig-controller`, we recommend doing one or more of the following onboarding steps. 1. Step through a [smoke test](./docs/smoketests.md) -1. Invoke the [end-to-end tests](./test/e2e/fleetconfig.go) and inspect the content of the kind clusters that the E2E suite automatically creates +1. Invoke the [end-to-end tests](./test/e2e/v1beta1_hub_spoke.go) and inspect the content of the kind clusters that the E2E suite automatically creates ```bash SKIP_CLEANUP=true make test-e2e @@ -53,6 +59,7 @@ To familiarize yourself with the `Hub` and `Spoke` APIs and the `fleetconfig-con The `fleetconfig-controller` repository is pre-wired for development using [DevSpace](https://www.devspace.sh/docs/getting-started/introduction). +### Single cluster (Hub and `hub-as-spoke` Spoke development) ```bash # Create a dev kind cluster kind create cluster \ @@ -64,18 +71,58 @@ export KUBECONFIG=~/Downloads/fleetconfig-dev.kubeconfig # Initialize a devspace development container devspace run-pipeline dev -n fleetconfig-system ``` +See [Debugging](#debugging) for instructions on how to start the fleetconfig controller manager in debug mode. + +### Two clusters (Hub and Spoke development) +```bash +# Create two dev kind clusters +kind create cluster \ + --name fleetconfig-dev-hub \ + --kubeconfig ~/Downloads/fleetconfig-dev-hub.kubeconfig +export KUBECONFIG=~/Downloads/fleetconfig-dev-hub.kubeconfig + +kind create cluster \ + --name fleetconfig-dev-spoke \ + --kubeconfig ~/Downloads/fleetconfig-dev-spoke.kubeconfig + +# Get the spoke kind cluster's internal kubeconfig +kind get kubeconfig --name fleetconfig-dev-spoke --internal > ~/Downloads/fleetconfig-dev-spoke-internal.kubeconfig + +# Initialize a devspace development container. This will bootstrap in hub-as-spoke mode. +devspace run-pipeline dev --namespace fleetconfig-system --force-build +``` +See [Debugging](#debugging) for instructions on how to start the fleetconfig controller manager in debug mode. 
+ +In a new terminal session, execute the following commands to create a Spoke resource and start the fleetconfig controller agent on the spoke cluster. + +```bash +# Create a secret containing the spoke cluster kubeconfig +export KUBECONFIG=~/Downloads/fleetconfig-dev-hub.kubeconfig +kubectl --namespace fleetconfig-system create secret generic spoke-kubeconfig \ + --from-file=value= + +# Create a minimal Spoke resource +kubectl apply -f hack/dev/spoke.yaml + +# Once fleetconfig-controller-agent is created on the spoke cluster, start the debug session +export KUBECONFIG=~/Downloads/fleetconfig-dev-spoke.kubeconfig +devspace run-pipeline debug-spoke --namespace fleetconfig-system --force-build --profile v1alpha1 +``` +The `--profile v1alpha1` flag disables installing the default Hub and Spoke resources. + +See [Debugging](#debugging) for instructions on how to start the fleetconfig controller agent in debug mode. ### Debugging - Hit up arrow, then enter from within the dev container to start a headless delve session -- Use the following launch config to connect VSCode with the delve session running in the dev container: +- Use one of the following launch configs to connect VSCode with the delve session running in the dev container: ```json { "version": "0.2.0", "configurations": [ { - "name": "DevSpace", + "name": "DevSpace - Hub", "type": "go", "request": "attach", "mode": "remote", @@ -89,6 +136,22 @@ devspace run-pipeline dev -n fleetconfig-system ], "showLog": true, // "trace": "verbose", // useful for debugging delve (breakpoints not working, etc.) + }, + { + "name": "DevSpace - Spoke", + "type": "go", + "request": "attach", + "mode": "remote", + "port": 2345, + "host": "127.0.0.1", + "substitutePath": [ + { + "from": "${workspaceFolder}/fleetconfig-controller", + "to": "/workspace", + } + ], + "showLog": true, + // "trace": "verbose", // useful for debugging delve (breakpoints not working, etc.) } ] } diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment-dev.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment-dev.yaml index 8735aa87..270d1113 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment-dev.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment-dev.yaml @@ -34,6 +34,12 @@ spec: env: - name: KUBERNETES_CLUSTER_DOMAIN value: {{ quote .Values.kubernetesClusterDomain }} + - name: CONTROLLER_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: ROLE_NAME + value: {{ include "chart.fullname" . 
}}-manager-role ports: - containerPort: {{ .Values.webhookService.port }} name: webhook-server diff --git a/fleetconfig-controller/config/devspace/spoke/manager.yaml b/fleetconfig-controller/config/devspace/spoke/manager.yaml index 2d957d07..ffd9461f 100644 --- a/fleetconfig-controller/config/devspace/spoke/manager.yaml +++ b/fleetconfig-controller/config/devspace/spoke/manager.yaml @@ -25,11 +25,11 @@ spec: - name: HUB_KUBECONFIG value: /managed/hub-kubeconfig/kubeconfig - name: CLUSTER_NAME - value: spoke-1 + value: spoke - name: INSTALL_NAMESPACE value: fleetconfig-system - name: HUB_NAMESPACE - value: default + value: fleetconfig-system - name: CONTROLLER_NAMESPACE valueFrom: fieldRef: diff --git a/fleetconfig-controller/devspace-start-hub.sh b/fleetconfig-controller/devspace-start-hub.sh index bcedb86e..bab31671 100755 --- a/fleetconfig-controller/devspace-start-hub.sh +++ b/fleetconfig-controller/devspace-start-hub.sh @@ -5,7 +5,7 @@ COLOR_CYAN="\033[0;36m" COLOR_RESET="\033[0m" export CGO_ENABLED=0 -FLAGS="--use-webhook=true --webhook-port=9443 --webhook-cert-dir=/etc/k8s-webhook-certs" +FLAGS="--use-webhook=true --webhook-port=9443 --webhook-cert-dir=/etc/k8s-webhook-certs --instance-type=manager" RUN_CMD="go run ./cmd/main.go $FLAGS" DEBUG_CMD="dlv debug ./cmd/main.go --listen=0.0.0.0:2344 --api-version=2 --output /tmp/__debug_bin --headless -- $FLAGS" diff --git a/fleetconfig-controller/devspace-start-spoke.sh b/fleetconfig-controller/devspace-start-spoke.sh index e2059cd8..191914ae 100755 --- a/fleetconfig-controller/devspace-start-spoke.sh +++ b/fleetconfig-controller/devspace-start-spoke.sh @@ -5,7 +5,7 @@ COLOR_CYAN="\033[0;36m" COLOR_RESET="\033[0m" export CGO_ENABLED=0 -FLAGS="--cluster-type=spoke --spoke-concurrent-reconciles=1" +FLAGS="--instance-type=agent --spoke-concurrent-reconciles=1" RUN_CMD="go run ./cmd/main.go $FLAGS" DEBUG_CMD="dlv debug ./cmd/main.go --listen=0.0.0.0:2345 --api-version=2 --output /tmp/__debug_bin --headless -- $FLAGS" diff --git a/fleetconfig-controller/hack/dev/spoke.yaml b/fleetconfig-controller/hack/dev/spoke.yaml new file mode 100644 index 00000000..61424706 --- /dev/null +++ b/fleetconfig-controller/hack/dev/spoke.yaml @@ -0,0 +1,30 @@ +apiVersion: fleetconfig.open-cluster-management.io/v1beta1 +kind: Spoke +metadata: + name: spoke + namespace: fleetconfig-system +spec: + hubRef: + name: hub + namespace: fleetconfig-system + addOns: + - configName: fleetconfig-controller-agent + createNamespace: true + syncLabels: false + kubeconfig: + context: "" + inCluster: false + secretReference: + kubeconfigKey: value + name: spoke-kubeconfig + proxyCa: "" + proxyUrl: "" + klusterlet: + annotations: + foo: "bar" + mode: "Default" + purgeOperator: true + featureGates: "ClusterClaim=true,RawFeedbackJsonString=true" + forceInternalEndpointLookup: true + forceInternalEndpointLookupManaged: false + singleton: false \ No newline at end of file From bbdfc36f35de81e176ca35c15da06be7d1550000 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 12:00:39 -0700 Subject: [PATCH 29/62] chore: guard against unset env vars Signed-off-by: Artur Shad Nik --- fleetconfig-controller/cmd/manager/manager.go | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/fleetconfig-controller/cmd/manager/manager.go b/fleetconfig-controller/cmd/manager/manager.go index 33ddedd3..e00f7370 100644 --- a/fleetconfig-controller/cmd/manager/manager.go +++ b/fleetconfig-controller/cmd/manager/manager.go @@ -149,6 +149,27 @@ func 
ForHub(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { func ForSpoke(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { _, tlsOpts := setupServer(opts, setupLog) + var err error + + // Verify that all required environment variables have been set + spokeNamespace := os.Getenv(apiv1beta1.SpokeNamespaceEnvVar) + if spokeNamespace == "" { + err = fmt.Errorf("%s environment variable must be set", apiv1beta1.SpokeNamespaceEnvVar) + setupLog.Error(err, "unable to create controller", "controller", "Spoke") + return nil, err + } + hubNamespace := os.Getenv(apiv1beta1.HubNamespaceEnvVar) + if hubNamespace == "" { + err = fmt.Errorf("%s environment variable must be set", apiv1beta1.HubNamespaceEnvVar) + setupLog.Error(err, "unable to create controller", "controller", "Spoke") + return nil, err + } + ctrlNamespace := os.Getenv(apiv1beta1.ControllerNamespaceEnvVar) + if ctrlNamespace == "" { + err = fmt.Errorf("%s environment variable must be set", apiv1beta1.ControllerNamespaceEnvVar) + setupLog.Error(err, "unable to create controller", "controller", "Spoke") + return nil, err + } // enables watching resources in the hub cluster hubRestCfg, err := getHubRestConfig() if err != nil { @@ -163,19 +184,6 @@ func ForSpoke(setupLog logr.Logger, opts Options) (ctrl.Manager, error) { return nil, err } - spokeNamespace := os.Getenv(apiv1beta1.SpokeNamespaceEnvVar) - if spokeNamespace == "" { - err = fmt.Errorf("CLUSTER_NAMESPACE environment variable must be set") - setupLog.Error(err, "unable to create controller", "controller", "Spoke") - return nil, err - } - hubNamespace := os.Getenv(apiv1beta1.HubNamespaceEnvVar) - if hubNamespace == "" { - err = fmt.Errorf("HUB_NAMESPACE environment variable must be set") - setupLog.Error(err, "unable to create controller", "controller", "Spoke") - return nil, err - } - mgr, err := ctrl.NewManager(hubRestCfg, ctrl.Options{ Scheme: opts.Scheme, Metrics: metricsserver.Options{ From f2a234ce8dd7aea8535acb6c543b32ef8da035b9 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 12:24:43 -0700 Subject: [PATCH 30/62] docs: typo Signed-off-by: Artur Shad Nik --- fleetconfig-controller/README.md | 4 ++-- .../charts/fleetconfig-controller/templates/_helpers.tpl | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/fleetconfig-controller/README.md b/fleetconfig-controller/README.md index 986a9673..eabdbc5f 100644 --- a/fleetconfig-controller/README.md +++ b/fleetconfig-controller/README.md @@ -2,12 +2,12 @@ ## 🌱 Project Overview -The `fleetconfig-controller` introduces 2 new custom resource to the OCM ecosystem: `Hub` and `Spoke` . It reconciles `Hub` and `Spoke` resources to declaratively manage the lifecycle of Open Cluster Management (OCM) multi-clusters. The `fleetconfig-controller` will initialize an OCM hub and one or more spoke clusters; add, remove, and upgrade clustermanagers and klusterlets when their bundle versions change, manage their feature gates, and uninstall all OCM components properly whenever a `Hub` or `Spoke`s are deleted. +The `fleetconfig-controller` introduces 2 new custom resources to the OCM ecosystem: `Hub` and `Spoke` . It reconciles `Hub` and `Spoke` resources to declaratively manage the lifecycle of Open Cluster Management (OCM) multi-clusters. 
The `fleetconfig-controller` will initialize an OCM hub and one or more spoke clusters; add, remove, and upgrade clustermanagers and klusterlets when their bundle versions change, manage their feature gates, and uninstall all OCM components properly whenever a `Hub` or `Spoke`s are deleted. The controller is a lightweight wrapper around [clusteradm](https://github.com/open-cluster-management-io/clusteradm). Anything you can accomplish imperatively via a series of `clusteradm` commands can now be accomplished declaratively using the `fleetconfig-controller`. `fleetconfig-controller` supports 2 modes of operation: -- `addonMode: true` (recommended): After the initial join, a `fleetconfig-controller-agent` will be installed on the spoke cluster as an OCM addon. Once installed, the agent will manage all day 2 operations for the spoke cluster asyncronously. For more information about addon mode, see [2-phase-spoke-reconcile.md](./docs/2-phase-spoke-reconcile.md). +- `addonMode: true` (recommended): After the initial join, a `fleetconfig-controller-agent` will be installed on the spoke cluster as an OCM addon. Once installed, the agent will manage all day 2 operations for the spoke cluster asynchronously. For more information about addon mode, see [2-phase-spoke-reconcile.md](./docs/2-phase-spoke-reconcile.md). - `addonMode: false`: All management of all spokes is done from the hub cluster. No agent is installed on the spoke cluster. Currently, this is the only mode supported for EKS. For the deprecated `v1alpha1` `FleetConfig` API, addon mode is not supported. diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl b/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl index 78f1cad3..c4f9d718 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/_helpers.tpl @@ -96,11 +96,7 @@ Get the Kubernetes provider {{- end -}} {{/* -Format the image name and tag for the given provider. -For managed kubernetes providers, the image tag is suffixed with the provider name. -These images are bundled with provider-specific auth binaries. -For generic kubernetes providers, the image tag is used as is. -This image has no additional binaries bundled, other than clusteradm. +Build the base controller image string from registry, repository, and tag. */}} {{- define "controller.baseImage" -}} {{- printf "%s%s:%s" .Values.imageRegistry .Values.image.repository .Values.image.tag -}} From a5560da40f4fb7877ccec3df8f2a732bc63969a3 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 12:28:04 -0700 Subject: [PATCH 31/62] chore: bump image version Signed-off-by: Artur Shad Nik --- fleetconfig-controller/charts/fleetconfig-controller/README.md | 2 +- .../charts/fleetconfig-controller/values.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/README.md b/fleetconfig-controller/charts/fleetconfig-controller/README.md index 2a82b62d..214a7cc2 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/README.md +++ b/fleetconfig-controller/charts/fleetconfig-controller/README.md @@ -157,7 +157,7 @@ Resource specifications for all klusterlet-managed containers. 
| `replicas` | fleetconfig-controller replica count | `1` | | `imageRegistry` | Image registry | `""` | | `image.repository` | Image repository | `quay.io/open-cluster-management/fleetconfig-controller` | -| `image.tag` | Image tag | `v0.0.14` | +| `image.tag` | Image tag | `v0.1.0` | | `image.pullPolicy` | Image pull policy | `IfNotPresent` | | `imagePullSecrets` | Image pull secrets | `[]` | | `serviceAccount.annotations` | Annotations to add to the service account | `{}` | diff --git a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml index 70ef64e6..d858992b 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml @@ -290,7 +290,7 @@ imageRegistry: "" ## @param image.pullPolicy Image pull policy image: repository: quay.io/open-cluster-management/fleetconfig-controller - tag: v0.0.14 + tag: v0.1.0 pullPolicy: IfNotPresent ## @param imagePullSecrets Image pull secrets From a1480b78de8f0b2cf9f3da24ca922a73a2622c45 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 13:49:23 -0700 Subject: [PATCH 32/62] docs: typo Signed-off-by: Artur Shad Nik --- fleetconfig-controller/charts/fleetconfig-controller/README.md | 2 +- .../charts/fleetconfig-controller/values.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/README.md b/fleetconfig-controller/charts/fleetconfig-controller/README.md index 214a7cc2..827b0785 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/README.md +++ b/fleetconfig-controller/charts/fleetconfig-controller/README.md @@ -80,7 +80,7 @@ Resource specifications for all klusterlet-managed containers. | `fleetConfig.registrationAuth.driver` | The authentication driver to use (default: "csr"). Set to "awsirsa" to use AWS IAM Roles for Service Accounts (IRSA) for EKS FleetConfigs. | `csr` | | `fleetConfig.registrationAuth.hubClusterARN` | The ARN of the hub cluster. This is only required if configuring an EKS FleetConfig. Example: "arn:aws:eks:us-west-2::cluster/". | `""` | | `fleetConfig.registrationAuth.autoApprovedARNPatterns` | Optional list of spoke cluster name ARN patterns that the hub will auto-approve. | `[]` | -| `fleetConfig.hub.name` | Name of the Hub resource which will manage the spoke cluster. | `hub` | +| `fleetConfig.hub.name` | Name of the Hub resource which will manage the spoke clusters. | `hub` | | `fleetConfig.hub.addOnConfigs` | Global add-on configuration for the hub cluster. | `[]` | | `fleetConfig.hub.hubAddOns` | Built-in add-on configuration for the hub cluster. | `[]` | | `fleetConfig.hub.clusterManager.enabled` | Whether to enable the cluster manager. Set to false if using Singleton Control Plane. | `true` | diff --git a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml index d858992b..dc81ca4f 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml @@ -60,7 +60,7 @@ fleetConfig: autoApprovedARNPatterns: [] ## Configuration for the Hub cluster. hub: - ## @param fleetConfig.hub.name Name of the Hub resource which will manage the spoke cluster. + ## @param fleetConfig.hub.name Name of the Hub resource which will manage the spoke clusters. 
name: hub ## @param fleetConfig.hub.addOnConfigs Global add-on configuration for the hub cluster. addOnConfigs: [] From 97b6607ae0691acdb64536ced7f3bd71ca0a15da Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 15:30:11 -0700 Subject: [PATCH 33/62] feat: manager controls namespace lifecycle Signed-off-by: Artur Shad Nik --- .../api/v1beta1/spoke_types.go | 5 +++ ...fig.open-cluster-management.io_spokes.yaml | 6 ++++ .../ocm/fcc-addon/addon-template.yaml | 4 --- .../controller/v1beta1/spoke_handler.go | 31 +++++++++++++++++++ .../internal/webhook/v1beta1/validation.go | 5 ++- 5 files changed, 46 insertions(+), 5 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/spoke_types.go b/fleetconfig-controller/api/v1beta1/spoke_types.go index 142cb0eb..c91556e5 100644 --- a/fleetconfig-controller/api/v1beta1/spoke_types.go +++ b/fleetconfig-controller/api/v1beta1/spoke_types.go @@ -80,6 +80,11 @@ type SpokeSpec struct { // +kubebuilder:default:=0 // +optional LogVerbosity int `json:"logVerbosity,omitempty"` + + // PurgeAgentNamespace. If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. + // +kubebuilder:default:=false + // +optional + PurgeAgentNamespace bool `json:"purgeAgentNamespace,omitempty"` } // HubRef is the information required to get a Hub resource. diff --git a/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml b/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml index 9bba4b41..c9fd2a69 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml @@ -2281,6 +2281,12 @@ spec: description: URL of a forward proxy server used by agents to connect to the Hub cluster. type: string + purgeAgentNamespace: + default: false + description: PurgeAgentNamespace. If true, the agent will attempt + to garbage collect it's own namespace after the spoke cluster is + unjoined. + type: boolean syncLabels: description: If true, sync the labels from klusterlet to all agent resources. 
diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml index 04fd77b2..d55ffb02 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -9,10 +9,6 @@ spec: agentSpec: workload: manifests: - - kind: Namespace - apiVersion: v1 - metadata: - name: {{ .Release.Namespace }} - kind: Deployment apiVersion: apps/v1 metadata: diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 328bab3a..bbd09892 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -171,6 +171,33 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h logger.Error(err, "failed to get managedCluster after join", "spoke", spoke.Name) return err } + + // precreate the namespace that the agent will be installed into + // this prevents it from being automatically garbage collected when the spoke is deregistered + if r.InstanceType != v1beta1.InstanceTypeUnified { + spokeRestCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) + if err != nil { + logger.Error(err, "failed to create agent namespace", "spoke", spoke.Name) + return err + } + spokeCli, err := client.New(spokeRestCfg, client.Options{}) + if err != nil { + logger.Error(err, "failed to create agent namespace", "spoke", spoke.Name) + return err + } + agentNamespace := os.Getenv(v1beta1.ControllerNamespaceEnvVar) // manager.go enforces that this is not "" + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: agentNamespace, + }, + } + err = spokeCli.Create(ctx, ns) + if err != nil && !kerrs.IsNotFound(err) { + logger.Error(err, "failed to create agent namespace", "spoke", spoke.Name) + return err + } + } + } // check managed clusters joined condition @@ -491,6 +518,10 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo "open-cluster-management-agent-addon", "open-cluster-management", } + if spoke.Spec.PurgeAgentNamespace { + agentNamespace := os.Getenv(v1beta1.ControllerNamespaceEnvVar) // manager.go enforces that this is not "" + namespacesToDelete = append(namespacesToDelete, agentNamespace) + } restCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) if err != nil { diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index a23b6ff6..79363b0b 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -23,7 +23,7 @@ import ( const ( warnHubNotFound = "hub not found, cannot validate spoke addons" - errAllowedSpokeUpdate = "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.klusterlet.valuesFrom, spec.kubeconfig, spec.addOns, spec.timeout, and spec.logVerbosity are allowed when updating a spoke" + errAllowedSpokeUpdate = "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.klusterlet.valuesFrom, spec.kubeconfig, spec.addOns, spec.purgeAgentNamespace, spec.timeout, and 
spec.logVerbosity are allowed when updating a spoke" errAllowedHubUpdate = "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub" ) @@ -104,6 +104,7 @@ func allowHubUpdate(oldHub, newHub *v1beta1.Hub) error { // - spec.addOns // - spec.timeout // - spec.logVerbosity +// - spec.purgeAgentNamespace func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { if !reflect.DeepEqual(newSpoke.Spec, oldSpoke.Spec) { oldSpokeCopy := oldSpoke.Spec.DeepCopy() @@ -122,6 +123,8 @@ func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { newSpokeCopy.LogVerbosity = 0 oldSpokeCopy.Timeout = 0 newSpokeCopy.Timeout = 0 + oldSpokeCopy.PurgeAgentNamespace = false + newSpokeCopy.PurgeAgentNamespace = false if !reflect.DeepEqual(oldSpokeCopy, newSpokeCopy) { return errors.New(errAllowedSpokeUpdate) From 13ced7a4b4ea9dd615ce3ea38c83601ecd22a8d9 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 15:44:26 -0700 Subject: [PATCH 34/62] test: update test values Signed-off-by: Artur Shad Nik --- fleetconfig-controller/test/data/fleetconfig-values.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/fleetconfig-controller/test/data/fleetconfig-values.yaml b/fleetconfig-controller/test/data/fleetconfig-values.yaml index 90296a1a..89a9d767 100644 --- a/fleetconfig-controller/test/data/fleetconfig-values.yaml +++ b/fleetconfig-controller/test/data/fleetconfig-values.yaml @@ -42,6 +42,7 @@ fleetConfig: - configName: fleetconfig-controller-agent createNamespace: true syncLabels: false + purgeAgentNamespace: true kubeconfig: # secret is provisioned during E2E test setup secretReference: From af51379a2c702842ee6f5ab8aeb799ac2b4e83ff Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 16:54:22 -0700 Subject: [PATCH 35/62] feat: conditional secret purge; update chart Signed-off-by: Artur Shad Nik --- .../api/v1beta1/spoke_types.go | 27 ++++-- .../api/v1beta1/zz_generated.deepcopy.go | 16 ++++ .../charts/fleetconfig-controller/README.md | 4 +- ...fig.open-cluster-management.io_spokes.yaml | 35 +++++--- .../templates/fleetconfig.yaml | 5 +- .../charts/fleetconfig-controller/values.yaml | 9 +- .../controller/v1beta1/spoke_handler.go | 84 +++++++++++++------ .../internal/webhook/v1beta1/validation.go | 11 ++- .../test/data/fleetconfig-values.yaml | 5 +- .../test/e2e/v1beta1_hub_spoke.go | 15 ++-- 10 files changed, 150 insertions(+), 61 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/spoke_types.go b/fleetconfig-controller/api/v1beta1/spoke_types.go index c91556e5..80565b42 100644 --- a/fleetconfig-controller/api/v1beta1/spoke_types.go +++ b/fleetconfig-controller/api/v1beta1/spoke_types.go @@ -81,10 +81,29 @@ type SpokeSpec struct { // +optional LogVerbosity int `json:"logVerbosity,omitempty"` - // PurgeAgentNamespace. If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. + // CleanupConfig is used to configure which resources should be automatically garbage collected during cleanup. + // +kubebuilder:default:={} + // +required + CleanupConfig CleanupConfig `json:"cleanupConfig,omitzero"` +} + +// CleanupConfig is the configuration for cleaning up resources during Spoke cleanup. +type CleanupConfig struct { + // If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. 
// +kubebuilder:default:=false // +optional PurgeAgentNamespace bool `json:"purgeAgentNamespace,omitempty"` + + // If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted + // when the klusterlet is unjoined from its Hub cluster. + // +kubebuilder:default:=true + // +optional + PurgeKlusterletOperator bool `json:"purgeKlusterletOperator,omitempty"` + + // If set, the kubeconfig secret will will be automatically deleted after the agent has taken over managing the Spoke. + // +kubebuilder:default:=false + // +optional + PurgeKubeconfigSecret bool `json:"purgeKubeconfigSecret,omitempty"` } // HubRef is the information required to get a Hub resource. @@ -144,12 +163,6 @@ type Klusterlet struct { // +optional Mode string `json:"mode,omitempty"` - // If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted - // when the klusterlet is unjoined from its Hub cluster. - // +kubebuilder:default:=true - // +optional - PurgeOperator bool `json:"purgeOperator,omitempty"` - // If true, the installed klusterlet agent will start the cluster registration process by looking for the // internal endpoint from the public cluster-info in the Hub cluster instead of using hubApiServer. // +optional diff --git a/fleetconfig-controller/api/v1beta1/zz_generated.deepcopy.go b/fleetconfig-controller/api/v1beta1/zz_generated.deepcopy.go index 50eda6ef..197a8809 100644 --- a/fleetconfig-controller/api/v1beta1/zz_generated.deepcopy.go +++ b/fleetconfig-controller/api/v1beta1/zz_generated.deepcopy.go @@ -61,6 +61,21 @@ func (in *AddOnConfig) DeepCopy() *AddOnConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CleanupConfig) DeepCopyInto(out *CleanupConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CleanupConfig. +func (in *CleanupConfig) DeepCopy() *CleanupConfig { + if in == nil { + return nil + } + out := new(CleanupConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ClusterManager) DeepCopyInto(out *ClusterManager) { *out = *in @@ -547,6 +562,7 @@ func (in *SpokeSpec) DeepCopyInto(out *SpokeSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + out.CleanupConfig = in.CleanupConfig } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SpokeSpec. diff --git a/fleetconfig-controller/charts/fleetconfig-controller/README.md b/fleetconfig-controller/charts/fleetconfig-controller/README.md index 827b0785..ea20fe90 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/README.md +++ b/fleetconfig-controller/charts/fleetconfig-controller/README.md @@ -115,6 +115,9 @@ Resource specifications for all klusterlet-managed containers. | `fleetConfig.spokes[0].createNamespace` | If true, create open-cluster-management namespace and agent namespace (open-cluster-management-agent for Default mode, for Hosted mode), otherwise use existing one. Do not edit this name if you are using the default hub-as-spoke mode. | `true` | | `fleetConfig.spokes[0].syncLabels` | If true, sync the labels from klusterlet to all agent resources. | `false` | | `fleetConfig.spokes[0].clusterARN` | The ARN of the spoke cluster. This is only required if configuring an EKS FleetConfig. Example: "arn:aws:eks:us-west-2::cluster/". 
| `""` | +| `fleetConfig.spokes[0].cleanupConfig.purgeKlusterletOperator` | If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted when the klusterlet is unjoined from its Hub cluster. | `true` | +| `fleetConfig.spokes[0].cleanupConfig.purgeKubeconfigSecret` | If set, the kubeconfig secret will will be automatically deleted after the agent has taken over managing the Spoke. | `false` | +| `fleetConfig.spokes[0].cleanupConfig.purgeAgentNamespace` | If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. | `false` | | `fleetConfig.spokes[0].kubeconfig.context` | The context to use in the kubeconfig file. Leave empty to use the current context. | `""` | | `fleetConfig.spokes[0].kubeconfig.inCluster` | If set, the kubeconfig will be read from the cluster. Only applicable for same-cluster operations. | `true` | | `fleetConfig.spokes[0].kubeconfig.secretReference.name` | The name of the secret. | `""` | @@ -124,7 +127,6 @@ Resource specifications for all klusterlet-managed containers. | `fleetConfig.spokes[0].proxyUrl` | URL of a forward proxy server used by agents to connect to the Hub cluster, optional. | `""` | | `fleetConfig.spokes[0].klusterlet.annotations` | Annotations to apply to the spoke cluster. If not present, the 'agent.open-cluster-management.io/' prefix is added to each key. Each annotation is added to klusterlet.spec.registrationConfiguration.clusterAnnotations on the spoke and subsequently to the ManagedCluster on the hub. These annotations take precedence over the global spoke annotations. | `{}` | | `fleetConfig.spokes[0].klusterlet.mode` | Deployment mode for klusterlet. Options: Default (agents run on spoke cluster) | Hosted (agents run on hub cluster). | `Default` | -| `fleetConfig.spokes[0].klusterlet.purgeOperator` | If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted when the klusterlet is unjoined from its Hub cluster. | `true` | | `fleetConfig.spokes[0].klusterlet.forceInternalEndpointLookup` | If true, the klusterlet agent will start the cluster registration process by looking for the internal endpoint from the cluster-info ConfigMap in the Hub cluster instead of using the Hub's public API server endpoint. Set to true when registering the hub cluster as a spoke. | `true` | | `fleetConfig.spokes[0].klusterlet.managedClusterKubeconfig.context` | The context to use in the kubeconfig file. | `""` | | `fleetConfig.spokes[0].klusterlet.managedClusterKubeconfig.inCluster` | If set, the kubeconfig will be read from the cluster. Only applicable for same-cluster operations. | `false` | diff --git a/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml b/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml index c9fd2a69..c78a462b 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml @@ -68,6 +68,28 @@ spec: - configName type: object type: array + cleanupConfig: + default: {} + description: CleanupConfig is used to configure which resources should + be automatically garbage collected during cleanup. 
+ properties: + purgeAgentNamespace: + default: false + description: If true, the agent will attempt to garbage collect + it's own namespace after the spoke cluster is unjoined. + type: boolean + purgeKlusterletOperator: + default: true + description: |- + If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted + when the klusterlet is unjoined from its Hub cluster. + type: boolean + purgeKubeconfigSecret: + default: false + description: If set, the kubeconfig secret will will be automatically + deleted after the agent has taken over managing the Spoke. + type: boolean + type: object clusterARN: description: |- ClusterARN is the ARN of the spoke cluster. @@ -167,12 +189,6 @@ spec: - Default - Hosted type: string - purgeOperator: - default: true - description: |- - If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted - when the klusterlet is unjoined from its Hub cluster. - type: boolean resources: default: {} description: Resource specifications for all klusterlet-managed @@ -2281,12 +2297,6 @@ spec: description: URL of a forward proxy server used by agents to connect to the Hub cluster. type: string - purgeAgentNamespace: - default: false - description: PurgeAgentNamespace. If true, the agent will attempt - to garbage collect it's own namespace after the spoke cluster is - unjoined. - type: boolean syncLabels: description: If true, sync the labels from klusterlet to all agent resources. @@ -2298,6 +2308,7 @@ spec: If not set, defaults to the Hub's timeout. type: integer required: + - cleanupConfig - hubRef - kubeconfig type: object diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/fleetconfig.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/fleetconfig.yaml index 95bb8c1b..c2fbc188 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/fleetconfig.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/fleetconfig.yaml @@ -70,6 +70,10 @@ metadata: name: {{ .name }} namespace: {{ $releaseNamespace }} spec: + cleanupConfig: + purgeKlusterletOperator: {{ .cleanupConfig.purgeKlusterletOperator }} + purgeKubeconfigSecret: {{ .cleanupConfig.purgeKubeconfigSecret }} + purgeAgentNamespace: {{ .cleanupConfig.purgeAgentNamespace }} hubRef: name: {{ .hubRef.name }} namespace: {{ $releaseNamespace }} @@ -97,7 +101,6 @@ spec: annotations: {{- toYaml $.Values.fleetConfig.spokeAnnotations | nindent 6 }} {{- end }} mode: {{ .klusterlet.mode | quote }} - purgeOperator: {{ .klusterlet.purgeOperator }} featureGates: {{ include "featureGates" (dict "dict" $spokeFeatureGates) | quote }} forceInternalEndpointLookup: {{ .klusterlet.forceInternalEndpointLookup }} forceInternalEndpointLookupManaged: {{ .klusterlet.forceInternalEndpointLookupManaged }} diff --git a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml index dc81ca4f..fb5c88ea 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml @@ -183,6 +183,9 @@ fleetConfig: ## @param fleetConfig.spokes[0].createNamespace If true, create open-cluster-management namespace and agent namespace (open-cluster-management-agent for Default mode, for Hosted mode), otherwise use existing one. Do not edit this name if you are using the default hub-as-spoke mode. 
## @param fleetConfig.spokes[0].syncLabels If true, sync the labels from klusterlet to all agent resources. ## @param fleetConfig.spokes[0].clusterARN The ARN of the spoke cluster. This is only required if configuring an EKS FleetConfig. Example: "arn:aws:eks:us-west-2::cluster/". + ## @param fleetConfig.spokes[0].cleanupConfig.purgeKlusterletOperator If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted when the klusterlet is unjoined from its Hub cluster. + ## @param fleetConfig.spokes[0].cleanupConfig.purgeKubeconfigSecret If set, the kubeconfig secret will will be automatically deleted after the agent has taken over managing the Spoke. + ## @param fleetConfig.spokes[0].cleanupConfig.purgeAgentNamespace If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. ## @param fleetConfig.spokes[0].kubeconfig.context The context to use in the kubeconfig file. Leave empty to use the current context. ## @param fleetConfig.spokes[0].kubeconfig.inCluster If set, the kubeconfig will be read from the cluster. Only applicable for same-cluster operations. ## @param fleetConfig.spokes[0].kubeconfig.secretReference.name The name of the secret. @@ -192,7 +195,6 @@ fleetConfig: ## @param fleetConfig.spokes[0].proxyUrl URL of a forward proxy server used by agents to connect to the Hub cluster, optional. ## @param fleetConfig.spokes[0].klusterlet.annotations Annotations to apply to the spoke cluster. If not present, the 'agent.open-cluster-management.io/' prefix is added to each key. Each annotation is added to klusterlet.spec.registrationConfiguration.clusterAnnotations on the spoke and subsequently to the ManagedCluster on the hub. These annotations take precedence over the global spoke annotations. ## @param fleetConfig.spokes[0].klusterlet.mode Deployment mode for klusterlet. Options: Default (agents run on spoke cluster) | Hosted (agents run on hub cluster). - ## @param fleetConfig.spokes[0].klusterlet.purgeOperator If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted when the klusterlet is unjoined from its Hub cluster. ## @param fleetConfig.spokes[0].klusterlet.forceInternalEndpointLookup If true, the klusterlet agent will start the cluster registration process by looking for the internal endpoint from the cluster-info ConfigMap in the Hub cluster instead of using the Hub's public API server endpoint. Set to true when registering the hub cluster as a spoke. ## @param fleetConfig.spokes[0].klusterlet.managedClusterKubeconfig.context The context to use in the kubeconfig file. ## @param fleetConfig.spokes[0].klusterlet.managedClusterKubeconfig.inCluster If set, the kubeconfig will be read from the cluster. Only applicable for same-cluster operations. @@ -214,6 +216,10 @@ fleetConfig: createNamespace: true syncLabels: false clusterARN: "" + cleanupConfig: + purgeKlusterletOperator: true + purgeKubeconfigSecret: false + purgeAgentNamespace: false ## Kubeconfig details for the Spoke cluster. kubeconfig: context: "" @@ -229,7 +235,6 @@ fleetConfig: klusterlet: annotations: {} mode: "Default" - purgeOperator: true forceInternalEndpointLookup: true ## @descriptionStart ## External managed cluster kubeconfig, required if using hosted mode. 
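To make the new API shape concrete, here is a sketch of a `Spoke` that opts into all three cleanup behaviours; it is assembled from the CRD defaults, the `fleetconfig.yaml` template changes in this patch, and the dev manifest in `hack/dev/spoke.yaml`, not copied from the repository.

```yaml
apiVersion: fleetconfig.open-cluster-management.io/v1beta1
kind: Spoke
metadata:
  name: spoke
  namespace: fleetconfig-system
spec:
  hubRef:
    name: hub
    namespace: fleetconfig-system
  cleanupConfig:
    purgeKlusterletOperator: true  # CRD default: purge the klusterlet operator and open-cluster-management namespaces on unjoin
    purgeKubeconfigSecret: true    # defaults to false: delete the kubeconfig secret once the agent has taken over the Spoke
    purgeAgentNamespace: true      # defaults to false: the agent garbage collects its own namespace after unjoin
  kubeconfig:
    secretReference:
      name: spoke-kubeconfig
      kubeconfigKey: value
```

The e2e values in `test/data/fleetconfig-values.yaml` enable the same three flags, so this mirrors what the chart's `fleetconfig.yaml` template renders for a secret-referenced spoke.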
diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index bbd09892..50fd387f 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -38,11 +38,7 @@ import ( func (r *SpokeReconciler) cleanup(ctx context.Context, spoke *v1beta1.Spoke, hubKubeconfig []byte) error { switch r.InstanceType { case v1beta1.InstanceTypeManager: - originalSpoke, ok := ctx.Value(originalSpokeKey).(*v1beta1.Spoke) // use the original object to check conditions/finalizers - if !ok { - originalSpoke = spoke.DeepCopy() - } - pivotComplete := originalSpoke.PivotComplete() + pivotComplete := spoke.PivotComplete() err := r.doHubCleanup(ctx, spoke, hubKubeconfig, pivotComplete) if err != nil { return err @@ -175,27 +171,11 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h // precreate the namespace that the agent will be installed into // this prevents it from being automatically garbage collected when the spoke is deregistered if r.InstanceType != v1beta1.InstanceTypeUnified { - spokeRestCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) - if err != nil { - logger.Error(err, "failed to create agent namespace", "spoke", spoke.Name) - return err - } - spokeCli, err := client.New(spokeRestCfg, client.Options{}) + err = r.createAgentNamespace(ctx, spokeKubeconfig) if err != nil { logger.Error(err, "failed to create agent namespace", "spoke", spoke.Name) return err } - agentNamespace := os.Getenv(v1beta1.ControllerNamespaceEnvVar) // manager.go enforces that this is not "" - ns := &corev1.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - Name: agentNamespace, - }, - } - err = spokeCli.Create(ctx, ns) - if err != nil && !kerrs.IsNotFound(err) { - logger.Error(err, "failed to create agent namespace", "spoke", spoke.Name) - return err - } } } @@ -243,6 +223,12 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h logger.V(0).Info("labeled ManagedCluster as hub-as-spoke", "name", spoke.Name) } + err = r.deleteKubeconfigSecret(ctx, spoke) + if err != nil { + logger.V(1).Info("warning: failed to remove spoke's kubeconfig secret", "spoke", spoke.Name, "error", err) + return err + } + if !spoke.IsHubAsSpoke() { adc := &addonv1alpha1.AddOnDeploymentConfig{ ObjectMeta: metav1.ObjectMeta{ @@ -286,6 +272,56 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h return nil } +func (r *SpokeReconciler) createAgentNamespace(ctx context.Context, spokeKubeconfig []byte) error { + spokeRestCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) + if err != nil { + + return err + } + spokeCli, err := client.New(spokeRestCfg, client.Options{}) + if err != nil { + return err + } + agentNamespace := os.Getenv(v1beta1.ControllerNamespaceEnvVar) // manager.go enforces that this is not "" + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: agentNamespace, + }, + } + err = spokeCli.Create(ctx, ns) + if err != nil && !kerrs.IsNotFound(err) { + return err + } + return nil +} + +func (r *SpokeReconciler) deleteKubeconfigSecret(ctx context.Context, spoke *v1beta1.Spoke) error { + if r.InstanceType != v1beta1.InstanceTypeManager { + return nil + } + if !spoke.PivotComplete() { + return nil + } + if !spoke.Spec.Kubeconfig.InCluster { + return nil + } + if !spoke.Spec.CleanupConfig.PurgeKubeconfigSecret { + return nil + } + + sec := &corev1.Secret{ 
+ ObjectMeta: metav1.ObjectMeta{ + Name: spoke.Spec.Kubeconfig.SecretReference.Name, + Namespace: spoke.Namespace, + }, + } + err := r.Delete(ctx, sec) + if err != nil && !kerrs.IsNotFound(err) { + return err + } + return nil +} + // doSpokeWork handles spoke-side work such as upgrades func (r *SpokeReconciler) doSpokeWork(ctx context.Context, spoke *v1beta1.Spoke, hub *v1beta1.Hub, klusterletValues *v1beta1.KlusterletChartConfig) error { logger := log.FromContext(ctx) @@ -518,7 +554,7 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo "open-cluster-management-agent-addon", "open-cluster-management", } - if spoke.Spec.PurgeAgentNamespace { + if spoke.Spec.CleanupConfig.PurgeAgentNamespace { agentNamespace := os.Getenv(v1beta1.ControllerNamespaceEnvVar) // manager.go enforces that this is not "" namespacesToDelete = append(namespacesToDelete, agentNamespace) } @@ -869,7 +905,7 @@ func (r *SpokeReconciler) unjoinSpoke(ctx context.Context, spoke *v1beta1.Spoke, unjoinArgs := append([]string{ "unjoin", "--cluster-name", spoke.GetName(), - fmt.Sprintf("--purge-operator=%t", spoke.Spec.Klusterlet.PurgeOperator), + fmt.Sprintf("--purge-operator=%t", spoke.Spec.CleanupConfig.PurgeKlusterletOperator), }, spoke.BaseArgs()...) unjoinArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, unjoinArgs) diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index 79363b0b..55601c24 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -23,7 +23,7 @@ import ( const ( warnHubNotFound = "hub not found, cannot validate spoke addons" - errAllowedSpokeUpdate = "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.klusterlet.valuesFrom, spec.kubeconfig, spec.addOns, spec.purgeAgentNamespace, spec.timeout, and spec.logVerbosity are allowed when updating a spoke" + errAllowedSpokeUpdate = "spoke contains changes which are not allowed; only changes to spec.klusterlet.annotations, spec.klusterlet.values, spec.klusterlet.valuesFrom, spec.kubeconfig, spec.addOns, spec.purgeAgentNamespace, spec.cleanupConfig.purgeKubeconfigSecret, spec.timeout, and spec.logVerbosity are allowed when updating a spoke" errAllowedHubUpdate = "only changes to spec.apiServer, spec.clusterManager.source.*, spec.hubAddOns, spec.addOnConfigs, spec.logVerbosity, spec.timeout, spec.registrationAuth, and spec.kubeconfig are allowed when updating the hub" ) @@ -104,7 +104,8 @@ func allowHubUpdate(oldHub, newHub *v1beta1.Hub) error { // - spec.addOns // - spec.timeout // - spec.logVerbosity -// - spec.purgeAgentNamespace +// - spec.cleanupConfig.purgeAgentNamespace +// - spec.cleanupConfig.purgeKubeconfigSecret func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { if !reflect.DeepEqual(newSpoke.Spec, oldSpoke.Spec) { oldSpokeCopy := oldSpoke.Spec.DeepCopy() @@ -123,8 +124,10 @@ func allowSpokeUpdate(oldSpoke, newSpoke *v1beta1.Spoke) error { newSpokeCopy.LogVerbosity = 0 oldSpokeCopy.Timeout = 0 newSpokeCopy.Timeout = 0 - oldSpokeCopy.PurgeAgentNamespace = false - newSpokeCopy.PurgeAgentNamespace = false + oldSpokeCopy.CleanupConfig.PurgeAgentNamespace = false + newSpokeCopy.CleanupConfig.PurgeAgentNamespace = false + oldSpokeCopy.CleanupConfig.PurgeKubeconfigSecret = false + 
newSpokeCopy.CleanupConfig.PurgeKubeconfigSecret = false if !reflect.DeepEqual(oldSpokeCopy, newSpokeCopy) { return errors.New(errAllowedSpokeUpdate) diff --git a/fleetconfig-controller/test/data/fleetconfig-values.yaml b/fleetconfig-controller/test/data/fleetconfig-values.yaml index 89a9d767..68bd1232 100644 --- a/fleetconfig-controller/test/data/fleetconfig-values.yaml +++ b/fleetconfig-controller/test/data/fleetconfig-values.yaml @@ -42,7 +42,10 @@ fleetConfig: - configName: fleetconfig-controller-agent createNamespace: true syncLabels: false - purgeAgentNamespace: true + cleanupConfig: + purgeAgentNamespace: true + purgeKubeconfigSecret: true + purgeKlusterletOperator: true kubeconfig: # secret is provisioned during E2E test setup secretReference: diff --git a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go index 916135eb..b94f74e6 100644 --- a/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go +++ b/fleetconfig-controller/test/e2e/v1beta1_hub_spoke.go @@ -28,6 +28,7 @@ import ( corev1 "k8s.io/api/core/v1" kerrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" ktypes "k8s.io/apimachinery/pkg/types" operatorv1 "open-cluster-management.io/api/operator/v1" "open-cluster-management.io/ocm/pkg/operator/helpers/chart" @@ -120,19 +121,15 @@ var _ = Describe("hub and spoke", Label("v1beta1"), Serial, Ordered, func() { }) It("should successfully upgrade spoke Klusterlet, with no kubeconfig secret", func() { - By("deleting the secret") + By("confirming the kubeconfig secret is deleted") EventuallyWithOffset(1, func() error { - secret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: spokeSecretName, - Namespace: fcNamespace, - }, - } - err := tc.kClient.Delete(tc.ctx, secret) + secret := &corev1.Secret{} + err := tc.kClient.Get(tc.ctx, types.NamespacedName{Namespace: fcNamespace, Name: spokeSecretName}, secret) if err != nil { return client.IgnoreNotFound(err) } - return nil + utils.Info("kubeconfig secret still exists") + return err }, 1*time.Minute, 1*time.Second).Should(Succeed()) By("updating the klusterlet values and verifying that the upgrade is successful") From 0b7fc480b4f8434475a206c2a0671ce6935292c8 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 16:58:12 -0700 Subject: [PATCH 36/62] test: update test values Signed-off-by: Artur Shad Nik --- fleetconfig-controller/test/data/fleetconfig-values.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/fleetconfig-controller/test/data/fleetconfig-values.yaml b/fleetconfig-controller/test/data/fleetconfig-values.yaml index 68bd1232..c8398162 100644 --- a/fleetconfig-controller/test/data/fleetconfig-values.yaml +++ b/fleetconfig-controller/test/data/fleetconfig-values.yaml @@ -29,7 +29,6 @@ fleetConfig: foo: "not-bar" baz: "quux" mode: "Default" - purgeOperator: true forceInternalEndpointLookup: true forceInternalEndpointLookupManaged: false singleton: false @@ -56,7 +55,6 @@ fleetConfig: baz: "quux" foo: "bar" mode: "Default" - purgeOperator: true forceInternalEndpointLookup: true forceInternalEndpointLookupManaged: false singleton: false \ No newline at end of file From d58c64bed1a64de5053a1f007070368b0f027893 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 17:06:03 -0700 Subject: [PATCH 37/62] chore: logs Signed-off-by: Artur Shad Nik --- .../controller/v1beta1/spoke_handler.go | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git 
a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 50fd387f..cfeb66a3 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -171,7 +171,7 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h // precreate the namespace that the agent will be installed into // this prevents it from being automatically garbage collected when the spoke is deregistered if r.InstanceType != v1beta1.InstanceTypeUnified { - err = r.createAgentNamespace(ctx, spokeKubeconfig) + err = r.createAgentNamespace(ctx, spoke.Name, spokeKubeconfig) if err != nil { logger.Error(err, "failed to create agent namespace", "spoke", spoke.Name) return err @@ -272,7 +272,8 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h return nil } -func (r *SpokeReconciler) createAgentNamespace(ctx context.Context, spokeKubeconfig []byte) error { +func (r *SpokeReconciler) createAgentNamespace(ctx context.Context, spokeName string, spokeKubeconfig []byte) error { + logger := log.FromContext(ctx) spokeRestCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) if err != nil { @@ -292,20 +293,16 @@ func (r *SpokeReconciler) createAgentNamespace(ctx context.Context, spokeKubecon if err != nil && !kerrs.IsNotFound(err) { return err } + logger.V(1).Info("agent namespace configured", "spoke", spokeName, "namespace", agentNamespace) return nil } func (r *SpokeReconciler) deleteKubeconfigSecret(ctx context.Context, spoke *v1beta1.Spoke) error { - if r.InstanceType != v1beta1.InstanceTypeManager { - return nil - } - if !spoke.PivotComplete() { - return nil - } - if !spoke.Spec.Kubeconfig.InCluster { - return nil - } - if !spoke.Spec.CleanupConfig.PurgeKubeconfigSecret { + logger := log.FromContext(ctx) + if r.InstanceType != v1beta1.InstanceTypeManager || + !spoke.PivotComplete() || + !spoke.Spec.Kubeconfig.InCluster || + !spoke.Spec.CleanupConfig.PurgeKubeconfigSecret { return nil } @@ -319,6 +316,7 @@ func (r *SpokeReconciler) deleteKubeconfigSecret(ctx context.Context, spoke *v1b if err != nil && !kerrs.IsNotFound(err) { return err } + logger.V(1).Info("kubeconfig secret purged", "spoke", spoke.Name) return nil } From a15a36487d101c71d55661c459359685906b4976 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 17:06:46 -0700 Subject: [PATCH 38/62] chore: logs Signed-off-by: Artur Shad Nik --- .../internal/controller/v1beta1/spoke_handler.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index cfeb66a3..0c17fbb2 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -276,7 +276,6 @@ func (r *SpokeReconciler) createAgentNamespace(ctx context.Context, spokeName st logger := log.FromContext(ctx) spokeRestCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) if err != nil { - return err } spokeCli, err := client.New(spokeRestCfg, client.Options{}) @@ -298,7 +297,6 @@ func (r *SpokeReconciler) createAgentNamespace(ctx context.Context, spokeName st } func (r *SpokeReconciler) deleteKubeconfigSecret(ctx context.Context, spoke *v1beta1.Spoke) error { - logger := log.FromContext(ctx) if r.InstanceType != 
v1beta1.InstanceTypeManager || !spoke.PivotComplete() || !spoke.Spec.Kubeconfig.InCluster || @@ -306,6 +304,7 @@ func (r *SpokeReconciler) deleteKubeconfigSecret(ctx context.Context, spoke *v1b return nil } + logger := log.FromContext(ctx) sec := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: spoke.Spec.Kubeconfig.SecretReference.Name, From 4eca54226eeb6591c47b718fa612018a5e185c88 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 17:10:53 -0700 Subject: [PATCH 39/62] chore: default values Signed-off-by: Artur Shad Nik --- .../charts/fleetconfig-controller/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml index fb5c88ea..d01ab169 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml @@ -218,8 +218,8 @@ fleetConfig: clusterARN: "" cleanupConfig: purgeKlusterletOperator: true - purgeKubeconfigSecret: false - purgeAgentNamespace: false + purgeKubeconfigSecret: true + purgeAgentNamespace: true ## Kubeconfig details for the Spoke cluster. kubeconfig: context: "" From 0691cdc7b238914c65058ceefe48ccd8867e30ae Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 17:26:23 -0700 Subject: [PATCH 40/62] fix: conditional klusterlet purge Signed-off-by: Artur Shad Nik --- .../controller/v1beta1/spoke_handler.go | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 0c17fbb2..e921b509 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -533,29 +533,27 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo } // remove all remaining klusterlet resources that unjoin did not remove (because of the remaining AMW) - workClient, err := common.WorkClient(spokeKubeconfig) - if err != nil { - return err - } - operatorClient, err := common.OperatorClient(spokeKubeconfig) - if err != nil { - return err - } - - if err := operatorClient.OperatorV1().Klusterlets().Delete(ctx, "klusterlet", metav1.DeleteOptions{}); err != nil && !kerrs.IsNotFound(err) { - return err - } + var namespacesToDelete []string + if spoke.Spec.CleanupConfig.PurgeKlusterletOperator { + operatorClient, err := common.OperatorClient(spokeKubeconfig) + if err != nil { + return err + } - namespacesToDelete := []string{ - "open-cluster-management-agent", - "open-cluster-management-agent-addon", - "open-cluster-management", + if err := operatorClient.OperatorV1().Klusterlets().Delete(ctx, "klusterlet", metav1.DeleteOptions{}); err != nil && !kerrs.IsNotFound(err) { + return err + } + namespacesToDelete = append(namespacesToDelete, "open-cluster-management-agent", "open-cluster-management-agent-addon", "open-cluster-management") } if spoke.Spec.CleanupConfig.PurgeAgentNamespace { agentNamespace := os.Getenv(v1beta1.ControllerNamespaceEnvVar) // manager.go enforces that this is not "" namespacesToDelete = append(namespacesToDelete, agentNamespace) } + workClient, err := common.WorkClient(spokeKubeconfig) + if err != nil { + return err + } restCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) if err != nil { return err From 
e30ce4967dec33f888780e19f8fdf8c9f85cd701 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 17:41:48 -0700 Subject: [PATCH 41/62] fix: nil check cleanupConfig Signed-off-by: Artur Shad Nik --- .../charts/fleetconfig-controller/README.md | 4 ++-- .../fleetconfig-controller/templates/fleetconfig.yaml | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/README.md b/fleetconfig-controller/charts/fleetconfig-controller/README.md index ea20fe90..40aae5ba 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/README.md +++ b/fleetconfig-controller/charts/fleetconfig-controller/README.md @@ -116,8 +116,8 @@ Resource specifications for all klusterlet-managed containers. | `fleetConfig.spokes[0].syncLabels` | If true, sync the labels from klusterlet to all agent resources. | `false` | | `fleetConfig.spokes[0].clusterARN` | The ARN of the spoke cluster. This is only required if configuring an EKS FleetConfig. Example: "arn:aws:eks:us-west-2::cluster/". | `""` | | `fleetConfig.spokes[0].cleanupConfig.purgeKlusterletOperator` | If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted when the klusterlet is unjoined from its Hub cluster. | `true` | -| `fleetConfig.spokes[0].cleanupConfig.purgeKubeconfigSecret` | If set, the kubeconfig secret will will be automatically deleted after the agent has taken over managing the Spoke. | `false` | -| `fleetConfig.spokes[0].cleanupConfig.purgeAgentNamespace` | If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. | `false` | +| `fleetConfig.spokes[0].cleanupConfig.purgeKubeconfigSecret` | If set, the kubeconfig secret will will be automatically deleted after the agent has taken over managing the Spoke. | `true` | +| `fleetConfig.spokes[0].cleanupConfig.purgeAgentNamespace` | If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. | `true` | | `fleetConfig.spokes[0].kubeconfig.context` | The context to use in the kubeconfig file. Leave empty to use the current context. | `""` | | `fleetConfig.spokes[0].kubeconfig.inCluster` | If set, the kubeconfig will be read from the cluster. Only applicable for same-cluster operations. | `true` | | `fleetConfig.spokes[0].kubeconfig.secretReference.name` | The name of the secret. 
| `""` | diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/fleetconfig.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/fleetconfig.yaml index c2fbc188..343481cb 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/fleetconfig.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/fleetconfig.yaml @@ -70,10 +70,12 @@ metadata: name: {{ .name }} namespace: {{ $releaseNamespace }} spec: + {{- with .cleanupConfig }} cleanupConfig: - purgeKlusterletOperator: {{ .cleanupConfig.purgeKlusterletOperator }} - purgeKubeconfigSecret: {{ .cleanupConfig.purgeKubeconfigSecret }} - purgeAgentNamespace: {{ .cleanupConfig.purgeAgentNamespace }} + purgeKlusterletOperator: {{ .purgeKlusterletOperator | default true }} + purgeKubeconfigSecret: {{ .purgeKubeconfigSecret | default false }} + purgeAgentNamespace: {{ .purgeAgentNamespace | default false }} + {{- end }} hubRef: name: {{ .hubRef.name }} namespace: {{ $releaseNamespace }} From 62ccf9aa8431787af79fc78194867cf9deb17bce Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 17:56:05 -0700 Subject: [PATCH 42/62] chore: rabbit Signed-off-by: Artur Shad Nik --- .../charts/fleetconfig-controller/values.yaml | 4 ++-- .../internal/controller/v1beta1/spoke_handler.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml index d01ab169..15045707 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/values.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/values.yaml @@ -184,8 +184,8 @@ fleetConfig: ## @param fleetConfig.spokes[0].syncLabels If true, sync the labels from klusterlet to all agent resources. ## @param fleetConfig.spokes[0].clusterARN The ARN of the spoke cluster. This is only required if configuring an EKS FleetConfig. Example: "arn:aws:eks:us-west-2::cluster/". ## @param fleetConfig.spokes[0].cleanupConfig.purgeKlusterletOperator If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted when the klusterlet is unjoined from its Hub cluster. - ## @param fleetConfig.spokes[0].cleanupConfig.purgeKubeconfigSecret If set, the kubeconfig secret will will be automatically deleted after the agent has taken over managing the Spoke. - ## @param fleetConfig.spokes[0].cleanupConfig.purgeAgentNamespace If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. + ## @param fleetConfig.spokes[0].cleanupConfig.purgeKubeconfigSecret If set, the kubeconfig secret will be automatically deleted after the agent has taken over managing the Spoke. + ## @param fleetConfig.spokes[0].cleanupConfig.purgeAgentNamespace If true, the agent will attempt to garbage collect its own namespace after the spoke cluster is unjoined. ## @param fleetConfig.spokes[0].kubeconfig.context The context to use in the kubeconfig file. Leave empty to use the current context. ## @param fleetConfig.spokes[0].kubeconfig.inCluster If set, the kubeconfig will be read from the cluster. Only applicable for same-cluster operations. ## @param fleetConfig.spokes[0].kubeconfig.secretReference.name The name of the secret. 
diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index e921b509..0da26fe0 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -225,7 +225,7 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h err = r.deleteKubeconfigSecret(ctx, spoke) if err != nil { - logger.V(1).Info("warning: failed to remove spoke's kubeconfig secret", "spoke", spoke.Name, "error", err) + logger.Error(err, "warning: failed to remove spoke's kubeconfig secret", "spoke", spoke.Name) return err } @@ -289,7 +289,7 @@ func (r *SpokeReconciler) createAgentNamespace(ctx context.Context, spokeName st }, } err = spokeCli.Create(ctx, ns) - if err != nil && !kerrs.IsNotFound(err) { + if err != nil && !kerrs.IsAlreadyExists(err) { return err } logger.V(1).Info("agent namespace configured", "spoke", spokeName, "namespace", agentNamespace) From 19fb14154ec790ab84a33bb3301f75eb8e60785c Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 17:59:47 -0700 Subject: [PATCH 43/62] chore: make reviewable Signed-off-by: Artur Shad Nik --- .../charts/fleetconfig-controller/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/README.md b/fleetconfig-controller/charts/fleetconfig-controller/README.md index 40aae5ba..a767804d 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/README.md +++ b/fleetconfig-controller/charts/fleetconfig-controller/README.md @@ -116,8 +116,8 @@ Resource specifications for all klusterlet-managed containers. | `fleetConfig.spokes[0].syncLabels` | If true, sync the labels from klusterlet to all agent resources. | `false` | | `fleetConfig.spokes[0].clusterARN` | The ARN of the spoke cluster. This is only required if configuring an EKS FleetConfig. Example: "arn:aws:eks:us-west-2::cluster/". | `""` | | `fleetConfig.spokes[0].cleanupConfig.purgeKlusterletOperator` | If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted when the klusterlet is unjoined from its Hub cluster. | `true` | -| `fleetConfig.spokes[0].cleanupConfig.purgeKubeconfigSecret` | If set, the kubeconfig secret will will be automatically deleted after the agent has taken over managing the Spoke. | `true` | -| `fleetConfig.spokes[0].cleanupConfig.purgeAgentNamespace` | If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. | `true` | +| `fleetConfig.spokes[0].cleanupConfig.purgeKubeconfigSecret` | If set, the kubeconfig secret will be automatically deleted after the agent has taken over managing the Spoke. | `true` | +| `fleetConfig.spokes[0].cleanupConfig.purgeAgentNamespace` | If true, the agent will attempt to garbage collect its own namespace after the spoke cluster is unjoined. | `true` | | `fleetConfig.spokes[0].kubeconfig.context` | The context to use in the kubeconfig file. Leave empty to use the current context. | `""` | | `fleetConfig.spokes[0].kubeconfig.inCluster` | If set, the kubeconfig will be read from the cluster. Only applicable for same-cluster operations. | `true` | | `fleetConfig.spokes[0].kubeconfig.secretReference.name` | The name of the secret. 
| `""` | From ac12489ef87aad80cdfaabfdc581233257523d48 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 18:07:20 -0700 Subject: [PATCH 44/62] fix: explicit fcc agent exclusion when in unified mode Signed-off-by: Artur Shad Nik --- fleetconfig-controller/internal/webhook/v1beta1/validation.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fleetconfig-controller/internal/webhook/v1beta1/validation.go b/fleetconfig-controller/internal/webhook/v1beta1/validation.go index 55601c24..8288ec17 100644 --- a/fleetconfig-controller/internal/webhook/v1beta1/validation.go +++ b/fleetconfig-controller/internal/webhook/v1beta1/validation.go @@ -373,11 +373,11 @@ func validateAddonNotInUse(ctx context.Context, removedAddons []string, fieldPat func (v *SpokeCustomValidator) validateAddons(ctx context.Context, cli client.Client, newObject *v1beta1.Spoke) (admission.Warnings, field.ErrorList) { errs := field.ErrorList{} - if newObject.IsHubAsSpoke() { + if newObject.IsHubAsSpoke() || v.instanceType == v1beta1.InstanceTypeUnified { if slices.ContainsFunc(newObject.Spec.AddOns, func(a v1beta1.AddOn) bool { return a.ConfigName == v1beta1.FCCAddOnName }) { - errs = append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "hub-as-spoke Spoke cannot enable fleetconfig-controller-agent addon")) + errs = append(errs, field.Invalid(field.NewPath("spec").Child("addOns"), newObject.Spec.AddOns, "fleetconfig-controller-agent addon must not be enabled for hub-as-spoke Spokes, or when using Unified mode")) } } else if v.instanceType != v1beta1.InstanceTypeUnified { // fcc-agent MUST be enabled when using manager-agent (addon), MUST NOT be enabled when using unified mode if !slices.ContainsFunc(newObject.Spec.AddOns, func(a v1beta1.AddOn) bool { From 6c83eb6468685be61baae060f5c3f55e881de3df Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 20:26:39 -0700 Subject: [PATCH 45/62] chore: some review comments Signed-off-by: Artur Shad Nik --- .../api/v1beta1/constants.go | 20 +++++++++++-- .../templates/deployment.yaml | 2 +- fleetconfig-controller/devspace.yaml | 2 +- .../internal/controller/v1beta1/addon.go | 29 +++++++++++++++---- .../controller/v1beta1/spoke_handler.go | 4 +-- 5 files changed, 45 insertions(+), 12 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index 87d51c36..f5c4d625 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -105,14 +105,23 @@ const ( // ControllerNamespaceEnvVar is the environment variable containing the namespace that the controller is deployed to. ControllerNamespaceEnvVar = "CONTROLLER_NAMESPACE" - // RoleNameEnvVar containing the name of the ClusterRole for fleetconfig-controller-manager. - RoleNameEnvVar = "ROLE_NAME" + // ClusterRoleNameEnvVar containing the name of the ClusterRole for fleetconfig-controller-manager. + ClusterRoleNameEnvVar = "ROLE_NAME" // FCCAddOnName is the name of the fleetconfig-controller addon FCCAddOnName = "fleetconfig-controller-agent" // DefaultFCCManagerRole is the default name of the fleetconfig-controller-manager ClusterRole DefaultFCCManagerRole = "fleetconfig-controller-manager-role" + + // NamespaceOCM is the open-cluster-management namespace + NamespaceOCM = "open-cluster-management" + + // NamespaceOCMAgent is the namespace for the open-cluster-management agent. 
+ NamespaceOCMAgent = "open-cluster-management-agent" + + // NamespaceOCMAgentAddOn is the namespace for open-cluster-management agent addons. + NamespaceOCMAgentAddOn = "open-cluster-management-agent-addon" ) // SupportedInstanceTypes are the valid cluster types that the controller can be installed in. @@ -122,6 +131,13 @@ var SupportedInstanceTypes = []string{ InstanceTypeUnified, } +// OCMSpokeNamespaces are the namespaces created on an OCM managed cluster +var OCMSpokeNamespaces = []string{ + NamespaceOCM, + NamespaceOCMAgent, + NamespaceOCMAgentAddOn, +} + // FleetConfig labels const ( // LabelManagedClusterType is the label key for the managed cluster type. diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml index 6fd5981e..770a662a 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment.yaml @@ -52,7 +52,7 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace - - name: ROLE_NAME + - name: CLUSTER_ROLE_NAME value: {{ include "chart.fullname" . }}-manager-role image: {{ include "controller.image" . }} imagePullPolicy: {{ quote .Values.image.pullPolicy }} diff --git a/fleetconfig-controller/devspace.yaml b/fleetconfig-controller/devspace.yaml index 28ab53e1..dd6a4c7c 100644 --- a/fleetconfig-controller/devspace.yaml +++ b/fleetconfig-controller/devspace.yaml @@ -27,7 +27,7 @@ profiles: patches: - path: vars.FLEETCONFIG_ENABLED.value op: replace - value: false + value: false - name: v1beta1 patches: - path: vars.FLEETCONFIG_ENABLED.value diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 6bdf3beb..4a7ea797 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -387,8 +387,6 @@ func handleAddonEnable(ctx context.Context, spoke *v1beta1.Spoke, addons []v1bet continue } // TODO - do this natively with clusteradm once https://github.com/open-cluster-management-io/clusteradm/issues/501 is resolved. - // When switching to using Placements strategy once https://github.com/open-cluster-management-io/ocm/pull/1123 is merged, - // this will still need to be done, just in a different part of the code. if a.ConfigName == v1beta1.FCCAddOnName { err = patchFCCMca(ctx, spoke.Name, addonC) if err != nil { @@ -729,7 +727,7 @@ func allOwnersAddOns(mws []workv1.ManifestWork) bool { // bindAddonAgent creates the necessary bindings for fcc agent to access hub resources func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spoke) error { - roleName := os.Getenv(v1beta1.RoleNameEnvVar) + roleName := os.Getenv(v1beta1.ClusterRoleNameEnvVar) if roleName == "" { roleName = v1beta1.DefaultFCCManagerRole } @@ -753,11 +751,15 @@ func (r *SpokeReconciler) bindAddonAgent(ctx context.Context, spoke *v1beta1.Spo return nil } -// createBinding creates a binding for a given role +// createBinding creates a role binding for a given role. +// The role binding follows a different naming format than OCM uses for addon agents. 
+// We need to append the spoke name to avoid possible conflicts in cases where multiple spokes exist in 1 namespace func (r *SpokeReconciler) createBinding(ctx context.Context, roleRef rbacv1.RoleRef, namespace, spokeName string) error { + logger := log.FromContext(ctx) + binding := &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("open-cluster-management:%s:%s:agent-%s", // this is a different naming format than OCM uses for addon agents. we need to append the spoke name to avoid possible conflicts in cases where multiple spokes exist in 1 namespace + Name: fmt.Sprintf("open-cluster-management:%s:%s:agent-%s", v1beta1.FCCAddOnName, strings.ToLower(roleRef.Kind), spokeName), Namespace: namespace, Labels: map[string]string{ @@ -776,7 +778,22 @@ func (r *SpokeReconciler) createBinding(ctx context.Context, roleRef rbacv1.Role err := r.Create(ctx, binding, &client.CreateOptions{}) if err != nil { - return client.IgnoreAlreadyExists(err) + if !kerrs.IsAlreadyExists(err) { + logger.Error(err, "failed to create role binding for addon") + return err + } + curr := binding.DeepCopy() + err = r.Get(ctx, types.NamespacedName{Namespace: curr.Namespace, Name: curr.Name}, curr) + if err != nil { + logger.Error(err, "failed to get role binding for addon") + return err + } + binding.SetResourceVersion(curr.ResourceVersion) + err = r.Patch(ctx, binding, client.MergeFrom(curr)) + if err != nil { + logger.Error(err, "failed to patch role binding for addon") + return err + } } return nil } diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 0da26fe0..13918cb0 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -495,7 +495,7 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke return nil } -// doHubCleanup handles all the required cleanup of a spoke cluster when deregistering a Spoke +// doSpokeCleanup handles all the required cleanup of a spoke cluster when deregistering a Spoke func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spoke, pivotComplete bool) error { logger := log.FromContext(ctx) // requeue until preflight is complete by the hub's controller @@ -543,7 +543,7 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo if err := operatorClient.OperatorV1().Klusterlets().Delete(ctx, "klusterlet", metav1.DeleteOptions{}); err != nil && !kerrs.IsNotFound(err) { return err } - namespacesToDelete = append(namespacesToDelete, "open-cluster-management-agent", "open-cluster-management-agent-addon", "open-cluster-management") + namespacesToDelete = append(namespacesToDelete, v1beta1.OCMSpokeNamespaces...) 
} if spoke.Spec.CleanupConfig.PurgeAgentNamespace { agentNamespace := os.Getenv(v1beta1.ControllerNamespaceEnvVar) // manager.go enforces that this is not "" From df3ae69f8972000e69788887d558a6c7cca82834 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 20:31:43 -0700 Subject: [PATCH 46/62] chore: narrow scope of ns perms Signed-off-by: Artur Shad Nik --- .../templates/ocm/fcc-addon/addon-template.yaml | 1 + .../internal/controller/v1beta1/spoke_handler.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml index d55ffb02..0911def6 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -85,6 +85,7 @@ spec: - apiGroups: [""] resources: ["namespaces"] verbs: ["get", "list", "watch", "delete"] + resourceNames: ["{{ .Release.Namespace }}", "open-cluster-management", "open-cluster-management-agent", "open-cluster-management-agent-addon"] - apiGroups: [""] resources: ["pods", "serviceaccounts"] verbs: ["get", "list", "watch"] diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 13918cb0..27f5049e 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -60,7 +60,7 @@ func (r *SpokeReconciler) cleanup(ctx context.Context, spoke *v1beta1.Spoke, hub return r.doSpokeCleanup(ctx, spoke, true) default: // this is guarded against when the manager is initialized. should never reach this point - panic(fmt.Sprintf("unknown cluster type %s. Must be one of %v", r.InstanceType, v1beta1.SupportedInstanceTypes)) + panic(fmt.Sprintf("unknown instance type %s. Must be one of %v", r.InstanceType, v1beta1.SupportedInstanceTypes)) } } From 8145181d0f353010e1d5b1dbbfefb125d957b0b3 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 20:55:20 -0700 Subject: [PATCH 47/62] chore: make addon mw checks more robust Signed-off-by: Artur Shad Nik --- fleetconfig-controller/api/v1beta1/constants.go | 11 +++++++---- .../internal/controller/v1beta1/addon.go | 12 ++++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index f5c4d625..56bbc65c 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -77,6 +77,7 @@ const ( ManagedClusterTypeHubAsSpoke = "hub-as-spoke" ) +// Addon mode const ( // InstanceTypeManager indicates that the controller is running in a Hub cluster and only handles day 1 Spoke operations. InstanceTypeManager = "manager" @@ -108,13 +109,13 @@ const ( // ClusterRoleNameEnvVar containing the name of the ClusterRole for fleetconfig-controller-manager. ClusterRoleNameEnvVar = "ROLE_NAME" - // FCCAddOnName is the name of the fleetconfig-controller addon + // FCCAddOnName is the name of the fleetconfig-controller addon. 
FCCAddOnName = "fleetconfig-controller-agent" - // DefaultFCCManagerRole is the default name of the fleetconfig-controller-manager ClusterRole + // DefaultFCCManagerRole is the default name of the fleetconfig-controller-manager ClusterRole. DefaultFCCManagerRole = "fleetconfig-controller-manager-role" - // NamespaceOCM is the open-cluster-management namespace + // NamespaceOCM is the open-cluster-management namespace. NamespaceOCM = "open-cluster-management" // NamespaceOCMAgent is the namespace for the open-cluster-management agent. @@ -122,6 +123,8 @@ const ( // NamespaceOCMAgentAddOn is the namespace for open-cluster-management agent addons. NamespaceOCMAgentAddOn = "open-cluster-management-agent-addon" + + ManifestWorkAddOnNameLabelKey = "open-cluster-management.io/addon-name" ) // SupportedInstanceTypes are the valid cluster types that the controller can be installed in. @@ -131,7 +134,7 @@ var SupportedInstanceTypes = []string{ InstanceTypeUnified, } -// OCMSpokeNamespaces are the namespaces created on an OCM managed cluster +// OCMSpokeNamespaces are the namespaces created on an OCM managed cluster. var OCMSpokeNamespaces = []string{ NamespaceOCM, NamespaceOCMAgent, diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 4a7ea797..12791aed 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -695,8 +695,16 @@ func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Client } if len(manifestWorks.Items) == expectedWorks { - logger.V(1).Info("addon manifestWorks cleanup completed", "spokeName", spokeName, "remainingManifestWorks", len(manifestWorks.Items)) - return true, nil + if shouldCleanAll { + logger.V(1).Info("addon manifestWorks cleanup completed", "spokeName", spokeName, "remainingManifestWorks", len(manifestWorks.Items)) + return true, nil + } + mw := manifestWorks.Items[0] + val, ok := mw.Labels[v1beta1.ManifestWorkAddOnNameLabelKey] + if ok && val == v1beta1.FCCAddOnName { + logger.V(1).Info("addon manifestWorks cleanup completed", "spokeName", spokeName, "remainingManifestWork", mw.Name) + return true, nil + } } logger.V(3).Info("waiting for addon manifestWorks cleanup", From a09c84de2b5f0c65d2d500378a545996e5c6b8f7 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 21:10:40 -0700 Subject: [PATCH 48/62] fix: use update Signed-off-by: Artur Shad Nik --- fleetconfig-controller/api/v1beta1/constants.go | 2 -- .../internal/controller/v1beta1/addon.go | 10 +++++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index 56bbc65c..04b0dfb8 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -123,8 +123,6 @@ const ( // NamespaceOCMAgentAddOn is the namespace for open-cluster-management agent addons. NamespaceOCMAgentAddOn = "open-cluster-management-agent-addon" - - ManifestWorkAddOnNameLabelKey = "open-cluster-management.io/addon-name" ) // SupportedInstanceTypes are the valid cluster types that the controller can be installed in. 
diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 12791aed..cb3f124c 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -700,7 +700,7 @@ func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Client return true, nil } mw := manifestWorks.Items[0] - val, ok := mw.Labels[v1beta1.ManifestWorkAddOnNameLabelKey] + val, ok := mw.Labels[addonv1alpha1.AddonLabelKey] if ok && val == v1beta1.FCCAddOnName { logger.V(1).Info("addon manifestWorks cleanup completed", "spokeName", spokeName, "remainingManifestWork", mw.Name) return true, nil @@ -790,16 +790,16 @@ func (r *SpokeReconciler) createBinding(ctx context.Context, roleRef rbacv1.Role logger.Error(err, "failed to create role binding for addon") return err } - curr := binding.DeepCopy() - err = r.Get(ctx, types.NamespacedName{Namespace: curr.Namespace, Name: curr.Name}, curr) + curr := &rbacv1.RoleBinding{} + err = r.Get(ctx, types.NamespacedName{Namespace: binding.Namespace, Name: binding.Name}, curr) if err != nil { logger.Error(err, "failed to get role binding for addon") return err } binding.SetResourceVersion(curr.ResourceVersion) - err = r.Patch(ctx, binding, client.MergeFrom(curr)) + err = r.Update(ctx, binding) if err != nil { - logger.Error(err, "failed to patch role binding for addon") + logger.Error(err, "failed to update role binding for addon") return err } } From c91baa77c040f10a10f9639f49ff50b70bdb2e30 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 21:24:45 -0700 Subject: [PATCH 49/62] fix: add nil checks before clustermanager access Signed-off-by: Artur Shad Nik --- .../internal/controller/v1beta1/addon.go | 5 ++++- .../internal/controller/v1beta1/spoke_handler.go | 11 +++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index cb3f124c..94165e99 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -497,7 +497,10 @@ func handleHubAddons(ctx context.Context, addonC *addonapi.Clientset, hub *v1bet logger.V(0).Info("handleHubAddons", "fleetconfig", hub.Name) desiredAddOns := hub.Spec.HubAddOns - bundleVersion := hub.Spec.ClusterManager.Source.BundleVersion + bundleVersion := "latest" + if hub.Spec.ClusterManager != nil { + bundleVersion = hub.Spec.ClusterManager.Source.BundleVersion + } hubAddons, err := getHubAddOns(ctx, addonC) if err != nil { diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 27f5049e..e18f4241 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -225,7 +225,7 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h err = r.deleteKubeconfigSecret(ctx, spoke) if err != nil { - logger.Error(err, "warning: failed to remove spoke's kubeconfig secret", "spoke", spoke.Name) + logger.Error(err, "failed to remove spoke's kubeconfig secret", "spoke", spoke.Name) return err } @@ -631,11 +631,14 @@ func (r *SpokeReconciler) joinSpoke(ctx context.Context, spoke *v1beta1.Spoke, h "--feature-gates", spoke.Spec.Klusterlet.FeatureGates, 
fmt.Sprintf("--force-internal-endpoint-lookup=%t", spoke.Spec.Klusterlet.ForceInternalEndpointLookup), fmt.Sprintf("--singleton=%t", spoke.Spec.Klusterlet.Singleton), - // source args - "--bundle-version", hub.Spec.ClusterManager.Source.BundleVersion, - "--image-registry", hub.Spec.ClusterManager.Source.Registry, }, spoke.BaseArgs()...) + if hub.Spec.ClusterManager != nil { + // source args + joinArgs = append(joinArgs, + "--bundle-version", hub.Spec.ClusterManager.Source.BundleVersion, + "--image-registry", hub.Spec.ClusterManager.Source.Registry) + } for k, v := range spoke.Spec.Klusterlet.Annotations { joinArgs = append(joinArgs, fmt.Sprintf("--klusterlet-annotation=%s=%s", k, v)) } From 90ca8838124abd68395209f82e7c73ba003ac7e4 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Thu, 2 Oct 2025 21:39:09 -0700 Subject: [PATCH 50/62] chore: make reviewable Signed-off-by: Artur Shad Nik --- fleetconfig-controller/api/v1beta1/constants.go | 8 ++++++++ .../internal/controller/v1beta1/addon.go | 2 +- .../internal/controller/v1beta1/hub_controller.go | 4 ++-- .../internal/controller/v1beta1/spoke_handler.go | 4 ++-- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index 04b0dfb8..2d8d9037 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -200,3 +200,11 @@ var SupportedHubAddons = []string{ AddonArgoCD, AddonGPF, } + +const ( + // BundleVersionLatest is the latest OCM source version + BundleVersionLatest = "latest" + + // BundleVersionDefault is the default OCM source version + BundleVersionDefault = "default" +) diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 94165e99..106cad4c 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -497,7 +497,7 @@ func handleHubAddons(ctx context.Context, addonC *addonapi.Clientset, hub *v1bet logger.V(0).Info("handleHubAddons", "fleetconfig", hub.Name) desiredAddOns := hub.Spec.HubAddOns - bundleVersion := "latest" + bundleVersion := v1beta1.BundleVersionLatest if hub.Spec.ClusterManager != nil { bundleVersion = hub.Spec.ClusterManager.Source.BundleVersion } diff --git a/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go b/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go index 68d63848..d23d09f4 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go @@ -453,11 +453,11 @@ func (r *HubReconciler) hubNeedsUpgrade(ctx context.Context, hub *v1beta1.Hub, o logger := log.FromContext(ctx) logger.V(0).Info("hubNeedsUpgrade", "hub", hub.Name) - if hub.Spec.ClusterManager.Source.BundleVersion == "default" { + if hub.Spec.ClusterManager.Source.BundleVersion == v1beta1.BundleVersionDefault { logger.V(0).Info("clustermanager bundleVersion is default, skipping upgrade") return false, nil } - if hub.Spec.ClusterManager.Source.BundleVersion == "latest" { + if hub.Spec.ClusterManager.Source.BundleVersion == v1beta1.BundleVersionLatest { logger.V(0).Info("clustermanager bundleVersion is latest, attempting upgrade") return true, nil } diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go 
index e18f4241..f427fa8b 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -804,11 +804,11 @@ func (r *SpokeReconciler) spokeNeedsUpgrade(ctx context.Context, spoke *v1beta1. return true, nil } - if source.BundleVersion == "default" { + if source.BundleVersion == v1beta1.BundleVersionDefault { logger.V(0).Info("klusterlet bundleVersion is default, skipping upgrade") return false, nil } - if source.BundleVersion == "latest" { + if source.BundleVersion == v1beta1.BundleVersionLatest { logger.V(0).Info("klusterlet bundleVersion is latest, attempting upgrade") return true, nil } From d9a9dd34d07a496997c28728b5795576b1c3b922 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 08:53:43 -0700 Subject: [PATCH 51/62] chore: review changes Signed-off-by: Artur Shad Nik --- fleetconfig-controller/api/v1beta1/constants.go | 2 +- fleetconfig-controller/api/v1beta1/spoke_types.go | 2 +- .../charts/fleetconfig-controller/templates/deployment-dev.yaml | 2 +- fleetconfig-controller/config/devspace/hub/manager.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index 2d8d9037..9d6d379c 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -107,7 +107,7 @@ const ( ControllerNamespaceEnvVar = "CONTROLLER_NAMESPACE" // ClusterRoleNameEnvVar containing the name of the ClusterRole for fleetconfig-controller-manager. - ClusterRoleNameEnvVar = "ROLE_NAME" + ClusterRoleNameEnvVar = "CLUSTER_ROLE_NAME" // FCCAddOnName is the name of the fleetconfig-controller addon. FCCAddOnName = "fleetconfig-controller-agent" diff --git a/fleetconfig-controller/api/v1beta1/spoke_types.go b/fleetconfig-controller/api/v1beta1/spoke_types.go index 80565b42..3e455f1b 100644 --- a/fleetconfig-controller/api/v1beta1/spoke_types.go +++ b/fleetconfig-controller/api/v1beta1/spoke_types.go @@ -84,7 +84,7 @@ type SpokeSpec struct { // CleanupConfig is used to configure which resources should be automatically garbage collected during cleanup. // +kubebuilder:default:={} // +required - CleanupConfig CleanupConfig `json:"cleanupConfig,omitzero"` + CleanupConfig CleanupConfig `json:"cleanupConfig"` } // CleanupConfig is the configuration for cleaning up resources during Spoke cleanup. diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment-dev.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment-dev.yaml index 270d1113..52196756 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment-dev.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/deployment-dev.yaml @@ -38,7 +38,7 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace - - name: ROLE_NAME + - name: CLUSTER_ROLE_NAME value: {{ include "chart.fullname" . 
}}-manager-role ports: - containerPort: {{ .Values.webhookService.port }} diff --git a/fleetconfig-controller/config/devspace/hub/manager.yaml b/fleetconfig-controller/config/devspace/hub/manager.yaml index 5400d89c..94e8a450 100644 --- a/fleetconfig-controller/config/devspace/hub/manager.yaml +++ b/fleetconfig-controller/config/devspace/hub/manager.yaml @@ -28,7 +28,7 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace - - name: ROLE_NAME + - name: CLUSTER_ROLE_NAME value: fleetconfig-controller-manager-role image: quay.io/open-cluster-management/fleetconfig-controller:dev imagePullPolicy: IfNotPresent From e1e2fa564898ecab5113b80e1a0a2a08282e8d39 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 10:22:28 -0700 Subject: [PATCH 52/62] chore: docstring Signed-off-by: Artur Shad Nik --- fleetconfig-controller/api/v1beta1/spoke_types.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fleetconfig-controller/api/v1beta1/spoke_types.go b/fleetconfig-controller/api/v1beta1/spoke_types.go index 3e455f1b..85872a00 100644 --- a/fleetconfig-controller/api/v1beta1/spoke_types.go +++ b/fleetconfig-controller/api/v1beta1/spoke_types.go @@ -89,7 +89,7 @@ type SpokeSpec struct { // CleanupConfig is the configuration for cleaning up resources during Spoke cleanup. type CleanupConfig struct { - // If true, the agent will attempt to garbage collect it's own namespace after the spoke cluster is unjoined. + // If true, the agent will attempt to garbage collect its own namespace after the spoke cluster is unjoined. // +kubebuilder:default:=false // +optional PurgeAgentNamespace bool `json:"purgeAgentNamespace,omitempty"` @@ -100,7 +100,7 @@ type CleanupConfig struct { // +optional PurgeKlusterletOperator bool `json:"purgeKlusterletOperator,omitempty"` - // If set, the kubeconfig secret will will be automatically deleted after the agent has taken over managing the Spoke. + // If set, the kubeconfig secret will be automatically deleted after the agent has taken over managing the Spoke. // +kubebuilder:default:=false // +optional PurgeKubeconfigSecret bool `json:"purgeKubeconfigSecret,omitempty"` From 65fac350edf4418ed1a6f0b79832f01ebf2e2763 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 12:09:03 -0700 Subject: [PATCH 53/62] feat: use standalone watcher for agent cleanup Signed-off-by: Artur Shad Nik --- .../api/v1beta1/constants.go | 8 +- ...fig.open-cluster-management.io_spokes.yaml | 4 +- .../ocm/fcc-addon/addon-template.yaml | 2 + .../config/devspace/spoke/manager.yaml | 2 + .../controller/v1beta1/spoke_controller.go | 69 ++++++++++++++++ .../controller/v1beta1/spoke_handler.go | 57 +++++-------- .../internal/watch/watch.go | 82 +++++++++++++++++++ 7 files changed, 186 insertions(+), 38 deletions(-) create mode 100644 fleetconfig-controller/internal/watch/watch.go diff --git a/fleetconfig-controller/api/v1beta1/constants.go b/fleetconfig-controller/api/v1beta1/constants.go index 9d6d379c..ae28af3a 100644 --- a/fleetconfig-controller/api/v1beta1/constants.go +++ b/fleetconfig-controller/api/v1beta1/constants.go @@ -106,9 +106,12 @@ const ( // ControllerNamespaceEnvVar is the environment variable containing the namespace that the controller is deployed to. ControllerNamespaceEnvVar = "CONTROLLER_NAMESPACE" - // ClusterRoleNameEnvVar containing the name of the ClusterRole for fleetconfig-controller-manager. + // ClusterRoleNameEnvVar is the environment variable containing the name of the ClusterRole for fleetconfig-controller-manager. 
ClusterRoleNameEnvVar = "CLUSTER_ROLE_NAME" + // PurgeAgentNamespaceEnvVar is the environment variable used to signal to the agent whether or not it should garbage collect it install namespace. + PurgeAgentNamespaceEnvVar = "PURGE_AGENT_NAMESPACE" + // FCCAddOnName is the name of the fleetconfig-controller addon. FCCAddOnName = "fleetconfig-controller-agent" @@ -123,6 +126,9 @@ const ( // NamespaceOCMAgentAddOn is the namespace for open-cluster-management agent addons. NamespaceOCMAgentAddOn = "open-cluster-management-agent-addon" + + // AgentCleanupWatcherName is the name of the watcher for cleaning up the spoke agent. + AgentCleanupWatcherName = "agent-cleanup-watcher" ) // SupportedInstanceTypes are the valid cluster types that the controller can be installed in. diff --git a/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml b/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml index c78a462b..80ddca8d 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml @@ -76,7 +76,7 @@ spec: purgeAgentNamespace: default: false description: If true, the agent will attempt to garbage collect - it's own namespace after the spoke cluster is unjoined. + its own namespace after the spoke cluster is unjoined. type: boolean purgeKlusterletOperator: default: true @@ -86,7 +86,7 @@ spec: type: boolean purgeKubeconfigSecret: default: false - description: If set, the kubeconfig secret will will be automatically + description: If set, the kubeconfig secret will be automatically deleted after the agent has taken over managing the Spoke. type: boolean type: object diff --git a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml index 0911def6..7409f4fd 100644 --- a/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml +++ b/fleetconfig-controller/charts/fleetconfig-controller/templates/ocm/fcc-addon/addon-template.yaml @@ -59,6 +59,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: PURGE_AGENT_NAMESPACE + value: '{{ `{{PURGE_AGENT_NAMESPACE}}` }}' image: {{ include "controller.baseImage" . 
}} imagePullPolicy: {{ quote .Values.image.pullPolicy }} name: manager diff --git a/fleetconfig-controller/config/devspace/spoke/manager.yaml b/fleetconfig-controller/config/devspace/spoke/manager.yaml index ffd9461f..04f4c075 100644 --- a/fleetconfig-controller/config/devspace/spoke/manager.yaml +++ b/fleetconfig-controller/config/devspace/spoke/manager.yaml @@ -34,6 +34,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: PURGE_AGENT_NAMESPACE + value: "true" command: - /bin/bash - -c diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index ae1db081..a35bde57 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -22,7 +22,9 @@ import ( "os" "reflect" "slices" + "strconv" + corev1 "k8s.io/api/core/v1" kerrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -40,6 +42,9 @@ import ( "github.com/go-logr/logr" "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/kube" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/watch" + "github.com/open-cluster-management-io/lab/fleetconfig-controller/pkg/common" ) // SpokeReconciler reconciles a Spoke object @@ -261,6 +266,21 @@ func (r *SpokeReconciler) SetupWithManagerForHub(mgr ctrl.Manager) error { // SetupWithManagerForSpoke sets up the controller with the Manager to run on a Spoke cluster. func (r *SpokeReconciler) SetupWithManagerForSpoke(mgr ctrl.Manager) error { spokeName := os.Getenv(v1beta1.SpokeNameEnvVar) // we know this is set, because the mgr setup would have failed otherwise + + // set up a watcher that is independent of the reconcile loop, so that we can delete the controller AMW after the Spoke is fully deleted + watcher := watch.New(watch.Config{ + Client: mgr.GetClient(), + Log: r.Log.WithName(v1beta1.AgentCleanupWatcherName), + Interval: requeue, + Name: v1beta1.AgentCleanupWatcherName, + Condition: spokeDeletedCondition, + Handler: agentSelfDestruct, + }) + + err := mgr.Add(watcher) + if err != nil { + return err + } return ctrl.NewControllerManagedBy(mgr). 
For(&v1beta1.Spoke{}, builder.WithPredicates(predicate.NewPredicateFuncs( @@ -276,6 +296,55 @@ func (r *SpokeReconciler) SetupWithManagerForSpoke(mgr ctrl.Manager) error { Complete(r) } +func spokeDeletedCondition(ctx context.Context, c client.Client) (bool, error) { + spokeName := os.Getenv(v1beta1.SpokeNameEnvVar) + spokeNamespace := os.Getenv(v1beta1.SpokeNamespaceEnvVar) + spoke := &v1beta1.Spoke{} + err := c.Get(ctx, types.NamespacedName{ + Name: spokeName, + Namespace: spokeNamespace, + }, spoke) + + // condition is met when resource is NOT found + return kerrs.IsNotFound(err), client.IgnoreNotFound(err) +} + +func agentSelfDestruct(ctx context.Context, _ client.Client) error { + spokeKubeconfig, err := kube.RawFromInClusterRestConfig() + if err != nil { + return err + } + workClient, err := common.WorkClient(spokeKubeconfig) + if err != nil { + return err + } + restCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) + if err != nil { + return err + } + spokeClient, err := client.New(restCfg, client.Options{}) + if err != nil { + return err + } + + purgeNamespaceStr := os.Getenv(v1beta1.PurgeAgentNamespaceEnvVar) + purgeNamespace, err := strconv.ParseBool(purgeNamespaceStr) + if err != nil { + // fall back to orphaning the namespace which is less destructive + purgeNamespace = false + } + + if purgeNamespace { + agentNamespace := os.Getenv(v1beta1.ControllerNamespaceEnvVar) // manager.go enforces that this is not "" + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: agentNamespace}} + err := spokeClient.Delete(ctx, ns) + if err != nil && !kerrs.IsNotFound(err) { + return err + } + } + return workClient.WorkV1().AppliedManifestWorks().DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{}) +} + // sharedFieldsChanged checks whether the spec fields that are shared between Hub and Spokes were updated, // to prevent unnecessary reconciles of Spokes func sharedFieldsChanged(oldSpec, newSpec *v1beta1.HubSpec) bool { diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index f427fa8b..cb695c88 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" "slices" + "strconv" "dario.cat/mergo" certificatesv1 "k8s.io/api/certificates/v1" @@ -246,6 +247,10 @@ func (r *SpokeReconciler) doHubWork(ctx context.Context, spoke *v1beta1.Spoke, h Name: v1beta1.SpokeNamespaceEnvVar, Value: spoke.Namespace, }, + { + Name: v1beta1.PurgeAgentNamespaceEnvVar, + Value: strconv.FormatBool(spoke.Spec.CleanupConfig.PurgeAgentNamespace), + }, }, }, } @@ -299,7 +304,7 @@ func (r *SpokeReconciler) createAgentNamespace(ctx context.Context, spokeName st func (r *SpokeReconciler) deleteKubeconfigSecret(ctx context.Context, spoke *v1beta1.Spoke) error { if r.InstanceType != v1beta1.InstanceTypeManager || !spoke.PivotComplete() || - !spoke.Spec.Kubeconfig.InCluster || + spoke.Spec.Kubeconfig.InCluster || !spoke.Spec.CleanupConfig.PurgeKubeconfigSecret { return nil } @@ -533,8 +538,15 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo } // remove all remaining klusterlet resources that unjoin did not remove (because of the remaining AMW) - var namespacesToDelete []string if spoke.Spec.CleanupConfig.PurgeKlusterletOperator { + restCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) + if err != nil { + return err + } + 
spokeClient, err := client.New(restCfg, client.Options{}) + if err != nil { + return err + } operatorClient, err := common.OperatorClient(spokeKubeconfig) if err != nil { return err @@ -543,43 +555,18 @@ func (r *SpokeReconciler) doSpokeCleanup(ctx context.Context, spoke *v1beta1.Spo if err := operatorClient.OperatorV1().Klusterlets().Delete(ctx, "klusterlet", metav1.DeleteOptions{}); err != nil && !kerrs.IsNotFound(err) { return err } - namespacesToDelete = append(namespacesToDelete, v1beta1.OCMSpokeNamespaces...) - } - if spoke.Spec.CleanupConfig.PurgeAgentNamespace { - agentNamespace := os.Getenv(v1beta1.ControllerNamespaceEnvVar) // manager.go enforces that this is not "" - namespacesToDelete = append(namespacesToDelete, agentNamespace) - } - workClient, err := common.WorkClient(spokeKubeconfig) - if err != nil { - return err - } - restCfg, err := kube.RestConfigFromKubeconfig(spokeKubeconfig) - if err != nil { - return err - } - spokeClient, err := client.New(restCfg, client.Options{}) - if err != nil { - return err - } - for _, nsName := range namespacesToDelete { - if nsName == "" { - continue - } - ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: nsName}} - if err := spokeClient.Delete(ctx, ns); err != nil && !kerrs.IsNotFound(err) { - return err + for _, nsName := range v1beta1.OCMSpokeNamespaces { + if nsName == "" { + continue + } + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: nsName}} + if err := spokeClient.Delete(ctx, ns); err != nil && !kerrs.IsNotFound(err) { + return err + } } } - // self-destruct as late as possible, so that the controller has enough time to patch the Spoke before being garbage collected - defer func() { - err = workClient.WorkV1().AppliedManifestWorks().DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{}) - if err != nil { - logger.Error(err, "failed to finalize agent cleanup") - } - }() - spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { return s == v1beta1.SpokeCleanupFinalizer }) diff --git a/fleetconfig-controller/internal/watch/watch.go b/fleetconfig-controller/internal/watch/watch.go new file mode 100644 index 00000000..a2a06338 --- /dev/null +++ b/fleetconfig-controller/internal/watch/watch.go @@ -0,0 +1,82 @@ +// Package watch contains a generic watcher that implements manager.Runnable +package watch + +import ( + "context" + "time" + + "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ConditionFunc checks if a condition is met +// Returns (conditionMet, error) +type ConditionFunc func(ctx context.Context, c client.Client) (bool, error) + +// HandlerFunc is called when the condition is met +type HandlerFunc func(ctx context.Context, c client.Client) error + +// ResourceWatcher periodically checks a condition and triggers a handler +type ResourceWatcher struct { + client client.Client + log logr.Logger + interval time.Duration + name string + condition ConditionFunc + handler HandlerFunc +} + +// Config for creating a new ResourceWatcher +type Config struct { + Client client.Client + Log logr.Logger + Interval time.Duration + Name string + Condition ConditionFunc + Handler HandlerFunc +} + +// New creates a new ResourceWatcher +func New(cfg Config) *ResourceWatcher { + return &ResourceWatcher{ + client: cfg.Client, + log: cfg.Log, + interval: cfg.Interval, + name: cfg.Name, + condition: cfg.Condition, + handler: cfg.Handler, + } +} + +// Start begins the watch loop +func (w *ResourceWatcher) Start(ctx context.Context) error { + w.log.Info("Starting 
resource watcher", "name", w.name, "watchInterval", w.interval) + ticker := time.NewTicker(w.interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + w.log.Info("Shutting down resource watcher", "name", w.name) + return nil + case <-ticker.C: + if err := w.check(ctx); err != nil { + w.log.Error(err, "Watch check failed", "name", w.name) + } + } + } +} + +func (w *ResourceWatcher) check(ctx context.Context) error { + met, err := w.condition(ctx, w.client) + if err != nil { + return err + } + + if !met { + return nil + } + + w.log.V(1).Info("Condition met, executing handler", "name", w.name) + return w.handler(ctx, w.client) +} From edcdbcdee110cc6c5931fd5d95fc9e61298f7ac7 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 12:45:06 -0700 Subject: [PATCH 54/62] feat: validate watch config and set timeouts on api calls Signed-off-by: Artur Shad Nik --- .../controller/v1beta1/spoke_controller.go | 4 +- .../internal/watch/watch.go | 47 +++++++++++++++++-- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go index a35bde57..b8337b1d 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_controller.go @@ -23,6 +23,7 @@ import ( "reflect" "slices" "strconv" + "time" corev1 "k8s.io/api/core/v1" kerrs "k8s.io/apimachinery/pkg/api/errors" @@ -268,10 +269,11 @@ func (r *SpokeReconciler) SetupWithManagerForSpoke(mgr ctrl.Manager) error { spokeName := os.Getenv(v1beta1.SpokeNameEnvVar) // we know this is set, because the mgr setup would have failed otherwise // set up a watcher that is independent of the reconcile loop, so that we can delete the controller AMW after the Spoke is fully deleted - watcher := watch.New(watch.Config{ + watcher := watch.NewOrDie(watch.Config{ Client: mgr.GetClient(), Log: r.Log.WithName(v1beta1.AgentCleanupWatcherName), Interval: requeue, + Timeout: 10 * time.Second, Name: v1beta1.AgentCleanupWatcherName, Condition: spokeDeletedCondition, Handler: agentSelfDestruct, diff --git a/fleetconfig-controller/internal/watch/watch.go b/fleetconfig-controller/internal/watch/watch.go index a2a06338..22425d7f 100644 --- a/fleetconfig-controller/internal/watch/watch.go +++ b/fleetconfig-controller/internal/watch/watch.go @@ -3,12 +3,16 @@ package watch import ( "context" + "errors" + "fmt" "time" "github.com/go-logr/logr" "sigs.k8s.io/controller-runtime/pkg/client" ) +const defaultTimeout = 10 * time.Second + // ConditionFunc checks if a condition is met // Returns (conditionMet, error) type ConditionFunc func(ctx context.Context, c client.Client) (bool, error) @@ -21,6 +25,7 @@ type ResourceWatcher struct { client client.Client log logr.Logger interval time.Duration + timeout time.Duration name string condition ConditionFunc handler HandlerFunc @@ -31,21 +36,51 @@ type Config struct { Client client.Client Log logr.Logger Interval time.Duration + Timeout time.Duration Name string Condition ConditionFunc Handler HandlerFunc } // New creates a new ResourceWatcher -func New(cfg Config) *ResourceWatcher { +func New(cfg Config) (*ResourceWatcher, error) { + if cfg.Client == nil { + return nil, errors.New("watch.Config.Client must not be nil") + } + if cfg.Log.GetSink() == nil { + return nil, errors.New("watch.Config.Log must not be nil") + } + if cfg.Condition == nil { + return nil, errors.New("watch.Config.Condition 
must not be nil") + } + if cfg.Handler == nil { + return nil, errors.New("watch.Config.Handler must not be nil") + } + if cfg.Interval <= 0 { + return nil, errors.New("watch.Config.Interval must be positive") + } + timeout := cfg.Timeout + if timeout <= 0 { + timeout = defaultTimeout + } + return &ResourceWatcher{ client: cfg.Client, log: cfg.Log, interval: cfg.Interval, + timeout: timeout, name: cfg.Name, condition: cfg.Condition, handler: cfg.Handler, + }, nil +} + +func NewOrDie(cfg Config) *ResourceWatcher { + rw, err := New(cfg) + if err != nil { + panic(fmt.Errorf("failed to create ResourceWatcher due to invalid input: %w", err)) } + return rw } // Start begins the watch loop @@ -68,7 +103,10 @@ func (w *ResourceWatcher) Start(ctx context.Context) error { } func (w *ResourceWatcher) check(ctx context.Context) error { - met, err := w.condition(ctx, w.client) + cCtx, cancel := context.WithTimeout(ctx, w.timeout) + defer cancel() + + met, err := w.condition(cCtx, w.client) if err != nil { return err } @@ -78,5 +116,8 @@ func (w *ResourceWatcher) check(ctx context.Context) error { } w.log.V(1).Info("Condition met, executing handler", "name", w.name) - return w.handler(ctx, w.client) + hCtx, cancel := context.WithTimeout(ctx, w.timeout) + defer cancel() + + return w.handler(hCtx, w.client) } From 0a5c147fa9c36ce28e09b36d95a2e7c4b6bafa3b Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 12:46:55 -0700 Subject: [PATCH 55/62] chore: recover from watch panics Signed-off-by: Artur Shad Nik --- fleetconfig-controller/internal/watch/watch.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fleetconfig-controller/internal/watch/watch.go b/fleetconfig-controller/internal/watch/watch.go index 22425d7f..59ea7b80 100644 --- a/fleetconfig-controller/internal/watch/watch.go +++ b/fleetconfig-controller/internal/watch/watch.go @@ -95,9 +95,16 @@ func (w *ResourceWatcher) Start(ctx context.Context) error { w.log.Info("Shutting down resource watcher", "name", w.name) return nil case <-ticker.C: - if err := w.check(ctx); err != nil { - w.log.Error(err, "Watch check failed", "name", w.name) - } + func() { + defer func() { + if r := recover(); r != nil { + w.log.Error(fmt.Errorf("panic: %v", r), "Watch check panicked", "name", w.name) + } + }() + if err := w.check(ctx); err != nil { + w.log.Error(err, "Watch check failed", "name", w.name) + } + }() } } } From d28fd659fc0a6d001b63ac67e9974af8fb4f57b9 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 12:53:03 -0700 Subject: [PATCH 56/62] chore: make reviewable Signed-off-by: Artur Shad Nik --- fleetconfig-controller/internal/watch/watch.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fleetconfig-controller/internal/watch/watch.go b/fleetconfig-controller/internal/watch/watch.go index 59ea7b80..56074e15 100644 --- a/fleetconfig-controller/internal/watch/watch.go +++ b/fleetconfig-controller/internal/watch/watch.go @@ -42,7 +42,7 @@ type Config struct { Handler HandlerFunc } -// New creates a new ResourceWatcher +// New creates a new ResourceWatcher. Returns an error if misconfigured. func New(cfg Config) (*ResourceWatcher, error) { if cfg.Client == nil { return nil, errors.New("watch.Config.Client must not be nil") @@ -75,6 +75,7 @@ func New(cfg Config) (*ResourceWatcher, error) { }, nil } +// NewOrDie creates a new ResourceWatcher. Panics if misconfigured. 
func NewOrDie(cfg Config) *ResourceWatcher { rw, err := New(cfg) if err != nil { From 891565de47975db7a24cbdc0b13dc3db7b78da32 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 13:41:00 -0700 Subject: [PATCH 57/62] feat: redact sensitive data in logs Signed-off-by: Artur Shad Nik --- fleetconfig-controller/internal/args/args.go | 40 +++++++++++ .../internal/args/args_test.go | 72 +++++++++++++++++++ .../internal/controller/v1alpha1/addon.go | 10 +-- .../internal/controller/v1alpha1/hub.go | 12 ++-- .../internal/controller/v1alpha1/spoke.go | 24 +++---- .../internal/controller/v1beta1/addon.go | 10 +-- .../controller/v1beta1/hub_controller.go | 11 +-- .../controller/v1beta1/spoke_handler.go | 24 +++---- 8 files changed, 160 insertions(+), 43 deletions(-) diff --git a/fleetconfig-controller/internal/args/args.go b/fleetconfig-controller/internal/args/args.go index 803208a6..a13d5446 100644 --- a/fleetconfig-controller/internal/args/args.go +++ b/fleetconfig-controller/internal/args/args.go @@ -4,12 +4,24 @@ package args import ( "context" "reflect" + "slices" + "strings" "sigs.k8s.io/controller-runtime/pkg/log" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" ) +const Redacted = "REDACTED" + +var sensitiveKeys = []string{ + "token", + "join-token", + "jointoken", + "hub-token", + "hubtoken", +} + // PrepareKubeconfig parses a kubeconfig spec and returns updated clusteradm args. // The '--kubeconfig' flag is added and a cleanup function is returned to remove the temp kubeconfig file. func PrepareKubeconfig(ctx context.Context, rawKubeconfig []byte, context string, args []string) ([]string, func(), error) { @@ -65,3 +77,31 @@ type ResourceSpec interface { type ResourceValues interface { String() string } + +// SanitizeArgs redacts sensitive args to prevent leaking credentials +func SanitizeArgs(args []string) []string { + if len(args) == 0 { + return []string{} + } + + // Start with a copy of all args + out := make([]string, len(args)) + copy(out, args) + + // Iterate through and redact values following sensitive keys + for i := 0; i < len(args); i++ { + if isSensitiveKey(args[i]) { + // Check if there's a next element to redact + if i+1 < len(args) { + out[i+1] = Redacted + i++ // Skip the next element since we just redacted it + } + } + } + return out +} + +func isSensitiveKey(key string) bool { + norm := strings.TrimPrefix(strings.ToLower(strings.TrimSpace(key)), "--") + return slices.Contains(sensitiveKeys, norm) +} diff --git a/fleetconfig-controller/internal/args/args_test.go b/fleetconfig-controller/internal/args/args_test.go index ffc0a2a5..b88b32a4 100644 --- a/fleetconfig-controller/internal/args/args_test.go +++ b/fleetconfig-controller/internal/args/args_test.go @@ -218,3 +218,75 @@ func TestMockResourceValues_String(t *testing.T) { }) } } + +func TestSanitizeArgs(t *testing.T) { + tests := []struct { + name string + args []string + expected []string + }{ + { + name: "empty args", + args: []string{}, + expected: []string{}, + }, + { + name: "no sensitive args", + args: []string{"init", "--hub-name", "my-hub", "--cluster-name", "spoke1"}, + expected: []string{"init", "--hub-name", "my-hub", "--cluster-name", "spoke1"}, + }, + { + name: "with token flag", + args: []string{"join", "--token", "secret-token-value", "--hub-name", "my-hub"}, + expected: []string{"join", "--token", Redacted, "--hub-name", "my-hub"}, + }, + { + name: "with joinToken flag", + args: []string{"join", "--joinToken", "secret-join-token", "--hub-name", "my-hub"}, 
+ expected: []string{"join", "--joinToken", Redacted, "--hub-name", "my-hub"}, + }, + { + name: "case insensitive token", + args: []string{"join", "--TOKEN", "secret-value", "--hub-name", "my-hub"}, + expected: []string{"join", "--TOKEN", Redacted, "--hub-name", "my-hub"}, + }, + { + name: "case insensitive joinToken", + args: []string{"join", "--JoinToken", "secret-value", "--hub-name", "my-hub"}, + expected: []string{"join", "--JoinToken", Redacted, "--hub-name", "my-hub"}, + }, + { + name: "multiple sensitive flags", + args: []string{"join", "--token", "secret1", "--hub-name", "my-hub", "--hub-token", "secret2"}, + expected: []string{"join", "--token", Redacted, "--hub-name", "my-hub", "--hub-token", Redacted}, + }, + { + name: "sensitive flag at end with value", + args: []string{"join", "--hub-name", "my-hub", "--token", "secret-token"}, + expected: []string{"join", "--hub-name", "my-hub", "--token", Redacted}, + }, + { + name: "token value contains sensitive keyword", + args: []string{"join", "--message", "token-message", "--hub-name", "my-hub"}, + expected: []string{"join", "--message", "token-message", "--hub-name", "my-hub"}, + }, + { + name: "consecutive sensitive flags", + args: []string{"join", "--token", "secret1", "--joinToken", "secret2", "--hub-name", "my-hub"}, + expected: []string{"join", "--token", Redacted, "--joinToken", Redacted, "--hub-name", "my-hub"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := SanitizeArgs(tt.args) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("SanitizeArgs() = %v, want %v", result, tt.expected) + } + // Verify the output has the same length as input + if len(result) != len(tt.args) { + t.Errorf("SanitizeArgs() output length = %d, want %d", len(result), len(tt.args)) + } + }) + } +} diff --git a/fleetconfig-controller/internal/controller/v1alpha1/addon.go b/fleetconfig-controller/internal/controller/v1alpha1/addon.go index e6392bf8..2e18793a 100644 --- a/fleetconfig-controller/internal/controller/v1alpha1/addon.go +++ b/fleetconfig-controller/internal/controller/v1alpha1/addon.go @@ -19,6 +19,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1alpha1" + arg_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" exec_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/exec" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" ) @@ -159,7 +160,7 @@ func handleAddonCreate(ctx context.Context, kClient client.Client, fc *v1alpha1. args = append(args, fmt.Sprintf("--cluster-role-bind=%s", a.ClusterRoleBinding)) } - logger.V(7).Info("running", "command", clusteradm, "args", args) + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm addon create' to complete...") if err != nil { @@ -346,7 +347,7 @@ func handleAddonEnable(ctx context.Context, spokeName string, addons []v1alpha1. args = append(args, fmt.Sprintf("--annotate=%s", annot)) args = append(baseArgs, args...) - logger.V(7).Info("running", "command", clusteradm, "args", args) + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) 
stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm addon enable' to complete...") if err != nil { @@ -379,7 +380,7 @@ func handleAddonDisable(ctx context.Context, spokeName string, addons []string, fmt.Sprintf("--clusters=%s", spokeName), }, fc.BaseArgs()...) - logger.V(7).Info("running", "command", clusteradm, "args", args) + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm addon disable' to complete...") if err != nil { @@ -488,7 +489,7 @@ func handleHubAddonUninstall(ctx context.Context, addons []v1alpha1.InstalledHub args = append(args, fmt.Sprintf("--namespace=%s", addon.Namespace)) } - logger.V(7).Info("running", "command", clusteradm, "args", args) + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm uninstall hub-addon' to complete...") if err != nil { @@ -538,6 +539,7 @@ func handleHubAddonInstall(ctx context.Context, addonC *addonapi.Clientset, addo args = append(args, fmt.Sprintf("--namespace=%s", addon.InstallNamespace)) } + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm install hub-addon' to complete...") if err != nil { diff --git a/fleetconfig-controller/internal/controller/v1alpha1/hub.go b/fleetconfig-controller/internal/controller/v1alpha1/hub.go index 6680c5a2..9d7ed012 100644 --- a/fleetconfig-controller/internal/controller/v1alpha1/hub.go +++ b/fleetconfig-controller/internal/controller/v1alpha1/hub.go @@ -21,7 +21,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1alpha1" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" + arg_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" exec_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/exec" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/kube" @@ -182,13 +182,13 @@ func initializeHub(ctx context.Context, fc *v1alpha1.FleetConfig, hubKubeconfig initArgs = append(initArgs, "--bundle-version", fc.Spec.Hub.ClusterManager.Source.BundleVersion) initArgs = append(initArgs, "--image-registry", fc.Spec.Hub.ClusterManager.Source.Registry) // resources args - initArgs = append(initArgs, args.PrepareResources(fc.Spec.Hub.ClusterManager.Resources)...) + initArgs = append(initArgs, arg_utils.PrepareResources(fc.Spec.Hub.ClusterManager.Resources)...) 
} else { // one of clusterManager or singletonControlPlane must be specified, per validating webhook, but handle the edge case anyway return fmt.Errorf("unknown hub type, must specify either hub.clusterManager or hub.singletonControlPlane") } - initArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, hubKubeconfig, fc.Spec.Hub.Kubeconfig.Context, initArgs) + initArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, hubKubeconfig, fc.Spec.Hub.Kubeconfig.Context, initArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -196,7 +196,7 @@ func initializeHub(ctx context.Context, fc *v1alpha1.FleetConfig, hubKubeconfig return err } - logger.V(1).Info("clusteradm init", "args", initArgs) + logger.V(1).Info("clusteradm init", "args", arg_utils.SanitizeArgs(initArgs)) cmd := exec.Command(clusteradm, initArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm init' to complete...") @@ -282,7 +282,7 @@ func upgradeHub(ctx context.Context, fc *v1alpha1.FleetConfig) error { "--wait=true", }, fc.BaseArgs()...) - logger.V(1).Info("clusteradm upgrade clustermanager", "args", upgradeArgs) + logger.V(1).Info("clusteradm upgrade clustermanager", "args", arg_utils.SanitizeArgs(upgradeArgs)) cmd := exec.Command(clusteradm, upgradeArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm upgrade clustermanager' to complete...") @@ -325,7 +325,7 @@ func cleanHub(ctx context.Context, kClient client.Client, hubKubeconfig []byte, fmt.Sprintf("--purge-operator=%t", fc.Spec.Hub.ClusterManager.PurgeOperator), }, fc.BaseArgs()...) - logger.V(1).Info("clusteradm clean", "args", cleanArgs) + logger.V(1).Info("clusteradm clean", "args", arg_utils.SanitizeArgs(cleanArgs)) cmd := exec.Command(clusteradm, cleanArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm clean' to complete...") diff --git a/fleetconfig-controller/internal/controller/v1alpha1/spoke.go b/fleetconfig-controller/internal/controller/v1alpha1/spoke.go index acf09d9f..8a5e7054 100644 --- a/fleetconfig-controller/internal/controller/v1alpha1/spoke.go +++ b/fleetconfig-controller/internal/controller/v1alpha1/spoke.go @@ -25,7 +25,7 @@ import ( "sigs.k8s.io/yaml" "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1alpha1" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" + arg_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" exec_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/exec" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/hash" @@ -240,7 +240,7 @@ func acceptCluster(ctx context.Context, fc *v1alpha1.FleetConfig, name string, s "accept", "--cluster", name, }, fc.BaseArgs()...) 
- logger.V(1).Info("clusteradm accept", "args", acceptArgs) + logger.V(1).Info("clusteradm accept", "args", arg_utils.SanitizeArgs(acceptArgs)) // TODO: handle other args: // --requesters=[]: @@ -278,7 +278,7 @@ func getToken(ctx context.Context, fc *v1alpha1.FleetConfig, hubKubeconfig []byt if fc.Spec.Hub.ClusterManager != nil { tokenArgs = append(tokenArgs, fmt.Sprintf("--use-bootstrap-token=%t", fc.Spec.Hub.ClusterManager.UseBootstrapToken)) } - tokenArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, hubKubeconfig, fc.Spec.Hub.Kubeconfig.Context, tokenArgs) + tokenArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, hubKubeconfig, fc.Spec.Hub.Kubeconfig.Context, tokenArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -286,7 +286,7 @@ func getToken(ctx context.Context, fc *v1alpha1.FleetConfig, hubKubeconfig []byt return nil, fmt.Errorf("failed to prepare kubeconfig: %w", err) } - logger.V(1).Info("clusteradm get token", "args", tokenArgs) + logger.V(1).Info("clusteradm get token", "args", arg_utils.SanitizeArgs(tokenArgs)) cmd := exec.Command(clusteradm, tokenArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm get token' to complete...") @@ -294,7 +294,7 @@ func getToken(ctx context.Context, fc *v1alpha1.FleetConfig, hubKubeconfig []byt out := append(stdout, stderr...) return nil, fmt.Errorf("failed to get join token: %v, output: %s", err, string(out)) } - logger.V(1).Info("got join token", "output", string(stdout)) + logger.V(1).Info("got join token", "output", arg_utils.Redacted) // TODO - SANITIZE tokenMeta := &tokenMeta{} if err := json.Unmarshal(stdout, &tokenMeta); err != nil { @@ -330,7 +330,7 @@ func joinSpoke(ctx context.Context, kClient client.Client, fc *v1alpha1.FleetCon } // resources args - joinArgs = append(joinArgs, args.PrepareResources(spoke.Klusterlet.Resources)...) + joinArgs = append(joinArgs, arg_utils.PrepareResources(spoke.Klusterlet.Resources)...) // Use hub API server from spec if provided and not forced to use internal endpoint, // otherwise fall back to the hub API server from the tokenMeta @@ -410,7 +410,7 @@ func joinSpoke(ctx context.Context, kClient client.Client, fc *v1alpha1.FleetCon if err != nil { return err } - joinArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, kubeconfig, spoke.Kubeconfig.Context, joinArgs) + joinArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, kubeconfig, spoke.Kubeconfig.Context, joinArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -418,7 +418,7 @@ func joinSpoke(ctx context.Context, kClient client.Client, fc *v1alpha1.FleetCon return err } - logger.V(1).Info("clusteradm join", "args", joinArgs) + logger.V(1).Info("clusteradm join", "args", arg_utils.SanitizeArgs(joinArgs)) cmd := exec.Command(clusteradm, joinArgs...) 
stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm join' to complete for spoke %s...", spoke.Name)) @@ -527,7 +527,7 @@ func upgradeSpoke(ctx context.Context, kClient client.Client, fc *v1alpha1.Fleet if err != nil { return err } - upgradeArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, kubeconfig, spoke.Kubeconfig.Context, upgradeArgs) + upgradeArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, kubeconfig, spoke.Kubeconfig.Context, upgradeArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -535,7 +535,7 @@ func upgradeSpoke(ctx context.Context, kClient client.Client, fc *v1alpha1.Fleet return err } - logger.V(1).Info("clusteradm upgrade klusterlet", "args", upgradeArgs) + logger.V(1).Info("clusteradm upgrade klusterlet", "args", arg_utils.SanitizeArgs(upgradeArgs)) cmd := exec.Command(clusteradm, upgradeArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm upgrade klusterlet' to complete for spoke %s...", spoke.Name)) @@ -587,7 +587,7 @@ func unjoinSpoke(ctx context.Context, kClient client.Client, fc *v1alpha1.FleetC if err != nil { return err } - unjoinArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, kubeconfig, spoke.GetKubeconfig().Context, unjoinArgs) + unjoinArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, kubeconfig, spoke.GetKubeconfig().Context, unjoinArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -595,7 +595,7 @@ func unjoinSpoke(ctx context.Context, kClient client.Client, fc *v1alpha1.FleetC return fmt.Errorf("failed to unjoin spoke cluster %s: %w", spoke.GetName(), err) } - logger.V(1).Info("clusteradm unjoin", "args", unjoinArgs) + logger.V(1).Info("clusteradm unjoin", "args", arg_utils.SanitizeArgs(unjoinArgs)) cmd := exec.Command(clusteradm, unjoinArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm unjoin' to complete for spoke %s...", spoke.GetName())) diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 106cad4c..6da1ce30 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" + arg_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" exec_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/exec" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" ) @@ -188,7 +189,7 @@ func handleAddonCreate(ctx context.Context, kClient client.Client, hub *v1beta1. args = append(args, fmt.Sprintf("--cluster-role-bind=%s", a.ClusterRoleBinding)) } - logger.V(7).Info("running", "command", clusteradm, "args", args) + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm addon create' to complete...") if err != nil { @@ -378,7 +379,7 @@ func handleAddonEnable(ctx context.Context, spoke *v1beta1.Spoke, addons []v1bet } args = append(baseArgs, args...) - logger.V(7).Info("running", "command", clusteradm, "args", args) + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) 
stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm addon enable' to complete...") if err != nil { @@ -462,7 +463,7 @@ func handleAddonDisable(ctx context.Context, spoke *v1beta1.Spoke, enabledAddons fmt.Sprintf("--clusters=%s", spoke.Name), }, spoke.BaseArgs()...) - logger.V(7).Info("running", "command", clusteradm, "args", args) + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm addon disable' to complete...") if err != nil { @@ -593,7 +594,7 @@ func handleHubAddonUninstall(ctx context.Context, addons []v1beta1.InstalledHubA args = append(args, fmt.Sprintf("--namespace=%s", addon.Namespace)) } - logger.V(7).Info("running", "command", clusteradm, "args", args) + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm uninstall hub-addon' to complete...") if err != nil { @@ -643,6 +644,7 @@ func handleHubAddonInstall(ctx context.Context, addonC *addonapi.Clientset, addo args = append(args, fmt.Sprintf("--namespace=%s", addon.InstallNamespace)) } + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(args)) cmd := exec.Command(clusteradm, args...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm install hub-addon' to complete...") if err != nil { diff --git a/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go b/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go index d23d09f4..00aa93c6 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go @@ -43,7 +43,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" + arg_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" exec_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/exec" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/kube" @@ -248,6 +248,7 @@ func (r *HubReconciler) cleanHub(ctx context.Context, hub *v1beta1.Hub, hubKubec } cleanArgs = append(cleanArgs, hub.BaseArgs()...) + logger.V(7).Info("running", "command", clusteradm, "args", arg_utils.SanitizeArgs(cleanArgs)) cmd := exec.Command(clusteradm, cleanArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm clean' to complete...") if err != nil { @@ -421,13 +422,13 @@ func (r *HubReconciler) initializeHub(ctx context.Context, hub *v1beta1.Hub, hub initArgs = append(initArgs, "--bundle-version", hub.Spec.ClusterManager.Source.BundleVersion) initArgs = append(initArgs, "--image-registry", hub.Spec.ClusterManager.Source.Registry) // resources args - initArgs = append(initArgs, args.PrepareResources(hub.Spec.ClusterManager.Resources)...) + initArgs = append(initArgs, arg_utils.PrepareResources(hub.Spec.ClusterManager.Resources)...) 
} else { // one of clusterManager or singletonControlPlane must be specified, per validating webhook, but handle the edge case anyway return fmt.Errorf("unknown hub type, must specify either hub.clusterManager or hub.singletonControlPlane") } - initArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, hubKubeconfig, hub.Spec.Kubeconfig.Context, initArgs) + initArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, hubKubeconfig, hub.Spec.Kubeconfig.Context, initArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -435,7 +436,7 @@ func (r *HubReconciler) initializeHub(ctx context.Context, hub *v1beta1.Hub, hub return err } - logger.V(1).Info("clusteradm init", "args", initArgs) + logger.V(1).Info("clusteradm init", "args", arg_utils.SanitizeArgs(initArgs)) cmd := exec.Command(clusteradm, initArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm init' to complete...") @@ -509,7 +510,7 @@ func (r *HubReconciler) upgradeHub(ctx context.Context, hub *v1beta1.Hub) error "--wait=true", }, hub.BaseArgs()...) - logger.V(1).Info("clusteradm upgrade clustermanager", "args", upgradeArgs) + logger.V(1).Info("clusteradm upgrade clustermanager", "args", arg_utils.SanitizeArgs(upgradeArgs)) cmd := exec.Command(clusteradm, upgradeArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm upgrade clustermanager' to complete...") diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index cb695c88..2507a055 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -26,7 +26,7 @@ import ( "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1alpha1" "github.com/open-cluster-management-io/lab/fleetconfig-controller/api/v1beta1" - "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" + arg_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args" exec_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/exec" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/hash" @@ -631,7 +631,7 @@ func (r *SpokeReconciler) joinSpoke(ctx context.Context, spoke *v1beta1.Spoke, h } // resources args - joinArgs = append(joinArgs, args.PrepareResources(spoke.Spec.Klusterlet.Resources)...) + joinArgs = append(joinArgs, arg_utils.PrepareResources(spoke.Spec.Klusterlet.Resources)...) // Use hub API server from spec if provided and not forced to use internal endpoint, // otherwise fall back to the hub API server from the tokenMeta @@ -708,7 +708,7 @@ func (r *SpokeReconciler) joinSpoke(ctx context.Context, spoke *v1beta1.Spoke, h } joinArgs = append(joinArgs, valuesArgs...) - joinArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, joinArgs) + joinArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, joinArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -716,7 +716,7 @@ func (r *SpokeReconciler) joinSpoke(ctx context.Context, spoke *v1beta1.Spoke, h return err } - logger.V(1).Info("clusteradm join", "args", joinArgs) + logger.V(1).Info("clusteradm join", "args", arg_utils.SanitizeArgs(joinArgs)) cmd := exec.Command(clusteradm, joinArgs...) 
stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm join' to complete for spoke %s...", spoke.Name)) @@ -738,7 +738,7 @@ func acceptCluster(ctx context.Context, spoke *v1beta1.Spoke, skipApproveCheck b "accept", "--cluster", spoke.Name, }, spoke.BaseArgs()...) - logger.V(1).Info("clusteradm accept", "args", acceptArgs) + logger.V(1).Info("clusteradm accept", "args", arg_utils.SanitizeArgs(acceptArgs)) // TODO: handle other args: // --requesters=[]: @@ -858,7 +858,7 @@ func (r *SpokeReconciler) upgradeSpoke(ctx context.Context, spoke *v1beta1.Spoke } upgradeArgs = append(upgradeArgs, valuesArgs...) - upgradeArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, upgradeArgs) + upgradeArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, upgradeArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -866,7 +866,7 @@ func (r *SpokeReconciler) upgradeSpoke(ctx context.Context, spoke *v1beta1.Spoke return err } - logger.V(1).Info("clusteradm upgrade klusterlet", "args", upgradeArgs) + logger.V(1).Info("clusteradm upgrade klusterlet", "args", arg_utils.SanitizeArgs(upgradeArgs)) cmd := exec.Command(clusteradm, upgradeArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm upgrade klusterlet' to complete for spoke %s...", spoke.Name)) @@ -893,7 +893,7 @@ func (r *SpokeReconciler) unjoinSpoke(ctx context.Context, spoke *v1beta1.Spoke, fmt.Sprintf("--purge-operator=%t", spoke.Spec.CleanupConfig.PurgeKlusterletOperator), }, spoke.BaseArgs()...) - unjoinArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, unjoinArgs) + unjoinArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, spokeKubeconfig, spoke.Spec.Kubeconfig.Context, unjoinArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -901,7 +901,7 @@ func (r *SpokeReconciler) unjoinSpoke(ctx context.Context, spoke *v1beta1.Spoke, return fmt.Errorf("failed to unjoin spoke cluster %s: %w", spoke.GetName(), err) } - logger.V(1).Info("clusteradm unjoin", "args", unjoinArgs) + logger.V(1).Info("clusteradm unjoin", "args", arg_utils.SanitizeArgs(unjoinArgs)) cmd := exec.Command(clusteradm, unjoinArgs...) stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, fmt.Sprintf("waiting for 'clusteradm unjoin' to complete for spoke %s...", spoke.GetName())) @@ -926,7 +926,7 @@ func getToken(ctx context.Context, hubMeta hubMeta) (*tokenMeta, error) { if hubMeta.hub.Spec.ClusterManager != nil { tokenArgs = append(tokenArgs, fmt.Sprintf("--use-bootstrap-token=%t", hubMeta.hub.Spec.ClusterManager.UseBootstrapToken)) } - tokenArgs, cleanupKcfg, err := args.PrepareKubeconfig(ctx, hubMeta.kubeconfig, hubMeta.hub.Spec.Kubeconfig.Context, tokenArgs) + tokenArgs, cleanupKcfg, err := arg_utils.PrepareKubeconfig(ctx, hubMeta.kubeconfig, hubMeta.hub.Spec.Kubeconfig.Context, tokenArgs) if cleanupKcfg != nil { defer cleanupKcfg() } @@ -934,7 +934,7 @@ func getToken(ctx context.Context, hubMeta hubMeta) (*tokenMeta, error) { return nil, fmt.Errorf("failed to prepare kubeconfig: %w", err) } - logger.V(1).Info("clusteradm get token", "args", tokenArgs) + logger.V(1).Info("clusteradm get token", "args", arg_utils.SanitizeArgs(tokenArgs)) cmd := exec.Command(clusteradm, tokenArgs...) 
stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm get token' to complete...") @@ -942,7 +942,7 @@ func getToken(ctx context.Context, hubMeta hubMeta) (*tokenMeta, error) { out := append(stdout, stderr...) return nil, fmt.Errorf("failed to get join token: %v, output: %s", err, string(out)) } - logger.V(1).Info("got join token", "output", string(stdout)) + logger.V(1).Info("got join token", "output", arg_utils.Redacted) tokenMeta := &tokenMeta{} if err := json.Unmarshal(stdout, &tokenMeta); err != nil { From c0a2e148e7207d01203624c1eb00b9e91a499bb8 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 13:50:52 -0700 Subject: [PATCH 58/62] feat: sanitize output Signed-off-by: Artur Shad Nik --- fleetconfig-controller/internal/args/args.go | 34 ++++++-- .../internal/args/args_test.go | 84 ++++++++++++++++--- .../internal/controller/v1alpha1/hub.go | 2 +- .../internal/controller/v1alpha1/spoke.go | 2 +- .../controller/v1beta1/hub_controller.go | 2 +- .../controller/v1beta1/spoke_handler.go | 2 +- 6 files changed, 102 insertions(+), 24 deletions(-) diff --git a/fleetconfig-controller/internal/args/args.go b/fleetconfig-controller/internal/args/args.go index a13d5446..f1ce11c6 100644 --- a/fleetconfig-controller/internal/args/args.go +++ b/fleetconfig-controller/internal/args/args.go @@ -15,7 +15,6 @@ import ( const Redacted = "REDACTED" var sensitiveKeys = []string{ - "token", "join-token", "jointoken", "hub-token", @@ -80,19 +79,40 @@ type ResourceValues interface { // SanitizeArgs redacts sensitive args to prevent leaking credentials func SanitizeArgs(args []string) []string { - if len(args) == 0 { + return sanitizeSlice(args) +} + +// SanitizeOutput redacts sensitive values from command output +func SanitizeOutput(output []byte) []byte { + if len(output) == 0 { + return output + } + + // Convert bytes to string and split into words + text := string(output) + words := strings.Fields(text) + + // Sanitize the words + sanitized := sanitizeSlice(words) + + // Join back and convert to bytes + return []byte(strings.Join(sanitized, " ")) +} + +func sanitizeSlice(words []string) []string { + if len(words) == 0 { return []string{} } // Start with a copy of all args - out := make([]string, len(args)) - copy(out, args) + out := make([]string, len(words)) + copy(out, words) // Iterate through and redact values following sensitive keys - for i := 0; i < len(args); i++ { - if isSensitiveKey(args[i]) { + for i := 0; i < len(words); i++ { + if isSensitiveKey(words[i]) { // Check if there's a next element to redact - if i+1 < len(args) { + if i+1 < len(words) { out[i+1] = Redacted i++ // Skip the next element since we just redacted it } diff --git a/fleetconfig-controller/internal/args/args_test.go b/fleetconfig-controller/internal/args/args_test.go index b88b32a4..e8a9005c 100644 --- a/fleetconfig-controller/internal/args/args_test.go +++ b/fleetconfig-controller/internal/args/args_test.go @@ -237,19 +237,14 @@ func TestSanitizeArgs(t *testing.T) { }, { name: "with token flag", - args: []string{"join", "--token", "secret-token-value", "--hub-name", "my-hub"}, - expected: []string{"join", "--token", Redacted, "--hub-name", "my-hub"}, + args: []string{"join", "--hub-token", "secret-token-value", "--hub-name", "my-hub"}, + expected: []string{"join", "--hub-token", Redacted, "--hub-name", "my-hub"}, }, { name: "with joinToken flag", args: []string{"join", "--joinToken", "secret-join-token", "--hub-name", "my-hub"}, expected: []string{"join", "--joinToken", Redacted, 
"--hub-name", "my-hub"}, }, - { - name: "case insensitive token", - args: []string{"join", "--TOKEN", "secret-value", "--hub-name", "my-hub"}, - expected: []string{"join", "--TOKEN", Redacted, "--hub-name", "my-hub"}, - }, { name: "case insensitive joinToken", args: []string{"join", "--JoinToken", "secret-value", "--hub-name", "my-hub"}, @@ -257,13 +252,13 @@ func TestSanitizeArgs(t *testing.T) { }, { name: "multiple sensitive flags", - args: []string{"join", "--token", "secret1", "--hub-name", "my-hub", "--hub-token", "secret2"}, - expected: []string{"join", "--token", Redacted, "--hub-name", "my-hub", "--hub-token", Redacted}, + args: []string{"join", "--hub-token", "secret1", "--hub-name", "my-hub", "--hub-token", "secret2"}, + expected: []string{"join", "--hub-token", Redacted, "--hub-name", "my-hub", "--hub-token", Redacted}, }, { name: "sensitive flag at end with value", - args: []string{"join", "--hub-name", "my-hub", "--token", "secret-token"}, - expected: []string{"join", "--hub-name", "my-hub", "--token", Redacted}, + args: []string{"join", "--hub-name", "my-hub", "--hub-token", "secret-token"}, + expected: []string{"join", "--hub-name", "my-hub", "--hub-token", Redacted}, }, { name: "token value contains sensitive keyword", @@ -272,8 +267,8 @@ func TestSanitizeArgs(t *testing.T) { }, { name: "consecutive sensitive flags", - args: []string{"join", "--token", "secret1", "--joinToken", "secret2", "--hub-name", "my-hub"}, - expected: []string{"join", "--token", Redacted, "--joinToken", Redacted, "--hub-name", "my-hub"}, + args: []string{"join", "--hub-token", "secret1", "--joinToken", "secret2", "--hub-name", "my-hub"}, + expected: []string{"join", "--hub-token", Redacted, "--joinToken", Redacted, "--hub-name", "my-hub"}, }, } @@ -290,3 +285,66 @@ func TestSanitizeArgs(t *testing.T) { }) } } + +func TestSanitizeOutput(t *testing.T) { + tests := []struct { + name string + output []byte + expected []byte + }{ + { + name: "empty output", + output: []byte{}, + expected: []byte{}, + }, + { + name: "no sensitive data", + output: []byte("Successfully initialized cluster my-hub"), + expected: []byte("Successfully initialized cluster my-hub"), + }, + { + name: "output with token flag and value", + output: []byte("clusteradm join --hub-token abc123secret --hub-name my-hub"), + expected: []byte("clusteradm join --hub-token REDACTED --hub-name my-hub"), + }, + { + name: "output with hub-token", + output: []byte("Using --hub-token xyz789secret for authentication"), + expected: []byte("Using --hub-token REDACTED for authentication"), + }, + { + name: "multiline output with token", + output: []byte("Connecting to hub...\nUsing --hub-token mysecret123\nSuccess!"), + expected: []byte("Connecting to hub... 
Using --hub-token REDACTED Success!"), + }, + { + name: "output with multiple sensitive values", + output: []byte("join --hub-token secret1 --hub-token secret2 --cluster-name spoke1"), + expected: []byte("join --hub-token REDACTED --hub-token REDACTED --cluster-name spoke1"), + }, + { + name: "output with jointoken variations", + output: []byte("Using --jointoken abc123 or --join-token def456"), + expected: []byte("Using --jointoken REDACTED or --join-token REDACTED"), + }, + { + name: "token word that is not a flag", + output: []byte("The token was successfully generated"), + expected: []byte("The token was successfully generated"), + }, + { + name: "output with extra whitespace", + output: []byte(" join --hub-token secret123 --hub-name test "), + expected: []byte("join --hub-token REDACTED --hub-name test"), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := SanitizeOutput(tt.output) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("SanitizeOutput() = %q, want %q", string(result), string(tt.expected)) + } + }) + } +} diff --git a/fleetconfig-controller/internal/controller/v1alpha1/hub.go b/fleetconfig-controller/internal/controller/v1alpha1/hub.go index 9d7ed012..6f02f232 100644 --- a/fleetconfig-controller/internal/controller/v1alpha1/hub.go +++ b/fleetconfig-controller/internal/controller/v1alpha1/hub.go @@ -204,7 +204,7 @@ func initializeHub(ctx context.Context, fc *v1alpha1.FleetConfig, hubKubeconfig out := append(stdout, stderr...) return fmt.Errorf("failed to init hub: %v, output: %s", err, string(out)) } - logger.V(1).Info("hub initialized", "output", string(stdout)) + logger.V(1).Info("hub initialized", "output", string(arg_utils.SanitizeOutput(stdout))) return nil } diff --git a/fleetconfig-controller/internal/controller/v1alpha1/spoke.go b/fleetconfig-controller/internal/controller/v1alpha1/spoke.go index 8a5e7054..f12f8c6a 100644 --- a/fleetconfig-controller/internal/controller/v1alpha1/spoke.go +++ b/fleetconfig-controller/internal/controller/v1alpha1/spoke.go @@ -294,7 +294,7 @@ func getToken(ctx context.Context, fc *v1alpha1.FleetConfig, hubKubeconfig []byt out := append(stdout, stderr...) return nil, fmt.Errorf("failed to get join token: %v, output: %s", err, string(out)) } - logger.V(1).Info("got join token", "output", arg_utils.Redacted) // TODO - SANITIZE + logger.V(1).Info("got join token", "output", arg_utils.SanitizeOutput(stdout)) tokenMeta := &tokenMeta{} if err := json.Unmarshal(stdout, &tokenMeta); err != nil { diff --git a/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go b/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go index 00aa93c6..df62e3e3 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go +++ b/fleetconfig-controller/internal/controller/v1beta1/hub_controller.go @@ -444,7 +444,7 @@ func (r *HubReconciler) initializeHub(ctx context.Context, hub *v1beta1.Hub, hub out := append(stdout, stderr...) 
return fmt.Errorf("failed to init hub: %v, output: %s", err, string(out)) } - logger.V(1).Info("hub initialized", "output", string(stdout)) + logger.V(1).Info("hub initialized", "output", string(arg_utils.SanitizeOutput(stdout))) return nil } diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 2507a055..3705f747 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -942,7 +942,7 @@ func getToken(ctx context.Context, hubMeta hubMeta) (*tokenMeta, error) { out := append(stdout, stderr...) return nil, fmt.Errorf("failed to get join token: %v, output: %s", err, string(out)) } - logger.V(1).Info("got join token", "output", arg_utils.Redacted) + logger.V(1).Info("got join token", "output", arg_utils.SanitizeOutput(stdout)) tokenMeta := &tokenMeta{} if err := json.Unmarshal(stdout, &tokenMeta); err != nil { From 9714339c21cbdcd1cc96969f9a3501274a1dc334 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 14:35:39 -0700 Subject: [PATCH 59/62] chore: revert to naive output redaction Signed-off-by: Artur Shad Nik --- fleetconfig-controller/internal/args/args.go | 1 + fleetconfig-controller/internal/controller/v1alpha1/spoke.go | 2 +- .../internal/controller/v1beta1/spoke_handler.go | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fleetconfig-controller/internal/args/args.go b/fleetconfig-controller/internal/args/args.go index f1ce11c6..34d5f201 100644 --- a/fleetconfig-controller/internal/args/args.go +++ b/fleetconfig-controller/internal/args/args.go @@ -12,6 +12,7 @@ import ( "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/file" ) +// Redacted is used to mask sensitive values const Redacted = "REDACTED" var sensitiveKeys = []string{ diff --git a/fleetconfig-controller/internal/controller/v1alpha1/spoke.go b/fleetconfig-controller/internal/controller/v1alpha1/spoke.go index f12f8c6a..4d04ba3c 100644 --- a/fleetconfig-controller/internal/controller/v1alpha1/spoke.go +++ b/fleetconfig-controller/internal/controller/v1alpha1/spoke.go @@ -294,7 +294,7 @@ func getToken(ctx context.Context, fc *v1alpha1.FleetConfig, hubKubeconfig []byt out := append(stdout, stderr...) return nil, fmt.Errorf("failed to get join token: %v, output: %s", err, string(out)) } - logger.V(1).Info("got join token", "output", arg_utils.SanitizeOutput(stdout)) + logger.V(1).Info("got join token", "output", arg_utils.Redacted) tokenMeta := &tokenMeta{} if err := json.Unmarshal(stdout, &tokenMeta); err != nil { diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 3705f747..2507a055 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -942,7 +942,7 @@ func getToken(ctx context.Context, hubMeta hubMeta) (*tokenMeta, error) { out := append(stdout, stderr...) 
return nil, fmt.Errorf("failed to get join token: %v, output: %s", err, string(out)) } - logger.V(1).Info("got join token", "output", arg_utils.SanitizeOutput(stdout)) + logger.V(1).Info("got join token", "output", arg_utils.Redacted) tokenMeta := &tokenMeta{} if err := json.Unmarshal(stdout, &tokenMeta); err != nil { From 92d2a33676a4d5242d3b945291b563474cf69693 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 14:42:26 -0700 Subject: [PATCH 60/62] chore: fail fast on unexpected MW Signed-off-by: Artur Shad Nik --- .../internal/controller/v1beta1/addon.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/addon.go b/fleetconfig-controller/internal/controller/v1beta1/addon.go index 6da1ce30..ad0fafd5 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/addon.go +++ b/fleetconfig-controller/internal/controller/v1beta1/addon.go @@ -706,10 +706,11 @@ func waitForAddonManifestWorksCleanup(ctx context.Context, workC *workapi.Client } mw := manifestWorks.Items[0] val, ok := mw.Labels[addonv1alpha1.AddonLabelKey] - if ok && val == v1beta1.FCCAddOnName { - logger.V(1).Info("addon manifestWorks cleanup completed", "spokeName", spokeName, "remainingManifestWork", mw.Name) - return true, nil + if !ok || val != v1beta1.FCCAddOnName { + return false, fmt.Errorf("unexpected remaining ManifestWork: expected %s, got label=%q (ok=%t)", v1beta1.FCCAddOnName, val, ok) } + logger.V(1).Info("addon manifestWorks cleanup completed", "spokeName", spokeName, "remainingManifestWork", mw.Name) + return true, nil } logger.V(3).Info("waiting for addon manifestWorks cleanup", From 9d3898e6207454ed637784c15af636c957bb0c11 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 15:21:58 -0700 Subject: [PATCH 61/62] fix: handle hub-side cleanup properly Signed-off-by: Artur Shad Nik --- .../internal/controller/v1beta1/constants.go | 2 -- .../controller/v1beta1/spoke_handler.go | 21 +++++++++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/constants.go b/fleetconfig-controller/internal/controller/v1beta1/constants.go index 4bf7d146..99abd2a7 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/constants.go +++ b/fleetconfig-controller/internal/controller/v1beta1/constants.go @@ -33,6 +33,4 @@ const ( addonCleanupPollInterval = 2 * time.Second manifestWorkAddOnLabelKey = "open-cluster-management.io/addon-name" - - manifestWorkAddOnLabelValueFcc = "fleetconfig-controller-agent" ) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 2507a055..3a15f133 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -405,7 +405,7 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke // for hub-as-spoke, or if the addon agent never came up, disable all addons // otherwise, leave fleetconfig-controller-agent addon running so that it can do deregistration - shouldCleanAll := spoke.IsHubAsSpoke() || !pivotComplete + shouldCleanAll := spoke.IsHubAsSpoke() || !pivotComplete || r.InstanceType == v1beta1.InstanceTypeUnified if !shouldCleanAll { spokeCopy.Spec.AddOns = append(spokeCopy.Spec.AddOns, v1beta1.AddOn{ConfigName: v1beta1.FCCAddOnName}) @@ -454,8 +454,17 @@ func (r *SpokeReconciler) doHubCleanup(ctx 
context.Context, spoke *v1beta1.Spoke } } + // delete fcc agent addon + spokeCopy.Spec.AddOns = nil + if _, err := handleSpokeAddons(ctx, addonC, spokeCopy); err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.CleanupFailed, metav1.ConditionTrue, metav1.ConditionFalse, + )) + return err + } + // at this point, klusterlet-work-agent is uninstalled, so nothing can remove this finalizer. all resources are cleaned up by the spoke's controller, so to prevent a dangling mw/namespace, we remove the finalizer manually - mwList, err := workC.WorkV1().ManifestWorks(spoke.Name).List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", manifestWorkAddOnLabelKey, manifestWorkAddOnLabelValueFcc)}) + mwList, err := workC.WorkV1().ManifestWorks(spoke.Name).List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", manifestWorkAddOnLabelKey, v1beta1.FCCAddOnName)}) if err != nil { return err } @@ -481,6 +490,14 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke } } + // Wait for all manifestWorks to be cleaned up + if err := waitForAddonManifestWorksCleanup(ctx, workC, spoke.Name, addonCleanupTimeout, true); err != nil { + spoke.SetConditions(true, v1beta1.NewCondition( + err.Error(), v1beta1.CleanupFailed, metav1.ConditionTrue, metav1.ConditionFalse, + )) + return fmt.Errorf("addon manifestWorks cleanup failed: %w", err) + } + // remove ManagedCluster err = clusterC.ClusterV1().ManagedClusters().Delete(ctx, spoke.Name, metav1.DeleteOptions{}) if err != nil && !kerrs.IsNotFound(err) { From 352b2b3fb8460c6adad578eb9591499af8766944 Mon Sep 17 00:00:00 2001 From: Artur Shad Nik Date: Fri, 3 Oct 2025 15:28:05 -0700 Subject: [PATCH 62/62] chore: make reviewable Signed-off-by: Artur Shad Nik --- .../controller/v1beta1/spoke_handler.go | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go index 3a15f133..98661e5b 100644 --- a/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go +++ b/fleetconfig-controller/internal/controller/v1beta1/spoke_handler.go @@ -18,6 +18,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" addonv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" + addonapi "open-cluster-management.io/api/client/addon/clientset/versioned" + workapi "open-cluster-management.io/api/client/work/clientset/versioned" clusterv1 "open-cluster-management.io/api/cluster/v1" operatorv1 "open-cluster-management.io/api/operator/v1" "sigs.k8s.io/controller-runtime/pkg/client" @@ -454,6 +456,31 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke } } + err = r.waitForAgentAddonDeleted(ctx, spoke, spokeCopy, addonC, workC) + if err != nil { + return err + } + + // remove ManagedCluster + err = clusterC.ClusterV1().ManagedClusters().Delete(ctx, spoke.Name, metav1.DeleteOptions{}) + if err != nil && !kerrs.IsNotFound(err) { + return err + } + // remove Namespace + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: spoke.Name}} + err = r.Delete(ctx, ns) + if err != nil && !kerrs.IsNotFound(err) { + return err + } + + spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { + return s == v1beta1.HubCleanupFinalizer + }) + + return nil +} + +func (r *SpokeReconciler) waitForAgentAddonDeleted(ctx context.Context, spoke *v1beta1.Spoke, spokeCopy *v1beta1.Spoke, 
addonC *addonapi.Clientset, workC *workapi.Clientset) error { // delete fcc agent addon spokeCopy.Spec.AddOns = nil if _, err := handleSpokeAddons(ctx, addonC, spokeCopy); err != nil { @@ -497,23 +524,6 @@ func (r *SpokeReconciler) doHubCleanup(ctx context.Context, spoke *v1beta1.Spoke )) return fmt.Errorf("addon manifestWorks cleanup failed: %w", err) } - - // remove ManagedCluster - err = clusterC.ClusterV1().ManagedClusters().Delete(ctx, spoke.Name, metav1.DeleteOptions{}) - if err != nil && !kerrs.IsNotFound(err) { - return err - } - // remove Namespace - ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: spoke.Name}} - err = r.Delete(ctx, ns) - if err != nil && !kerrs.IsNotFound(err) { - return err - } - - spoke.Finalizers = slices.DeleteFunc(spoke.Finalizers, func(s string) bool { - return s == v1beta1.HubCleanupFinalizer - }) - return nil }
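
Usage sketch (not part of the patches above): the ResourceWatcher introduced in this series is meant to be handed to the controller-runtime manager so it runs for the manager's lifetime. The wiring below is a minimal, hypothetical example; the ConfigMap name, namespace, interval, and logger name are illustrative and not taken from this series, and only the watch.Config fields added here are assumed.

package main

import (
	"context"
	"time"

	corev1 "k8s.io/api/core/v1"
	kerrs "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/watch"
)

func main() {
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
	if err != nil {
		panic(err)
	}

	// Condition: met once a (hypothetical) gating ConfigMap has been deleted.
	condition := func(ctx context.Context, c client.Client) (bool, error) {
		cm := &corev1.ConfigMap{}
		err := c.Get(ctx, types.NamespacedName{Namespace: "default", Name: "demo-gate"}, cm)
		return kerrs.IsNotFound(err), client.IgnoreNotFound(err)
	}

	// Handler: invoked on every tick for which the condition holds.
	handler := func(ctx context.Context, c client.Client) error {
		ctrl.Log.Info("gate removed, running cleanup")
		return nil
	}

	// New validates the config and returns an error for a nil client, log sink,
	// condition, or handler, or a non-positive interval; NewOrDie panics instead,
	// which is what the spoke controller setup in this series relies on.
	watcher, err := watch.New(watch.Config{
		Client:    mgr.GetClient(),
		Log:       ctrl.Log.WithName("demo-watcher"),
		Interval:  30 * time.Second,
		Timeout:   10 * time.Second,
		Name:      "demo-watcher",
		Condition: condition,
		Handler:   handler,
	})
	if err != nil {
		panic(err)
	}

	// ResourceWatcher satisfies manager.Runnable via Start(ctx), so Add ties its
	// lifecycle to the manager; it shuts down when the manager context is cancelled.
	if err := mgr.Add(watcher); err != nil {
		panic(err)
	}

	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		panic(err)
	}
}

Because check() wraps both the condition and the handler in the configured timeout, a hung API call cannot stall the ticker loop indefinitely, and the recover() added in patch 55 keeps a panicking check from taking the watcher down.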
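
A second, equally hypothetical sketch covering the redaction helpers from internal/args (import path taken from the diffs above). The flag values are made up; the behavior noted in the comments follows the implementation in this series, including the caveat that a fused --flag=value token is not matched because isSensitiveKey compares whole words.

package main

import (
	"fmt"

	arg_utils "github.com/open-cluster-management-io/lab/fleetconfig-controller/internal/args"
)

func main() {
	// Space-separated flag/value pairs: the value after a sensitive flag is redacted.
	joinArgs := []string{"join", "--hub-token", "abc123", "--cluster-name", "spoke1"}
	fmt.Println(arg_utils.SanitizeArgs(joinArgs))
	// [join --hub-token REDACTED --cluster-name spoke1]

	// A fused "--hub-token=abc123" token passes through unchanged, since only
	// whole-word keys are compared against the sensitive-key list.
	fmt.Println(arg_utils.SanitizeArgs([]string{"join", "--hub-token=abc123"}))
	// [join --hub-token=abc123]

	// SanitizeOutput splits on whitespace, redacts the same way, and rejoins
	// with single spaces, so newlines are collapsed in the sanitized output.
	out := []byte("clusteradm join --hub-token abc123\ndone")
	fmt.Println(string(arg_utils.SanitizeOutput(out)))
	// clusteradm join --hub-token REDACTED done
}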