From 10e8b4521de7f540fcc2666b36a8f5e0bb24db0c Mon Sep 17 00:00:00 2001 From: Danil-Grigorev Date: Wed, 8 Apr 2020 17:06:04 +0200 Subject: [PATCH 1/3] Implement leader election for manager --- cmd/manager/main.go | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 87ca303837..89a87271e3 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -35,15 +35,41 @@ import ( ) func main() { - var printVersion bool - flag.BoolVar(&printVersion, "version", false, "print version and exit") + printVersion := flag.Bool( + "version", + false, + "print version and exit", + ) + + watchNamespace := flag.String( + "namespace", + "", + "Namespace that the controller watches to reconcile machine-api objects. If unspecified, the controller watches for machine-api objects across all namespaces.", + ) + + leaderElectResourceNamespace := flag.String( + "leader-elect-resource-namespace", + "", + "The namespace of resource object that is used for locking during leader election. If unspecified and running in cluster, defaults to the service account namespace for the controller. Required for leader-election outside of a cluster.", + ) + + leaderElect := flag.Bool( + "leader-elect", + false, + "Start a leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability.", + ) + + leaderElectLeaseDuration := flag.Duration( + "leader-elect-lease-duration", + 15*time.Second, + "The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. 
This is only applicable if leader election is enabled.", + ) klog.InitFlags(nil) - watchNamespace := flag.String("namespace", "", "Namespace that the controller watches to reconcile machine-api objects. If unspecified, the controller watches for machine-api objects across all namespaces.") flag.Set("logtostderr", "true") flag.Parse() - if printVersion { + if *printVersion { fmt.Println(version.String) os.Exit(0) } @@ -57,10 +83,15 @@ func main() { // Setup a Manager syncPeriod := 10 * time.Minute opts := manager.Options{ - SyncPeriod: &syncPeriod, + LeaderElection: *leaderElect, + LeaderElectionNamespace: *leaderElectResourceNamespace, + LeaderElectionID: "cluster-api-provider-aws-leader", + LeaseDuration: leaderElectLeaseDuration, + SyncPeriod: &syncPeriod, // Disable metrics serving MetricsBindAddress: "0", } + if *watchNamespace != "" { opts.Namespace = *watchNamespace klog.Infof("Watching machine-api objects only in namespace %q for reconciliation.", opts.Namespace) From e4373ccad4ff6cc9cca512f4dbf2b6d419b7f037 Mon Sep 17 00:00:00 2001 From: Danil-Grigorev Date: Mon, 20 Jul 2020 12:36:31 +0200 Subject: [PATCH 2/3] Increase leader election lease time The machine-api-controller components are refreshing their lease more than all other components combined. Bringing this to 90s each, will decrease etcd writes at idle. --- cmd/manager/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 89a87271e3..459e77dead 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -61,7 +61,7 @@ func main() { leaderElectLeaseDuration := flag.Duration( "leader-elect-lease-duration", - 15*time.Second, + 90*time.Second, "The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. 
This is only applicable if leader election is enabled.", ) From 362122608702ed7e18311c551fc73f21d85edbb6 Mon Sep 17 00:00:00 2001 From: Danil-Grigorev Date: Wed, 12 Aug 2020 19:27:41 +0200 Subject: [PATCH 3/3] Slow the default lease retry and renew rate for machine controller Prevent machine controllers from writing in etcd at idle too often by setting 120s lease, 20s retry and 110s deadline on all renewals. Higher values cause tests to flake. BZ 1858403 --- cmd/manager/main.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 459e77dead..f978e43f77 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -34,6 +34,13 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" ) +// The default durations for the leader election operations. +var ( + leaseDuration = 120 * time.Second + renewDealine = 110 * time.Second + retryPeriod = 20 * time.Second +) + func main() { printVersion := flag.Bool( "version", @@ -61,7 +68,7 @@ func main() { leaderElectLeaseDuration := flag.Duration( "leader-elect-lease-duration", - 90*time.Second, + leaseDuration, "The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. This is only applicable if leader election is enabled.", ) @@ -90,6 +97,9 @@ func main() { SyncPeriod: &syncPeriod, // Disable metrics serving MetricsBindAddress: "0", + // Slow the default retry and renew election rate to reduce etcd writes at idle: BZ 1858400 + RetryPeriod: &retryPeriod, + RenewDeadline: &renewDealine, } if *watchNamespace != "" {