Skip to content

Commit

Permalink
Update the leader election durations to be tolerant
Browse files Browse the repository at this point in the history
In OCP, we tolerate about a minute of apiserver downtime before losing a
lease by carefully setting the leader election values.  Combined with
graceful lease release, this makes the default case fast, with a
tolerant controller that doesn't churn.
  • Loading branch information
deads2k authored and openshift-cherrypick-robot committed Feb 21, 2024
1 parent 488fb7a commit 54cd32e
Showing 1 changed file with 27 additions and 5 deletions.
32 changes: 27 additions & 5 deletions main.go
Expand Up @@ -20,6 +20,7 @@ import (
"crypto/tls"
"flag"
"os"
"time"

"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
Expand Down Expand Up @@ -84,7 +85,7 @@ func main() {

config := ctrl.GetConfigOrDie()

mgr, err := ctrl.NewManager(config, ctrl.Options{
controllerOptions := ctrl.Options{
Scheme: scheme,
MetricsBindAddress: metricsAddr,
WebhookServer: webhook.NewServer(webhook.Options{
Expand All @@ -98,10 +99,31 @@ func main() {
}),
NewCache: cache.MultiNamespacedCacheBuilder(
[]string{controllers.ComponentNamespace, provisioning.OpenshiftConfigNamespace}),
LeaderElection: enableLeaderElection,
LeaderElectionID: "cluster-baremetal-operator",
LeaderElectionNamespace: controllers.ComponentNamespace,
})
}

if enableLeaderElection {
controllerOptions.LeaderElection = true
controllerOptions.LeaderElectionReleaseOnCancel = true
controllerOptions.LeaderElectionID = "cluster-baremetal-operator"
controllerOptions.LeaderElectionNamespace = controllers.ComponentNamespace

// these values match library-go LeaderElectionDefaulting, to produce this outcome
// see https://github.com/openshift/library-go/blob/release-4.15/pkg/config/leaderelection/leaderelection.go#L97-L105
// 1. clock skew tolerance is leaseDuration-renewDeadline == 30s
// 2. kube-apiserver downtime tolerance is == 78s
// lastRetry=floor(renewDeadline/retryPeriod)*retryPeriod == 104s
// downtimeTolerance = lastRetry-retryPeriod == 78s
// 3. worst non-graceful lease acquisition is leaseDuration+retryPeriod == 163s
// 4. worst graceful lease acquisition is retryPeriod == 26s
leaseDuration := 137 * time.Second
renewDeadline := 107 * time.Second
retryPeriod := 26 * time.Second
controllerOptions.LeaseDuration = &leaseDuration
controllerOptions.RenewDeadline = &renewDeadline
controllerOptions.RetryPeriod = &retryPeriod
}

mgr, err := ctrl.NewManager(config, controllerOptions)
if err != nil {
klog.ErrorS(err, "unable to start manager")
os.Exit(1)
Expand Down

0 comments on commit 54cd32e

Please sign in to comment.