pkg/start: Release leader lease on graceful shutdown
So the incoming cluster-version operator doesn't need to wait for the
outgoing operator's lease to expire, which can take a while [1]:

  I0802 10:06:01.056591       1 leaderelection.go:243] attempting to acquire leader lease  openshift-cluster-version/version...
  ...
  I0802 10:07:42.632719       1 leaderelection.go:253] successfully acquired lease openshift-cluster-version/version

and time out the:

  Cluster did not acknowledge request to upgrade in a reasonable time

testcase [2].  Using ReleaseOnCancel has been the plan since
2b81f47 (cvo: Release our leader lease when we are gracefully
terminated, 2019-01-16, #87).  I'm not clear on why it (sometimes?)
doesn't work today.
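
For reference, here is a minimal, self-contained sketch of the ReleaseOnCancel
mechanics in client-go's leaderelection package (the namespace/name, identity,
and timing values below are illustrative placeholders, not the CVO's wiring):
when the context passed to RunOrDie is cancelled, the library clears the
holder identity on the lock before returning, so a successor can acquire the
lease without waiting for it to expire.

  package main

  import (
      "context"
      "os"
      "os/signal"
      "syscall"
      "time"

      metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
      "k8s.io/client-go/kubernetes"
      "k8s.io/client-go/rest"
      "k8s.io/client-go/tools/leaderelection"
      "k8s.io/client-go/tools/leaderelection/resourcelock"
      "k8s.io/klog"
  )

  func main() {
      config, err := rest.InClusterConfig()
      if err != nil {
          klog.Fatalf("error: %v", err)
      }
      client := kubernetes.NewForConfigOrDie(config)

      // Illustrative lock; the real operator builds its own ConfigMap lock.
      lock := &resourcelock.ConfigMapLock{
          ConfigMapMeta: metav1.ObjectMeta{Namespace: "openshift-cluster-version", Name: "version"},
          Client:        client.CoreV1(),
          LockConfig:    resourcelock.ResourceLockConfig{Identity: "example-identity"},
      }

      ctx, cancel := context.WithCancel(context.Background())
      defer cancel()

      ch := make(chan os.Signal, 1)
      signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
      go func() {
          <-ch
          cancel() // cancelling the RunOrDie context triggers the lease release
      }()

      leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
          Lock:            lock,
          ReleaseOnCancel: true,             // clear the holder identity on the way out
          LeaseDuration:   90 * time.Second, // illustrative values, not the CVO's tuned settings
          RenewDeadline:   45 * time.Second,
          RetryPeriod:     30 * time.Second,
          Callbacks: leaderelection.LeaderCallbacks{
              OnStartedLeading: func(ctx context.Context) {
                  klog.Info("started leading")
                  <-ctx.Done() // do work until the context is cancelled
              },
              OnStoppedLeading: func() {
                  klog.Info("stopped leading")
              },
          },
      })
  }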

The discrepancy between the "exit after 2s no matter what" comment and
the 5s After dates back to dbedb7a (cvo: When the CVO restarts,
perform one final sync to write status, 2019-04-27, #179), which
bumped the After from 2s to 5s, but forgot to bump the comment.  I'm
removing that code here in favor of the two-minute timeout from
b30aa0e (pkg/cvo/metrics: Graceful server shutdown, 2020-04-15, #349).
We still exit immediately on a second TERM, for folks who get
impatient waiting for the graceful timeout.
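
For reference, a condensed sketch of the resulting signal handling (the full
version is in the pkg/start/start.go hunk below): the first INT/TERM cancels
the run context to begin the graceful shutdown, and a second signal exits
immediately.

  package main

  import (
      "context"
      "os"
      "os/signal"
      "syscall"

      "k8s.io/klog"
  )

  func main() {
      runCtx, runCancel := context.WithCancel(context.Background())
      defer runCancel()

      ch := make(chan os.Signal, 1)
      signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
      go func() {
          sig := <-ch
          klog.Infof("Shutting down due to %s", sig)
          runCancel() // begin graceful shutdown, bounded by the two-minute timer

          sig = <-ch // a second signal means the user is done waiting
          klog.Fatalf("Received shutdown signal twice, exiting: %s", sig)
      }()

      <-runCtx.Done()
      // ... collect outstanding goroutines here before returning ...
  }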

Also drop an unnecessary runCancel() from the "shutting down" branch
of the error collector.  I'd added it in b30aa0e, but you can only
ever get into the "shutting down" branch if runCancel has already been
called.  And fix the scoping for the shutdownTimer variable so we
don't clear it on each for-loop iteration (oops :p, bug from
b30aa0e).
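
To make the scoping bug concrete (a stand-alone illustration, not the CVO
code): a variable declared inside the loop body is a fresh nil on every
iteration, so the timer never survives from one pass to the next and the
"running" branch keeps being re-entered.

  package main

  import (
      "fmt"
      "time"
  )

  func main() {
      // Buggy shape: shutdownTimer is re-declared (and therefore nil) on
      // every iteration, so a "new" timer is started each time through.
      for i := 0; i < 3; i++ {
          var shutdownTimer *time.Timer
          if shutdownTimer == nil {
              shutdownTimer = time.NewTimer(2 * time.Minute)
              fmt.Println("buggy loop: started a timer on iteration", i)
          }
          shutdownTimer.Stop()
      }

      // Fixed shape: declared once before the loop, the timer persists
      // across iterations and is only started once.
      var shutdownTimer *time.Timer
      for i := 0; i < 3; i++ {
          if shutdownTimer == nil {
              shutdownTimer = time.NewTimer(2 * time.Minute)
              fmt.Println("fixed loop: started the timer on iteration", i)
          }
      }
      if shutdownTimer != nil {
          shutdownTimer.Stop()
      }
  }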

Also add some logging to the error collector, so it's easier to see
where we are in the collection process from the operator logs.
Although, because we only collect errors at the moment, it's not
obvious from the logs exactly what has been collected.

Also set terminationGracePeriodSeconds to 130 to extend the default 30s
[3], to give the container the full two-minute graceful timeout window
before the kubelet steps in with a KILL.

Also push the Background() initialization all the way up to the
command-line handler, to make it more obvious that the context is
scoped to the whole 'start' invocation.

[1]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/25365/pull-ci-openshift-origin-master-e2e-gcp-upgrade/1289853267223777280/artifacts/e2e-gcp-upgrade/pods/openshift-cluster-version_cluster-version-operator-5b6ff896c6-57ppb_cluster-version-operator.log
[2]: https://bugzilla.redhat.com/show_bug.cgi?id=1843505#c7
[3]: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#podspec-v1-core
wking committed Aug 5, 2020
1 parent ed864d6 commit d82e2bf
Showing 4 changed files with 42 additions and 72 deletions.
1 change: 1 addition & 0 deletions bootstrap/bootstrap-pod.yaml
@@ -37,6 +37,7 @@ spec:
fieldRef:
fieldPath: spec.nodeName
hostNetwork: true
terminationGracePeriodSeconds: 130
volumes:
- name: kubeconfig
hostPath:
5 changes: 4 additions & 1 deletion cmd/start.go
@@ -1,6 +1,8 @@
package main

import (
"context"

"github.com/spf13/cobra"
"k8s.io/klog"

@@ -18,9 +20,10 @@ func init() {
// To help debugging, immediately log version
klog.Infof("%s", version.String)

if err := opts.Run(); err != nil {
if err := opts.Run(context.Background()); err != nil {
klog.Fatalf("error: %v", err)
}
klog.Info("Graceful shutdown complete.")
},
}

@@ -57,6 +57,7 @@ spec:
nodeSelector:
node-role.kubernetes.io/master: ""
priorityClassName: "system-cluster-critical"
terminationGracePeriodSeconds: 130
tolerations:
- key: "node-role.kubernetes.io/master"
operator: Exists
107 changes: 36 additions & 71 deletions pkg/start/start.go
@@ -10,7 +10,6 @@ import (
"math/rand"
"os"
"os/signal"
"sync"
"syscall"
"time"

@@ -97,7 +96,7 @@ func NewOptions() *Options {
}
}

func (o *Options) Run() error {
func (o *Options) Run(ctx context.Context) error {
if o.NodeName == "" {
return fmt.Errorf("node-name is required")
}
@@ -133,29 +132,6 @@ func (o *Options) Run() error {
return err
}

// TODO: Kube 1.14 will contain a ReleaseOnCancel boolean on
// LeaderElectionConfig that allows us to have the lock code
// release the lease when this context is cancelled. At that
// time we can remove our changes to OnStartedLeading.
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
ch := make(chan os.Signal, 1)
defer func() { signal.Stop(ch) }()
signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
go func() {
sig := <-ch
klog.Infof("Shutting down due to %s", sig)
cancel()

// exit after 2s no matter what
select {
case <-time.After(5 * time.Second):
klog.Fatalf("Exiting")
case <-ch:
klog.Fatalf("Received shutdown signal twice, exiting")
}
}()

o.run(ctx, controllerCtx, lock)
return nil
}
@@ -187,6 +163,18 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
defer runCancel()
shutdownContext, shutdownCancel := context.WithCancel(ctx)
defer shutdownCancel()

ch := make(chan os.Signal, 1)
defer func() { signal.Stop(ch) }()
signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
go func() {
sig := <-ch
klog.Infof("Shutting down due to %s", sig)
runCancel()
sig = <-ch
klog.Fatalf("Received shutdown signal twice, exiting: %s", sig)
}()

errorChannel := make(chan error, 1)
errorChannelCount := 0
if o.ListenAddr != "" {
@@ -204,55 +192,34 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
}()
}

exit := make(chan struct{})
exitClose := sync.Once{}

// TODO: when we switch to graceful lock shutdown, this can be
// moved back inside RunOrDie
// TODO: properly wire ctx here
go leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
Lock: lock,
LeaseDuration: leaseDuration,
RenewDeadline: renewDeadline,
RetryPeriod: retryPeriod,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(localCtx context.Context) {
controllerCtx.Start(runContext)
select {
case <-runContext.Done():
// WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel
// and client-go ContextCancelable, which allows us to block new API requests before
// we step down. However, the CVO isn't that sensitive to races and can tolerate
// brief overlap.
klog.Infof("Stepping down as leader")
// give the controllers some time to shut down
time.Sleep(100 * time.Millisecond)
// if we still hold the leader lease, clear the owner identity (other lease watchers
// still have to wait for expiration) like the new ReleaseOnCancel code will do.
if err := lock.Update(localCtx, resourcelock.LeaderElectionRecord{}); err == nil {
// if we successfully clear the owner identity, we can safely delete the record
if err := lock.Client.ConfigMaps(lock.ConfigMapMeta.Namespace).Delete(localCtx, lock.ConfigMapMeta.Name, metav1.DeleteOptions{}); err != nil {
klog.Warningf("Unable to step down cleanly: %v", err)
}
}
klog.Infof("Finished shutdown")
exitClose.Do(func() { close(exit) })
case <-localCtx.Done():
// we will exit in OnStoppedLeading
}
},
OnStoppedLeading: func() {
klog.Warning("leaderelection lost")
exitClose.Do(func() { close(exit) })
errorChannelCount++
go func() {
leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
Lock: lock,
ReleaseOnCancel: true,
LeaseDuration: leaseDuration,
RenewDeadline: renewDeadline,
RetryPeriod: retryPeriod,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(ctx context.Context) {
controllerCtx.Start(ctx)
},
OnStoppedLeading: func() {
klog.Info("Stopped leading; shutting down.")
runCancel()
},
},
},
})
})
errorChannel <- nil
}()

var shutdownTimer *time.Timer
for errorChannelCount > 0 {
var shutdownTimer *time.Timer
klog.Infof("Waiting on %d outstanding goroutines.", errorChannelCount)
if shutdownTimer == nil { // running
select {
case <-runContext.Done():
klog.Info("Run context completed; beginning two-minute graceful shutdown period.")
shutdownTimer = time.NewTimer(2 * time.Minute)
case err := <-errorChannel:
errorChannelCount--
@@ -270,13 +237,11 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
errorChannelCount--
if err != nil {
klog.Error(err)
runCancel()
}
}
}
}

<-exit
klog.Info("Finished collecting operator goroutines.")
}

// createResourceLock initializes the lock.
