Skip to content

Commit

Permalink
pkg/start: Release leader lease on graceful shutdown
Browse files Browse the repository at this point in the history
So the incoming cluster-version operator doesn't need to wait for the
outgoing operator's lease to expire, which can take a while [1]:

  I0802 10:06:01.056591       1 leaderelection.go:243] attempting to acquire leader lease  openshift-cluster-version/version...
  ...
  I0802 10:07:42.632719       1 leaderelection.go:253] successfully acquired lease openshift-cluster-version/version

and time out the:

  Cluster did not acknowledge request to upgrade in a reasonable time

testcase [2].  Using ReleaseOnCancel has been the plan since
2b81f47 (cvo: Release our leader lease when we are gracefully
terminated, 2019-01-16, #87).  I'm not clear on why it (sometimes?)
doesn't work today.

The discrepancy between the "exit after 2s no matter what" comment and
the 5s After dates back to dbedb7a (cvo: When the CVO restarts,
perform one final sync to write status, 2019-04-27, #179), which
bumped the After from 2s to 5s, but forgot to bump the comment.  I'm
removing that code here in favor of the two-minute timeout from
b30aa0e (pkg/cvo/metrics: Graceful server shutdown, 2020-04-15, #349).
We still exit immediately on a second TERM, for folks who get
impatient waiting for the graceful timeout.

I'm also pushing the Background() initialization all the way up to the
command-line handler, to make it more obvious that the context is
scoped to the whole 'start' invocation.

[1]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/25365/pull-ci-openshift-origin-master-e2e-gcp-upgrade/1289853267223777280/artifacts/e2e-gcp-upgrade/pods/openshift-cluster-version_cluster-version-operator-5b6ff896c6-57ppb_cluster-version-operator.log
[2]: https://bugzilla.redhat.com/show_bug.cgi?id=1843505#c7
  • Loading branch information
wking committed Aug 4, 2020
1 parent ed864d6 commit 621d0cd
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 64 deletions.
5 changes: 4 additions & 1 deletion cmd/start.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package main

import (
"context"

"github.com/spf13/cobra"
"k8s.io/klog"

Expand All @@ -18,9 +20,10 @@ func init() {
// To help debugging, immediately log version
klog.Infof("%s", version.String)

if err := opts.Run(); err != nil {
if err := opts.Run(context.Background()); err != nil {
klog.Fatalf("error: %v", err)
}
klog.Info("Graceful shutdown complete.")
},
}

Expand Down
85 changes: 22 additions & 63 deletions pkg/start/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"math/rand"
"os"
"os/signal"
"sync"
"syscall"
"time"

Expand Down Expand Up @@ -97,7 +96,7 @@ func NewOptions() *Options {
}
}

func (o *Options) Run() error {
func (o *Options) Run(ctx context.Context) error {
if o.NodeName == "" {
return fmt.Errorf("node-name is required")
}
Expand Down Expand Up @@ -133,29 +132,6 @@ func (o *Options) Run() error {
return err
}

// TODO: Kube 1.14 will contain a ReleaseOnCancel boolean on
// LeaderElectionConfig that allows us to have the lock code
// release the lease when this context is cancelled. At that
// time we can remove our changes to OnStartedLeading.
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
ch := make(chan os.Signal, 1)
defer func() { signal.Stop(ch) }()
signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
go func() {
sig := <-ch
klog.Infof("Shutting down due to %s", sig)
cancel()

// exit after 2s no matter what
select {
case <-time.After(5 * time.Second):
klog.Fatalf("Exiting")
case <-ch:
klog.Fatalf("Received shutdown signal twice, exiting")
}
}()

o.run(ctx, controllerCtx, lock)
return nil
}
Expand Down Expand Up @@ -187,6 +163,18 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
defer runCancel()
shutdownContext, shutdownCancel := context.WithCancel(ctx)
defer shutdownCancel()

ch := make(chan os.Signal, 1)
defer func() { signal.Stop(ch) }()
signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
go func() {
sig := <-ch
klog.Infof("Shutting down due to %s", sig)
runCancel()
sig = <-ch
klog.Fatalf("Received shutdown signal twice, exiting: %s", sig)
}()

errorChannel := make(chan error, 1)
errorChannelCount := 0
if o.ListenAddr != "" {
Expand All @@ -204,46 +192,19 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
}()
}

exit := make(chan struct{})
exitClose := sync.Once{}

// TODO: when we switch to graceful lock shutdown, this can be
// moved back inside RunOrDie
// TODO: properly wire ctx here
go leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
Lock: lock,
LeaseDuration: leaseDuration,
RenewDeadline: renewDeadline,
RetryPeriod: retryPeriod,
go leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
Lock: lock,
ReleaseOnCancel: true,
LeaseDuration: leaseDuration,
RenewDeadline: renewDeadline,
RetryPeriod: retryPeriod,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(localCtx context.Context) {
controllerCtx.Start(runContext)
select {
case <-runContext.Done():
// WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel
// and client-go ContextCancelable, which allows us to block new API requests before
// we step down. However, the CVO isn't that sensitive to races and can tolerate
// brief overlap.
klog.Infof("Stepping down as leader")
// give the controllers some time to shut down
time.Sleep(100 * time.Millisecond)
// if we still hold the leader lease, clear the owner identity (other lease watchers
// still have to wait for expiration) like the new ReleaseOnCancel code will do.
if err := lock.Update(localCtx, resourcelock.LeaderElectionRecord{}); err == nil {
// if we successfully clear the owner identity, we can safely delete the record
if err := lock.Client.ConfigMaps(lock.ConfigMapMeta.Namespace).Delete(localCtx, lock.ConfigMapMeta.Name, metav1.DeleteOptions{}); err != nil {
klog.Warningf("Unable to step down cleanly: %v", err)
}
}
klog.Infof("Finished shutdown")
exitClose.Do(func() { close(exit) })
case <-localCtx.Done():
// we will exit in OnStoppedLeading
}
},
OnStoppedLeading: func() {
klog.Warning("leaderelection lost")
exitClose.Do(func() { close(exit) })
klog.Info("Stopped leading; shutting down.")
runCancel()
},
},
})
Expand All @@ -270,13 +231,11 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
errorChannelCount--
if err != nil {
klog.Error(err)
runCancel()
}
}
}
}

<-exit
klog.Info("Finished collecting operator goroutines.")
}

// createResourceLock initializes the lock.
Expand Down

0 comments on commit 621d0cd

Please sign in to comment.