pkg/start: Release leader lease on graceful shutdown
So the incoming cluster-version operator doesn't need to wait for the
outgoing operator's lease to expire, which can take a while [1]:

  I0802 10:06:01.056591       1 leaderelection.go:243] attempting to acquire leader lease  openshift-cluster-version/version...
  ...
  I0802 10:07:42.632719       1 leaderelection.go:253] successfully acquired lease openshift-cluster-version/version

and time out the:

  Cluster did not acknowledge request to upgrade in a reasonable time

testcase [2].  Using ReleaseOnCancel has been the plan since
2b81f47 (cvo: Release our leader lease when we are gracefully
terminated, 2019-01-16, #87).  I'm not clear on why it (sometimes?)
doesn't work today.
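
For reference, here is a minimal, self-contained sketch of the ReleaseOnCancel
mechanics in client-go's leaderelection package (the namespace/name, identity,
and timing values below are illustrative placeholders, not the CVO's wiring):
when the context passed to RunOrDie is cancelled, the library clears the
holder identity on the lock before returning, so a successor can acquire the
lease without waiting for it to expire.

  package main

  import (
      "context"
      "os"
      "os/signal"
      "syscall"
      "time"

      metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
      "k8s.io/client-go/kubernetes"
      "k8s.io/client-go/rest"
      "k8s.io/client-go/tools/leaderelection"
      "k8s.io/client-go/tools/leaderelection/resourcelock"
      "k8s.io/klog"
  )

  func main() {
      config, err := rest.InClusterConfig()
      if err != nil {
          klog.Fatalf("error: %v", err)
      }
      client := kubernetes.NewForConfigOrDie(config)

      // Illustrative lock; the real operator builds its own ConfigMap lock.
      lock := &resourcelock.ConfigMapLock{
          ConfigMapMeta: metav1.ObjectMeta{Namespace: "openshift-cluster-version", Name: "version"},
          Client:        client.CoreV1(),
          LockConfig:    resourcelock.ResourceLockConfig{Identity: "example-identity"},
      }

      ctx, cancel := context.WithCancel(context.Background())
      defer cancel()

      ch := make(chan os.Signal, 1)
      signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
      go func() {
          <-ch
          cancel() // cancelling the RunOrDie context triggers the lease release
      }()

      leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
          Lock:            lock,
          ReleaseOnCancel: true,             // clear the holder identity on the way out
          LeaseDuration:   90 * time.Second, // illustrative values, not the CVO's tuned settings
          RenewDeadline:   45 * time.Second,
          RetryPeriod:     30 * time.Second,
          Callbacks: leaderelection.LeaderCallbacks{
              OnStartedLeading: func(ctx context.Context) {
                  klog.Info("started leading")
                  <-ctx.Done() // do work until the context is cancelled
              },
              OnStoppedLeading: func() {
                  klog.Info("stopped leading")
              },
          },
      })
  }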

The discrepancy between the "exit after 2s no matter what" comment and
the 5s After dates back to dbedb7a (cvo: When the CVO restarts,
perform one final sync to write status, 2019-04-27, #179), which
bumped the After from 2s to 5s, but forgot to bump the comment.  I'm
removing that code here in favor of the two-minute timeout from
b30aa0e (pkg/cvo/metrics: Graceful server shutdown, 2020-04-15, #349).
We still exit immediately on a second TERM, for folks who get
impatient waiting for the graceful timeout.
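
For reference, a condensed sketch of the resulting signal handling (the full
version is in the pkg/start/start.go hunk below): the first INT/TERM cancels
the run context to begin the graceful shutdown, and a second signal exits
immediately.

  package main

  import (
      "context"
      "os"
      "os/signal"
      "syscall"

      "k8s.io/klog"
  )

  func main() {
      runCtx, runCancel := context.WithCancel(context.Background())
      defer runCancel()

      ch := make(chan os.Signal, 1)
      signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
      go func() {
          sig := <-ch
          klog.Infof("Shutting down due to %s", sig)
          runCancel() // begin graceful shutdown, bounded by the two-minute timer

          sig = <-ch // a second signal means the user is done waiting
          klog.Fatalf("Received shutdown signal twice, exiting: %s", sig)
      }()

      <-runCtx.Done()
      // ... collect outstanding goroutines here before returning ...
  }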

Also drop an unnecessary runCancel() from the "shutting down" branch
of the error collector.  I'd added it in b30aa0e, but you can only
ever get into the "shutting down" branch if runCancel has already been
called.  And fix the scoping for the shutdownTimer variable so we
don't clear it on each for-loop iteration (oops :p, bug from
b30aa0e).
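
To make the scoping bug concrete (a stand-alone illustration, not the CVO
code): a variable declared inside the loop body is a fresh nil on every
iteration, so the timer never survives from one pass to the next and the
"running" branch keeps being re-entered.

  package main

  import (
      "fmt"
      "time"
  )

  func main() {
      // Buggy shape: shutdownTimer is re-declared (and therefore nil) on
      // every iteration, so a "new" timer is started each time through.
      for i := 0; i < 3; i++ {
          var shutdownTimer *time.Timer
          if shutdownTimer == nil {
              shutdownTimer = time.NewTimer(2 * time.Minute)
              fmt.Println("buggy loop: started a timer on iteration", i)
          }
          shutdownTimer.Stop()
      }

      // Fixed shape: declared once before the loop, the timer persists
      // across iterations and is only started once.
      var shutdownTimer *time.Timer
      for i := 0; i < 3; i++ {
          if shutdownTimer == nil {
              shutdownTimer = time.NewTimer(2 * time.Minute)
              fmt.Println("fixed loop: started the timer on iteration", i)
          }
      }
      if shutdownTimer != nil {
          shutdownTimer.Stop()
      }
  }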

Also add some logging to the error collector, so it's easier to see
where we are in the collection process from the operator logs.
Although, because we only collect errors at the moment, it's not
obvious from the logs exactly what has been collected.

Also set terminationGracePeriodSeconds to 130 to extend the default 30s
[3], to give the container the full two-minute graceful timeout window
before the kubelet steps in with a KILL.

Also push the Background() initialization all the way up to the
command-line handler, to make it more obvious that the context is
scoped to the whole 'start' invocation.

[1]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/25365/pull-ci-openshift-origin-master-e2e-gcp-upgrade/1289853267223777280/artifacts/e2e-gcp-upgrade/pods/openshift-cluster-version_cluster-version-operator-5b6ff896c6-57ppb_cluster-version-operator.log
[2]: https://bugzilla.redhat.com/show_bug.cgi?id=1843505#c7
[3]: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#podspec-v1-core
wking committed Aug 5, 2020
1 parent ed864d6 commit d82e2bf
Showing 4 changed files with 42 additions and 72 deletions.
1 change: 1 addition & 0 deletions bootstrap/bootstrap-pod.yaml
@@ -37,6 +37,7 @@ spec:
fieldRef:
fieldPath: spec.nodeName
hostNetwork: true
terminationGracePeriodSeconds: 130
volumes:
- name: kubeconfig
hostPath:
5 changes: 4 additions & 1 deletion cmd/start.go
@@ -1,6 +1,8 @@
package main

import (
"context"

"github.com/spf13/cobra"
"k8s.io/klog"

@@ -18,9 +20,10 @@ func init() {
// To help debugging, immediately log version
klog.Infof("%s", version.String)

if err := opts.Run(); err != nil {
if err := opts.Run(context.Background()); err != nil {
klog.Fatalf("error: %v", err)
}
klog.Info("Graceful shutdown complete.")
},
}

@@ -57,6 +57,7 @@ spec:
nodeSelector:
node-role.kubernetes.io/master: ""
priorityClassName: "system-cluster-critical"
terminationGracePeriodSeconds: 130
tolerations:
- key: "node-role.kubernetes.io/master"
operator: Exists
107 changes: 36 additions & 71 deletions pkg/start/start.go
@@ -10,7 +10,6 @@ import (
"math/rand"
"os"
"os/signal"
"sync"
"syscall"
"time"

@@ -97,7 +96,7 @@ func NewOptions() *Options {
}
}

func (o *Options) Run() error {
func (o *Options) Run(ctx context.Context) error {
if o.NodeName == "" {
return fmt.Errorf("node-name is required")
}
@@ -133,29 +132,6 @@ func (o *Options) Run() error {
return err
}

// TODO: Kube 1.14 will contain a ReleaseOnCancel boolean on
// LeaderElectionConfig that allows us to have the lock code
// release the lease when this context is cancelled. At that
// time we can remove our changes to OnStartedLeading.
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
ch := make(chan os.Signal, 1)
defer func() { signal.Stop(ch) }()
signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
go func() {
sig := <-ch
klog.Infof("Shutting down due to %s", sig)
cancel()

// exit after 2s no matter what
select {
case <-time.After(5 * time.Second):
klog.Fatalf("Exiting")
case <-ch:
klog.Fatalf("Received shutdown signal twice, exiting")
}
}()

o.run(ctx, controllerCtx, lock)
return nil
}
@@ -187,6 +163,18 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
defer runCancel()
shutdownContext, shutdownCancel := context.WithCancel(ctx)
defer shutdownCancel()

ch := make(chan os.Signal, 1)
defer func() { signal.Stop(ch) }()
signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
go func() {
sig := <-ch
klog.Infof("Shutting down due to %s", sig)
runCancel()
sig = <-ch
klog.Fatalf("Received shutdown signal twice, exiting: %s", sig)
}()

errorChannel := make(chan error, 1)
errorChannelCount := 0
if o.ListenAddr != "" {
@@ -204,55 +192,34 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
}()
}

exit := make(chan struct{})
exitClose := sync.Once{}

// TODO: when we switch to graceful lock shutdown, this can be
// moved back inside RunOrDie
// TODO: properly wire ctx here
go leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
Lock: lock,
LeaseDuration: leaseDuration,
RenewDeadline: renewDeadline,
RetryPeriod: retryPeriod,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(localCtx context.Context) {
controllerCtx.Start(runContext)
select {
case <-runContext.Done():
// WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel
// and client-go ContextCancelable, which allows us to block new API requests before
// we step down. However, the CVO isn't that sensitive to races and can tolerate
// brief overlap.
klog.Infof("Stepping down as leader")
// give the controllers some time to shut down
time.Sleep(100 * time.Millisecond)
// if we still hold the leader lease, clear the owner identity (other lease watchers
// still have to wait for expiration) like the new ReleaseOnCancel code will do.
if err := lock.Update(localCtx, resourcelock.LeaderElectionRecord{}); err == nil {
// if we successfully clear the owner identity, we can safely delete the record
if err := lock.Client.ConfigMaps(lock.ConfigMapMeta.Namespace).Delete(localCtx, lock.ConfigMapMeta.Name, metav1.DeleteOptions{}); err != nil {
klog.Warningf("Unable to step down cleanly: %v", err)
}
}
klog.Infof("Finished shutdown")
exitClose.Do(func() { close(exit) })
case <-localCtx.Done():
// we will exit in OnStoppedLeading
}
},
OnStoppedLeading: func() {
klog.Warning("leaderelection lost")
exitClose.Do(func() { close(exit) })
errorChannelCount++
go func() {
leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
Lock: lock,
ReleaseOnCancel: true,
LeaseDuration: leaseDuration,
RenewDeadline: renewDeadline,
RetryPeriod: retryPeriod,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(ctx context.Context) {
controllerCtx.Start(ctx)
},
OnStoppedLeading: func() {
klog.Info("Stopped leading; shutting down.")
runCancel()
},
},
},
})
})
errorChannel <- nil
}()

var shutdownTimer *time.Timer
for errorChannelCount > 0 {
var shutdownTimer *time.Timer
klog.Infof("Waiting on %d outstanding goroutines.", errorChannelCount)
if shutdownTimer == nil { // running
select {
case <-runContext.Done():
klog.Info("Run context completed; beginning two-minute graceful shutdown period.")
shutdownTimer = time.NewTimer(2 * time.Minute)
case err := <-errorChannel:
errorChannelCount--
@@ -270,13 +237,11 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
errorChannelCount--
if err != nil {
klog.Error(err)
runCancel()
}
}
}
}

<-exit
klog.Info("Finished collecting operator goroutines.")
}

// createResourceLock initializes the lock.
