[release-4.6] Bug 1987221: Backport drain timeout optimizations #2707

Merged
2 changes: 1 addition & 1 deletion Makefile
@@ -109,4 +109,4 @@ Dockerfile.rhel7: Dockerfile Makefile

# This was copied from https://github.com/openshift/cluster-image-registry-operator
test-e2e:
go test -failfast -timeout 75m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e/
go test -failfast -timeout 90m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e/
@@ -23,7 +23,6 @@ spec:
mcd_drain_err > 0
labels:
severity: warning
for: 15m
annotations:
message: "Drain failed on {{ $labels.node }} , updates may be blocked. For more details: oc logs -f -n openshift-machine-config-operator machine-config-daemon-<hash> -c machine-config-daemon"
- name: mcd-pivot-error
6 changes: 3 additions & 3 deletions pkg/daemon/daemon.go
@@ -1118,7 +1118,7 @@ func (dn *Daemon) checkStateOnFirstRun() error {
// Great, we've successfully rebooted for the desired config,
// let's mark it done!
glog.Infof("Completing pending config %s", state.pendingConfig.GetName())
if err := dn.completeUpdate(dn.node, state.pendingConfig.GetName()); err != nil {
if err := dn.completeUpdate(state.pendingConfig.GetName()); err != nil {
MCDUpdateState.WithLabelValues("", err.Error()).SetToCurrentTime()
return err
}
@@ -1256,8 +1256,8 @@ func (dn *Daemon) prepUpdateFromCluster() (*mcfgv1.MachineConfig, *mcfgv1.Machin
// completeUpdate marks the node as schedulable again, then deletes the
// "transient state" file, which signifies that all of those prior steps have
// been completed.
func (dn *Daemon) completeUpdate(node *corev1.Node, desiredConfigName string) error {
if err := drain.RunCordonOrUncordon(dn.drainer, node, false); err != nil {
func (dn *Daemon) completeUpdate(desiredConfigName string) error {
if err := dn.cordonOrUncordonNode(false); err != nil {
return err
}

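For context on the completeUpdate change above: the node argument is dropped because the daemon already holds dn.node, and the uncordon now goes through the shared cordonOrUncordonNode helper added in pkg/daemon/update.go further down. The following is a minimal, self-contained sketch of that retry pattern; the name cordonOrUncordonWithRetry and its parameters are illustrative, not identifiers from this PR.

package example

import (
	"time"

	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/kubectl/pkg/drain"
)

// cordonOrUncordonWithRetry wraps drain.RunCordonOrUncordon in an exponential
// backoff (5 tries: 10s, 20s, 40s, ...) so that a transient API error does not
// immediately fail the update. desired=true cordons, desired=false uncordons.
func cordonOrUncordonWithRetry(helper *drain.Helper, node *corev1.Node, desired bool) error {
	backoff := wait.Backoff{
		Steps:    5,
		Duration: 10 * time.Second,
		Factor:   2,
	}
	var lastErr error
	if err := wait.ExponentialBackoff(backoff, func() (bool, error) {
		if err := drain.RunCordonOrUncordon(helper, node, desired); err != nil {
			lastErr = err
			return false, nil // not done yet; back off and retry
		}
		return true, nil
	}); err != nil {
		if err == wait.ErrWaitTimeout {
			return errors.Wrapf(lastErr, "failed to cordon/uncordon node (%d tries)", backoff.Steps)
		}
		return errors.Wrap(err, "failed to cordon/uncordon node")
	}
	return nil
}

Sharing one retrying helper for both the cordon (update) and uncordon (completion) paths is what lets the drain function itself stop worrying about cordon failures and focus on the eviction timeout handled later in this diff.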
9 changes: 5 additions & 4 deletions pkg/daemon/metrics.go
@@ -27,12 +27,12 @@ var (
Help: "indicates a successful SSH login",
})

// MCDDrainErr logs errors received during failed drain
MCDDrainErr = prometheus.NewGaugeVec(
// MCDDrainErr logs failed drain
MCDDrainErr = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "mcd_drain_err",
Help: "errors from failed drain",
}, []string{"node", "err"})
Help: "logs failed drain",
})

// MCDPivotErr shows errors encountered during pivot
MCDPivotErr = prometheus.NewGaugeVec(
@@ -88,6 +88,7 @@ func registerMCDMetrics() error {
}
}

MCDDrainErr.Set(0)
MCDPivotErr.WithLabelValues("", "", "").Set(0)
KubeletHealthState.Set(0)
MCDRebootErr.WithLabelValues("", "", "").Set(0)
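To illustrate the metrics change above: mcd_drain_err moves from a labelled GaugeVec (one time series per node/error string) to a plain Gauge that is simply 0 or 1, which is why registerMCDMetrics can now reset it with MCDDrainErr.Set(0). The sketch below is a minimal, self-contained comparison; the metric names carry an _example suffix and are not the identifiers used by the daemon.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Old shape: a GaugeVec keyed by node and error text. Every distinct error
	// message creates a new time series that the alert has to aggregate.
	oldDrainErr := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "mcd_drain_err_old_example",
			Help: "errors from failed drain",
		}, []string{"node", "err"})

	// New shape: a single Gauge. 0 means draining is healthy, 1 means the node
	// exceeded its drain budget, so the alert expression stays `mcd_drain_err > 0`.
	newDrainErr := prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "mcd_drain_err_new_example",
			Help: "logs failed drain",
		})

	reg := prometheus.NewRegistry()
	reg.MustRegister(oldDrainErr, newDrainErr)

	oldDrainErr.WithLabelValues("node-1", "pod eviction timed out").Set(5)
	newDrainErr.Set(0) // reset at registration, as registerMCDMetrics now does
	newDrainErr.Set(1) // raised only once the drain retry budget is exhausted

	fmt.Println("example metrics registered")
}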
94 changes: 61 additions & 33 deletions pkg/daemon/update.go
@@ -118,56 +118,84 @@ func (dn *Daemon) finalizeAndReboot(newConfig *mcfgv1.MachineConfig) (retErr err
return dn.reboot(fmt.Sprintf("Node will reboot into config %v", newConfig.GetName()))
}

func (dn *Daemon) drain() error {
// Skip draining of the node when we're not cluster driven
if dn.kubeClient == nil {
return nil
}
MCDDrainErr.WithLabelValues(dn.node.Name, "").Set(0)

dn.logSystem("Update prepared; beginning drain")
startTime := time.Now()

dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Drain", "Draining node to update config.")

func (dn *Daemon) cordonOrUncordonNode(desired bool) error {
backoff := wait.Backoff{
Steps: 5,
Duration: 10 * time.Second,
Factor: 2,
}
var lastErr error
if err := wait.ExponentialBackoff(backoff, func() (bool, error) {
err := drain.RunCordonOrUncordon(dn.drainer, dn.node, true)
err := drain.RunCordonOrUncordon(dn.drainer, dn.node, desired)
if err != nil {
lastErr = err
glog.Infof("Cordon failed with: %v, retrying", err)
glog.Infof("cordon/uncordon failed with: %v, retrying", err)
return false, nil
}
err = drain.RunNodeDrain(dn.drainer, dn.node.Name)
if err == nil {
return true, nil
}
lastErr = err
glog.Infof("Draining failed with: %v, retrying", err)
return false, nil
return true, nil
}); err != nil {
if err == wait.ErrWaitTimeout {
failMsg := fmt.Sprintf("%d tries: %v", backoff.Steps, lastErr)
MCDDrainErr.WithLabelValues(dn.node.Name, "WaitTimeout").Set(float64(backoff.Steps))
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedToDrain", failMsg)
return errors.Wrapf(lastErr, "failed to drain node (%d tries): %v", backoff.Steps, err)
return errors.Wrapf(lastErr, "failed to cordon/uncordon node (%d tries): %v", backoff.Steps, err)
}
MCDDrainErr.WithLabelValues(dn.node.Name, "UnknownError").Set(float64(backoff.Steps))
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedToDrain", err.Error())
return errors.Wrap(err, "failed to drain node")
return errors.Wrap(err, "failed to cordon/uncordon node")
}
return nil
}

dn.logSystem("drain complete")
t := time.Since(startTime).Seconds()
glog.Infof("Successful drain took %v seconds", t)
MCDDrainErr.WithLabelValues(dn.node.Name, "").Set(0)
func (dn *Daemon) drain() error {
// Skip draining of the node when we're not cluster driven
if dn.kubeClient == nil {
return nil
}

return nil
if err := dn.cordonOrUncordonNode(true); err != nil {
return err
}
dn.logSystem("Node has been successfully cordoned")
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Cordon", "Cordoned node to apply update")

dn.logSystem("Update prepared; beginning drain")
startTime := time.Now()

dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Drain", "Draining node to update config.")

done := make(chan bool, 1)

drainer := func() chan error {
ret := make(chan error)
go func() {
for {
select {
case <-done:
return
default:
if err := drain.RunNodeDrain(dn.drainer, dn.node.Name); err != nil {
glog.Infof("Draining failed with: %v, retrying", err)
time.Sleep(5 * time.Minute)
[Inline review comment on the line above]
Contributor: Should we also include patch #2612 so that the first few retries occur every 1 minute on failure?
Contributor (Author): Ahh, good catch!
continue
}
close(ret)
return
}
}
}()
return ret
}

select {
case <-time.After(1 * time.Hour):
done <- true
failMsg := fmt.Sprintf("failed to drain node : %s after 1 hour", dn.node.Name)
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedToDrain", failMsg)
MCDDrainErr.Set(1)
return errors.New(failMsg)
case <-drainer():
dn.logSystem("drain complete")
t := time.Since(startTime).Seconds()
glog.Infof("Successful drain took %v seconds", t)
MCDDrainErr.Set(0)
return nil
}
}

var errUnreconcilable = errors.New("unreconcilable")
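The new drain() above replaces the bounded exponential backoff with an open-ended retry loop cut off by a one-hour budget: a goroutine keeps calling drain.RunNodeDrain, sleeping five minutes after each failure, while a select against time.After(1 * time.Hour) decides whether to report success or set mcd_drain_err to 1 and fail (per the review comment, pulling in patch #2612 would shorten the first few retry intervals). The standalone sketch below shows just that retry-until-deadline pattern with a fake operation; runWithRetryUntil and its parameters are illustrative names, not code from this PR.

package main

import (
	"errors"
	"fmt"
	"time"
)

// runWithRetryUntil keeps retrying op (sleeping retryDelay between failures)
// until it succeeds or the overall budget elapses, mirroring the
// goroutine + select/time.After structure used by drain() above.
func runWithRetryUntil(op func() error, retryDelay, budget time.Duration) error {
	done := make(chan struct{})      // tells the worker to stop after a timeout
	succeeded := make(chan struct{}) // closed when op finally succeeds

	go func() {
		for {
			select {
			case <-done:
				return
			default:
				if err := op(); err != nil {
					fmt.Printf("operation failed: %v, retrying in %v\n", err, retryDelay)
					time.Sleep(retryDelay)
					continue
				}
				close(succeeded)
				return
			}
		}
	}()

	select {
	case <-time.After(budget):
		close(done)
		return fmt.Errorf("operation did not succeed within %v", budget)
	case <-succeeded:
		return nil
	}
}

func main() {
	attempts := 0
	err := runWithRetryUntil(func() error {
		attempts++
		if attempts < 3 {
			return errors.New("simulated eviction failure")
		}
		return nil
	}, 50*time.Millisecond, 2*time.Second)
	fmt.Printf("finished after %d attempts, err=%v\n", attempts, err)
}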