Skip to content

Commit

Permalink
Apply exp. backoff on every runCheck error
Browse files Browse the repository at this point in the history
Exp. backoff should be used on all errors, incl. failed connections to
vCenter. Reorganize calculation of the next check to accommodate that.
  • Loading branch information
jsafrane committed May 5, 2021
1 parent 95e98ae commit df807c6
Showing 1 changed file with 11 additions and 7 deletions.
18 changes: 11 additions & 7 deletions pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,15 @@ func (c *vSphereProblemDetectorController) sync(ctx context.Context, syncCtx fac
availableCnd.Reason = "SyncFailed"
syncErrrorMetric.Set(1)
} else {
// Clean the error metric
syncErrrorMetric.Set(0)
}

// Poke the controller sync loop after the delay to re-run tests
queue := syncCtx.Queue()
queueKey := syncCtx.QueueKey()
c.nextCheck = c.lastCheck.Add(delay)
klog.V(2).Infof("Scheduled the next check in %s (%s)", delay, c.nextCheck)
time.AfterFunc(delay, func() {
queue.Add(queueKey)
})
Expand All @@ -162,9 +165,13 @@ func (c *vSphereProblemDetectorController) sync(ctx context.Context, syncCtx fac
}

func (c *vSphereProblemDetectorController) runChecks(ctx context.Context) (time.Duration, error) {
// pre-calculate exp. backoff on error
nextErrorDelay := c.backoff.Step()
c.lastCheck = time.Now()

vmConfig, vmClient, err := c.connect(ctx)
if err != nil {
return 0, err
return nextErrorDelay, err
}

checkContext := &check.CheckContext{
Expand All @@ -178,12 +185,12 @@ func (c *vSphereProblemDetectorController) runChecks(ctx context.Context) (time.
resultCollector := NewResultsCollector()
c.enqueueClusterChecks(checkContext, checkRunner, resultCollector)
if err := c.enqueueNodeChecks(checkContext, checkRunner, resultCollector); err != nil {
return 0, err
return nextErrorDelay, err
}

klog.V(4).Infof("Waiting for all checks")
if err := checkRunner.Wait(ctx); err != nil {
return 0, err
return nextErrorDelay, err
}
c.finishNodeChecks(checkContext)

Expand All @@ -192,20 +199,17 @@ func (c *vSphereProblemDetectorController) runChecks(ctx context.Context) (time.
results, checksFailed := resultCollector.Collect()
c.reportResults(results)
c.lastResults = results
c.lastCheck = time.Now()
var nextDelay time.Duration
if checksFailed {
// Use exponential backoff
nextDelay = c.backoff.Step()
nextDelay = nextErrorDelay
} else {
// Reset the backoff on success
c.backoff = defaultBackoff
// Delay after success is after the maximum backoff
// (i.e. retry as slow as allowed).
nextDelay = defaultBackoff.Cap
}
c.nextCheck = c.lastCheck.Add(nextDelay)
klog.V(2).Infof("Scheduled the next check in %s (%s)", nextDelay, c.nextCheck)
return nextDelay, nil
}

Expand Down

0 comments on commit df807c6

Please sign in to comment.