Skip to content

Commit

Permalink
Don't degrade cluster on connection error
Browse files Browse the repository at this point in the history
Clusters that do not have correct credentials to vCenter should not get
degraded when vsphere-problem-detector cannot connect to it.

Instead, keep the cluster Available=true and only report a new metric +
alert on it.

Add the error message to Available=true condition, so it can be found
without digging through logs.
  • Loading branch information
jsafrane committed Apr 30, 2021
1 parent c32cbf1 commit e4fc8cf
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 7 deletions.
9 changes: 9 additions & 0 deletions pkg/operator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,20 @@ var (
},
[]string{checkNameLabel, nodeNameLabel},
)

syncErrrorMetric = metrics.NewGauge(
&metrics.GaugeOpts{
Name: "sync_errors",
Help: "Indicates failing vSphere problem detector sync error. Value 1 means that the last sync failed.",
StabilityLevel: metrics.ALPHA,
},
)
)

// init registers this package's metrics with legacyregistry. Go runs init
// automatically when the package is initialized, so no explicit call is
// needed.
// NOTE(review): MustRegister presumably panics when a registration fails
// (e.g. duplicate metric name) — confirm against k8s.io/component-base.
func init() {
legacyregistry.MustRegister(clusterCheckTotalMetric)
legacyregistry.MustRegister(clusterCheckErrrorMetric)
legacyregistry.MustRegister(nodeCheckTotalMetric)
legacyregistry.MustRegister(nodeCheckErrrorMetric)
// Gauge named "sync_errors"; per its Help text, value 1 means the last sync failed.
legacyregistry.MustRegister(syncErrrorMetric)
}
22 changes: 15 additions & 7 deletions pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,26 @@ func (c *vSphereProblemDetectorController) sync(ctx context.Context, syncCtx fac
return err
}

availableCnd := operatorapi.OperatorCondition{
Type: controllerName + operatorapi.OperatorStatusTypeAvailable,
Status: operatorapi.ConditionTrue,
}

// TODO: Run in a separate goroutine? We may not want to run time-consuming checks here.
if platformSupported && time.Now().After(c.nextCheck) {
delay, err := c.runChecks(ctx)
if err != nil {
// This sets VSphereProblemDetectorControllerDegraded condition
return err
// Do not return the error, it would degrade the whole cluster.
// Keep the operator Available=true, but give it a specific message.
klog.Errorf("Failed to run checks: %s", err)
// E.g.: "failed to connect to vcenter.example.com: ServerFaultCode: Cannot complete login due to an incorrect user name or password."
availableCnd.Message = err.Error()
availableCnd.Reason = "SyncFailed"
syncErrrorMetric.Set(1)
} else {
syncErrrorMetric.Set(0)
}

// Poke the controller sync loop after the delay to re-run tests
queue := syncCtx.Queue()
queueKey := syncCtx.QueueKey()
Expand All @@ -126,11 +139,6 @@ func (c *vSphereProblemDetectorController) sync(ctx context.Context, syncCtx fac
})
}

availableCnd := operatorapi.OperatorCondition{
Type: controllerName + operatorapi.OperatorStatusTypeAvailable,
Status: operatorapi.ConditionTrue,
}

if _, _, updateErr := v1helpers.UpdateStatus(c.operatorClient,
v1helpers.UpdateConditionFn(availableCnd),
); updateErr != nil {
Expand Down

0 comments on commit e4fc8cf

Please sign in to comment.