Skip to content

Commit

Permalink
operator: refactor sync functions, add retry to applyManifests
Browse files Browse the repository at this point in the history
Refactored ApplyManifests by hoisting the waitForDaemonsetRollout call up into the syncMachineConfigxxx functions, which is more consistent with the other sync functions. This also makes the retry behavior cleaner, since the final wait timeout is no longer multiplied by the retry count. Retries take place only on RPC errors and resource conflicts; all other errors cause an immediate degrade. The operator will no longer report Available=False in any case.
  • Loading branch information
djoshy committed Mar 11, 2024
1 parent a073953 commit f648930
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 131 deletions.
33 changes: 9 additions & 24 deletions pkg/operator/status.go
Expand Up @@ -96,7 +96,7 @@ func (optr *Operator) syncRelatedObjects() error {
}

// syncAvailableStatus applies the new condition to the mco's ClusterOperator object.
func (optr *Operator) syncAvailableStatus(ierr syncError) error {
func (optr *Operator) syncAvailableStatus() error {
co, err := optr.fetchClusterOperator()
if err != nil {
return err
Expand All @@ -105,32 +105,17 @@ func (optr *Operator) syncAvailableStatus(ierr syncError) error {
return nil
}

message := fmt.Sprintf("Cluster has deployed %s", co.Status.Versions)
available := configv1.ConditionTrue
reason := asExpectedReason

// we will only be Available = False where there is a problem syncing
// operands of the MCO as that points to impaired operator functionality.
// RequiredPools failing but everything else being ok, should be just Degraded = True.
if ierr.err != nil && ierr.task != "RequiredPools" {
available = configv1.ConditionFalse
mcoObjectRef := &corev1.ObjectReference{
Kind: co.Kind,
Name: co.Name,
Namespace: co.Namespace,
UID: co.GetUID(),
}

reason = taskFailed(ierr.task)
message = fmt.Sprintf("Cluster not available for %s: %s", co.Status.Versions, ierr.err.Error())
optr.eventRecorder.Eventf(mcoObjectRef, corev1.EventTypeWarning, reason, message)
}
// Based on Openshift Operator Guidance, Available = False is only necessary
// if a midnight admin page is required. In the MCO land, nothing quite reaches
// that level of severity. Most MCO errors typically fall into the degrade category
// (which imply a working hours admin page)
// See https://issues.redhat.com/browse/OCPBUGS-9108 for more information.

coStatus := configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorAvailable,
Status: available,
Message: message,
Reason: reason,
Status: configv1.ConditionTrue,
Message: fmt.Sprintf("Cluster has deployed %s", co.Status.Versions),
Reason: asExpectedReason,
}

return optr.updateStatus(co, coStatus)
Expand Down
26 changes: 13 additions & 13 deletions pkg/operator/status_test.go
Expand Up @@ -250,8 +250,8 @@ func TestOperatorSyncStatus(t *testing.T) {
},
{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionFalse,
Reason: "fn1Failed",
Status: configv1.ConditionTrue,
Reason: asExpectedReason,
},
{
Type: configv1.OperatorProgressing,
Expand Down Expand Up @@ -398,7 +398,7 @@ func TestOperatorSyncStatus(t *testing.T) {
},
},
},
// 3. test that if progressing fails, we report available=false because state of the operator
// 3. test that if progressing fails, we report degraded=true because state of the operator
// might have changed in the various sync calls
{
syncs: []syncCase{
Expand Down Expand Up @@ -439,8 +439,8 @@ func TestOperatorSyncStatus(t *testing.T) {
},
{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionFalse,
Reason: "fn1Failed",
Status: configv1.ConditionTrue,
Reason: asExpectedReason,
},
{
Type: configv1.OperatorDegraded,
Expand All @@ -463,7 +463,7 @@ func TestOperatorSyncStatus(t *testing.T) {
},
},
},
// 4. test that if progressing fails during bringup, we still report degraded and not available
// 4. test that if progressing fails during bringup, we still report degraded
{
syncs: []syncCase{
{
Expand All @@ -475,8 +475,8 @@ func TestOperatorSyncStatus(t *testing.T) {
},
{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionFalse,
Reason: "fn1Failed",
Status: configv1.ConditionTrue,
Reason: asExpectedReason,
},
{
Type: configv1.OperatorDegraded,
Expand Down Expand Up @@ -507,8 +507,8 @@ func TestOperatorSyncStatus(t *testing.T) {
},
{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionFalse,
Reason: "fn1Failed",
Status: configv1.ConditionTrue,
Reason: asExpectedReason,
},
{
Type: configv1.OperatorDegraded,
Expand All @@ -531,7 +531,7 @@ func TestOperatorSyncStatus(t *testing.T) {
},
},
},
// 5. test status flipping between available and degraded
// 5. test status flipping between not degraded and degraded
{
syncs: []syncCase{
{
Expand Down Expand Up @@ -571,8 +571,8 @@ func TestOperatorSyncStatus(t *testing.T) {
},
{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionFalse,
Reason: "fn1Failed",
Status: configv1.ConditionTrue,
Reason: asExpectedReason,
},
{
Type: configv1.OperatorDegraded,
Expand Down

0 comments on commit f648930

Please sign in to comment.