diff --git a/pkg/cvo/internal/operatorstatus.go b/pkg/cvo/internal/operatorstatus.go index d013905d8..256bb7dcd 100644 --- a/pkg/cvo/internal/operatorstatus.go +++ b/pkg/cvo/internal/operatorstatus.go @@ -201,14 +201,20 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp } } - // during initialization we allow degraded - if degraded && mode != resourcebuilder.InitializingMode { + if degraded { if degradedCondition != nil && len(degradedCondition.Message) > 0 { nestedMessage = fmt.Errorf("cluster operator %s is %s=%s: %s, %s", actual.Name, degradedCondition.Type, degradedCondition.Status, degradedCondition.Reason, degradedCondition.Message) } + var updateEffect payload.UpdateEffectType + + if mode == resourcebuilder.InitializingMode { + updateEffect = payload.UpdateEffectReport + } else { + updateEffect = payload.UpdateEffectFailAfterInterval + } return &payload.UpdateError{ Nested: nestedMessage, - UpdateEffect: payload.UpdateEffectFailAfterInterval, + UpdateEffect: updateEffect, Reason: "ClusterOperatorDegraded", PluralReason: "ClusterOperatorsDegraded", Message: fmt.Sprintf("Cluster operator %s is degraded", actual.Name), diff --git a/pkg/cvo/internal/operatorstatus_test.go b/pkg/cvo/internal/operatorstatus_test.go index 6a714f045..875540ded 100644 --- a/pkg/cvo/internal/operatorstatus_test.go +++ b/pkg/cvo/internal/operatorstatus_test.go @@ -439,6 +439,42 @@ func Test_checkOperatorHealth(t *testing.T) { PluralMessageFormat: "Cluster operators %s are degraded", Name: "test-co", }, + }, { + name: "cluster operator reporting available=true degraded=true in InitializingMode", + actual: &configv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{Name: "test-co"}, + Status: configv1.ClusterOperatorStatus{ + Versions: []configv1.OperandVersion{{ + Name: "operator", Version: "v1", + }, { + Name: "operand-1", Version: "v1", + }}, + Conditions: []configv1.ClusterOperatorStatusCondition{ + {Type: configv1.OperatorAvailable, Status: configv1.ConditionTrue}, + {Type: configv1.OperatorDegraded, Status: configv1.ConditionTrue, Reason: "RandomReason", Message: "random error"}, + }, + }, + }, + mode: resourcebuilder.InitializingMode, + exp: &configv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{Name: "test-co"}, + Status: configv1.ClusterOperatorStatus{ + Versions: []configv1.OperandVersion{{ + Name: "operator", Version: "v1", + }, { + Name: "operand-1", Version: "v1", + }}, + }, + }, + expErr: &payload.UpdateError{ + Nested: fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"), + UpdateEffect: payload.UpdateEffectReport, + Reason: "ClusterOperatorDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operator test-co is degraded", + PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co", + }, }, { name: "cluster operator reporting available=true progressing=true degraded=true", actual: &configv1.ClusterOperator{ @@ -475,6 +511,43 @@ func Test_checkOperatorHealth(t *testing.T) { PluralMessageFormat: "Cluster operators %s are degraded", Name: "test-co", }, + }, { + name: "cluster operator reporting available=true progressing=true degraded=true in InitializingMode", + actual: &configv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{Name: "test-co"}, + Status: configv1.ClusterOperatorStatus{ + Versions: []configv1.OperandVersion{{ + Name: "operator", Version: "v1", + }, { + Name: "operand-1", Version: "v1", + }}, + Conditions: []configv1.ClusterOperatorStatusCondition{ + {Type: configv1.OperatorAvailable, Status: configv1.ConditionTrue}, + {Type: configv1.OperatorProgressing, Status: configv1.ConditionTrue}, + {Type: configv1.OperatorDegraded, Status: configv1.ConditionTrue, Reason: "RandomReason", Message: "random error"}, + }, + }, + }, + mode: resourcebuilder.InitializingMode, + exp: &configv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{Name: "test-co"}, + Status: configv1.ClusterOperatorStatus{ + Versions: []configv1.OperandVersion{{ + Name: "operator", Version: "v1", + }, { + Name: "operand-1", Version: "v1", + }}, + }, + }, + expErr: &payload.UpdateError{ + Nested: fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"), + UpdateEffect: payload.UpdateEffectReport, + Reason: "ClusterOperatorDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operator test-co is degraded", + PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co", + }, }, { name: "cluster operator reporting available=true no progressing or degraded", actual: &configv1.ClusterOperator{ diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 047f5153a..b7969d5fa 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -430,8 +430,9 @@ func setDesiredReleaseAcceptedCondition(config *configv1.ClusterVersion, status // convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as // still making internal progress. The general error we try to suppress is an operator or operators still being // progressing AND the general payload task making progress towards its goal. The error's UpdateEffect determines -// whether an error should be considered a failure and, if so, whether the operator should be given up to 40 minutes -// to recover from the error. +// how an update error is interpreted. An error may simply need to be reported but does not indicate the update is +// failing. An error may indicate the update is failing or that if the error continues for a defined interval the +// update is failing. func convertErrorToProgressing(history []configv1.UpdateHistory, now time.Time, status *SyncWorkerStatus) (reason string, message string, ok bool) { if len(history) == 0 || status.Failure == nil || status.Reconciling { return "", "", false @@ -441,6 +442,8 @@ func convertErrorToProgressing(history []configv1.UpdateHistory, now time.Time, return "", "", false } switch uErr.UpdateEffect { + case payload.UpdateEffectReport: + return uErr.Reason, uErr.Error(), false case payload.UpdateEffectNone: return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true case payload.UpdateEffectFail: diff --git a/pkg/cvo/sync_worker.go b/pkg/cvo/sync_worker.go index 01e51e0b8..348f01adb 100644 --- a/pkg/cvo/sync_worker.go +++ b/pkg/cvo/sync_worker.go @@ -920,6 +920,8 @@ func (w *SyncWorker) apply(ctx context.Context, work *SyncWork, maxWorkers int, } capabilities := capability.GetCapabilitiesStatus(work.Capabilities) + var reportEffectErrors []error + // update each object errs := payload.RunGraph(ctx, graph, maxWorkers, func(ctx context.Context, tasks []*payload.Task) error { // in specific modes, attempt to precreate a set of known types (currently ClusterOperator) without @@ -967,7 +969,12 @@ func (w *SyncWorker) apply(ctx context.Context, work *SyncWork, maxWorkers int, continue } if err := task.Run(ctx, payloadUpdate.Release.Version, w.builder, work.State); err != nil { - return err + if uErr, ok := err.(*payload.UpdateError); ok && uErr.UpdateEffect == payload.UpdateEffectReport { + // do not fail the task on this manifest, just record it for later complaining + reportEffectErrors = append(reportEffectErrors, err) + } else { + return err + } } cr.Inc() klog.V(2).Infof("Done syncing for %s", task) @@ -983,7 +990,7 @@ func (w *SyncWorker) apply(ctx context.Context, work *SyncWork, maxWorkers int, // update the status cr.Complete() - return nil + return apierrors.NewAggregate(reportEffectErrors) } var ( @@ -1173,7 +1180,7 @@ func condenseClusterOperators(errs []error) []error { } nested := make([]error, 0, len(reasonErrors)) names := make([]string, 0, len(reasonErrors)) - updateEffect := payload.UpdateEffectNone + updateEffect := payload.UpdateEffectReport for _, err := range reasonErrors { nested = append(nested, err) if len(err.Name) > 0 { @@ -1181,13 +1188,17 @@ func condenseClusterOperators(errs []error) []error { } switch err.UpdateEffect { + case payload.UpdateEffectReport: case payload.UpdateEffectNone: - case payload.UpdateEffectFail: - updateEffect = payload.UpdateEffectFail + if updateEffect == payload.UpdateEffectReport { + updateEffect = payload.UpdateEffectNone + } case payload.UpdateEffectFailAfterInterval: if updateEffect != payload.UpdateEffectFail { updateEffect = payload.UpdateEffectFailAfterInterval } + case payload.UpdateEffectFail: + updateEffect = payload.UpdateEffectFail } } sort.Strings(names) diff --git a/pkg/payload/task.go b/pkg/payload/task.go index 8600db1c1..3b9ebce8a 100644 --- a/pkg/payload/task.go +++ b/pkg/payload/task.go @@ -147,6 +147,10 @@ func (st *Task) Run(ctx context.Context, version string, builder ResourceBuilder type UpdateEffectType string const ( + // UpdateEffectReport defines an error that requires reporting but does not + // block reconciliation from completing. + UpdateEffectReport UpdateEffectType = "Report" + // UpdateEffectNone defines an error as having no affect on the update state. UpdateEffectNone UpdateEffectType = "None"