Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OTA-362: pkg/monitortests/clusterversionoperator: Fatal unless Available=False in allow-list #27231

Merged
merged 17 commits into from Dec 3, 2023
Merged
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
51fda72
pkg/monitortests/clusterversionoperator: Fatal unless Available=False…
wking Jun 7, 2022
d4c85ef
pkg/monitortests/clusterversionoperator: Add an exception for authent…
wking Oct 3, 2023
d7e13ad
pkg/monitortests/clusterversionoperator: Add an exception for control…
wking Oct 3, 2023
c1ae513
pkg/monitortests/clusterversionoperator: Add an exception for kube-st…
wking Oct 3, 2023
c08a0d5
pkg/monitortests/clusterversionoperator: Allow Available=False for no…
wking Oct 6, 2023
ef2a6f9
pkg/monitortests/clusterversionoperator/legacycvomonitortests: Pivot …
wking Nov 21, 2023
9da7989
pkg/monitortests/clusterversionoperator: Add an exception for machine…
wking Nov 22, 2023
ae668ee
pkg/monitortests/clusterversionoperator: Add an exception for operato…
wking Nov 23, 2023
6e79269
pkg/monitortests/clusterversionoperator: Add an exception for monitor…
wking Nov 23, 2023
53f239e
pkg/monitortests/clusterversionoperator: Add an exception for openshi…
wking Nov 23, 2023
a8ec3fe
pkg/monitortests/clusterversionoperator/legacycvomonitortests: Return…
wking Nov 28, 2023
ca46d5d
pkg/monitortests/clusterversionoperator/legacycvomonitortests: Expand…
wking Nov 28, 2023
5573b55
pkg/monitortests/clusterversionoperator/legacycvomonitortests: Expand…
wking Nov 28, 2023
b319118
pkg/monitortests/clusterversionoperator: Add an exception for console…
wking Nov 28, 2023
ba523b5
pkg/monitortests/clusterversionoperator/legacycvomonitortests: Expand…
wking Nov 29, 2023
c842f63
pkg/monitortests/clusterversionoperator/legacycvomonitortests: Even m…
wking Nov 29, 2023
0b4bb1d
pkg/monitortests/clusterversionoperator: Add an exception for machine…
wking Nov 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -16,14 +16,89 @@ import (
"k8s.io/client-go/rest"
)

// exceptionCallback consumes a suspicious condition and returns an
// exception string if does not think the condition should be fatal.
type exceptionCallback func(operator string, condition *configv1.ClusterOperatorStatusCondition) (string, error)

func testStableSystemOperatorStateTransitions(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded})
except := func(_ string, condition *configv1.ClusterOperatorStatusCondition) (string, error) {
if condition.Status == configv1.ConditionTrue {
if condition.Type == configv1.OperatorAvailable {
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
}
} else if condition.Status == configv1.ConditionFalse {
if condition.Type == configv1.OperatorDegraded {
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
}
}

return "We are not worried about Available=False or Degraded=True blips for stable-system tests yet.", nil
}

return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except)
}

func testUpgradeOperatorStateTransitions(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded})
except := func(operator string, condition *configv1.ClusterOperatorStatusCondition) (string, error) {
if condition.Status == configv1.ConditionTrue {
if condition.Type == configv1.OperatorAvailable {
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
}
} else if condition.Status == configv1.ConditionFalse {
if condition.Type == configv1.OperatorDegraded {
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
}
}

if condition.Type == configv1.OperatorDegraded {
return "We are not worried about Degraded=True blips for update tests yet.", nil
}

switch operator {
case "authentication":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && (condition.Reason == "APIServices_Error" || condition.Reason == "APIServerDeployment_NoDeployment" || condition.Reason == "APIServerDeployment_NoPod" || condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" || condition.Reason == "APIServices_PreconditionNotReady" || condition.Reason == "OAuthServerDeployment_NoDeployment" || condition.Reason == "OAuthServerRouteEndpointAccessibleController_EndpointUnavailable" || condition.Reason == "OAuthServerServiceEndpointAccessibleController_EndpointUnavailable" || condition.Reason == "WellKnown_NotReady") {
return "https://issues.redhat.com/browse/OCPBUGS-20056", nil
}
case "console":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && (condition.Reason == "RouteHealth_FailedGet" || condition.Reason == "RouteHealth_RouteNotAdmitted" || condition.Reason == "RouteHealth_StatusError") {
return "https://issues.redhat.com/browse/OCPBUGS-24041", nil
}
case "control-plane-machine-set":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "UnavailableReplicas" {
return "https://issues.redhat.com/browse/OCPBUGS-20061", nil
}
case "kube-storage-version-migrator":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "KubeStorageVersionMigrator_Deploying" {
return "https://issues.redhat.com/browse/OCPBUGS-20062", nil
}
case "machine-config":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "MachineConfigControllerFailed" && strings.Contains(condition.Message, "notAfter: Required value") {
return "https://issues.redhat.com/browse/OCPBUGS-22364", nil
}
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && strings.Contains(condition.Message, "missing HTTP content-type") {
return "https://issues.redhat.com/browse/OCPBUGS-24228", nil
}
case "monitoring":
if condition.Type == configv1.OperatorAvailable && (condition.Status == configv1.ConditionFalse && (condition.Reason == "PlatformTasksFailed" || condition.Reason == "UpdatingAlertmanagerFailed" || condition.Reason == "UpdatingConsolePluginComponentsFailed" || condition.Reason == "UpdatingPrometheusK8SFailed" || condition.Reason == "UpdatingPrometheusOperatorFailed")) || (condition.Status == configv1.ConditionUnknown && condition.Reason == "UpdatingPrometheusFailed") {
return "https://issues.redhat.com/browse/OCPBUGS-23745", nil
}
case "openshift-apiserver":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && (condition.Reason == "APIServerDeployment_NoDeployment" || condition.Reason == "APIServerDeployment_NoPod" || condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" || condition.Reason == "APIServices_Error") {
return "https://issues.redhat.com/browse/OCPBUGS-23746", nil
}
case "operator-lifecycle-manager-packageserver":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "ClusterServiceVersionNotSucceeded" {
return "https://issues.redhat.com/browse/OCPBUGS-23744", nil
}
}

return "", nil
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are moving to structured intervals and this kind of regex matching has gotten us into trouble in other places in that effort. The intervals backing this test have not yet been ported but I can make that my next priority, I'll post here as soon as I can get that in. You'll need to do some kind of func based matching perhaps that can examine the type/fields of an interval for a match.

Big +1 in pinning those we know have problems. The test results are pretty striking.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://issues.redhat.com/browse/TRT-1314 I'll try to get to this this week.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like these were actually already ported, just not at the point you last ran tests in here, or at least not in the job I looked at.

A new interval for this would look like:

        {
            "level": "Error",
            "locator": "clusteroperator/control-plane-machine-set",
            "message": "condition/Available reason/UnavailableReplicas status/False Missing 1 available replica(s)",
            "tempSource": "ClusterOperatorMonitor",
            "tempStructuredLocator": {
                "type": "ClusterOperator",
                "keys": {
                    "clusteroperator": "control-plane-machine-set"
                }
            },
            "tempStructuredMessage": {
                "reason": "UnavailableReplicas",
                "cause": "",
                "humanMessage": "Missing 1 available replica(s)",
                "annotations": {
                    "condition": "Available",
                    "reason": "UnavailableReplicas",
                    "status": "False"
                }
            },
            "from": "2023-10-18T19:40:38Z",
            "to": "2023-10-18T19:40:38Z"
        },

You'll want to match on the StructuredLocator.Type, StructuredLocator.Keys["clusteroperator"], StructuredMessageAnnotations condition+status+reason. I know reason is duplicated in there and I'm not sure which we keep, but we'll figure that out soon, I suspect it'll be the annotation not the actual field. Locator Keys and message Annotations should all have constants in monitorapi/types.go.


return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except)
}
func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType) []*junitapi.JUnitTestCase {

func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback) []*junitapi.JUnitTestCase {
ret := []*junitapi.JUnitTestCase{}

var start, stop time.Time
Expand All @@ -39,13 +114,13 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []

eventsByOperator := getEventsByOperator(events)
e2eEventIntervals := operatorstateanalyzer.E2ETestEventIntervals(events)
for _, condition := range conditionTypes {
for _, conditionType := range conditionTypes {
for _, operatorName := range platformidentification.KnownOperators.List() {
bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
if bzComponent == "Unknown" {
bzComponent = operatorName
}
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should not change condition/%v", bzComponent, operatorName, condition)
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should not change condition/%v", bzComponent, operatorName, conditionType)
operatorEvents := eventsByOperator[operatorName]
if len(operatorEvents) == 0 {
ret = append(ret, &junitapi.JUnitTestCase{
Expand All @@ -55,19 +130,78 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
continue
}

failures := testOperatorState(condition, operatorEvents, e2eEventIntervals)
if len(failures) > 0 {
excepted := []string{}
fatal := []string{}

for _, eventInterval := range operatorEvents {
condition := monitorapi.GetOperatorConditionStatus(eventInterval)
if condition == nil {
continue // ignore non-condition intervals
}
if len(condition.Type) == 0 {
fatal = append(fatal, fmt.Sprintf("failed to convert %v into a condition with a type", eventInterval))
}

if condition.Type != conditionType {
continue
}

// if there was any switch, it was wrong/unexpected at some point
failure := fmt.Sprintf("%v", eventInterval)

overlappingE2EIntervals := operatorstateanalyzer.FindOverlap(e2eEventIntervals, eventInterval.From, eventInterval.From)
concurrentE2E := []string{}
for _, overlap := range overlappingE2EIntervals {
if overlap.Level == monitorapi.Info {
continue
}
e2eTest, ok := monitorapi.E2ETestFromLocator(overlap.StructuredLocator)
if !ok {
continue
}
concurrentE2E = append(concurrentE2E, fmt.Sprintf("%v", e2eTest))
}

if len(concurrentE2E) > 0 {
failure = fmt.Sprintf("%s\n%d tests failed during this blip (%v to %v): %v", failure, len(concurrentE2E), eventInterval.From, eventInterval.From, strings.Join(concurrentE2E, "\n"))
}

exception, err := except(operatorName, condition)
if err != nil || exception == "" {
fatal = append(fatal, failure)
} else {
excepted = append(excepted, fmt.Sprintf("%s (exception: %s)", failure, exception))
}
}

output := fmt.Sprintf("%d unexpected clusteroperator state transitions during e2e test run", len(fatal))
if len(fatal) > 0 {
output = fmt.Sprintf("%s. These did not match any known exceptions, so they cause this test-case to fail:\n\n%v\n", output, strings.Join(fatal, "\n"))
} else {
output = fmt.Sprintf("%s, as desired.", output)
}
output = fmt.Sprintf("%s\n%d unwelcome but acceptable clusteroperator state transitions during e2e test run", output, len(excepted))
if len(excepted) > 0 {
output = fmt.Sprintf("%s. These should not happen, but because they are tied to exceptions, the fact that they did happen is not sufficient to cause this test-case to fail:\n\n%v\n", output, strings.Join(excepted, "\n"))
} else {
output = fmt.Sprintf("%s, as desired.", output)
}

if len(fatal) > 0 || len(excepted) > 0 {
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
Duration: duration,
SystemOut: strings.Join(failures, "\n"),
SystemOut: output,
FailureOutput: &junitapi.FailureOutput{
Output: fmt.Sprintf("%d unexpected clusteroperator state transitions during e2e test run \n\n%v", len(failures), strings.Join(failures, "\n")),
Output: output,
},
})
}
// always add a success so we flake and not fail
ret = append(ret, &junitapi.JUnitTestCase{Name: testName})

if len(fatal) == 0 {
// add a success so we flake (or pass) and don't fail
ret = append(ret, &junitapi.JUnitTestCase{Name: testName})
}
}
}

Expand Down Expand Up @@ -267,44 +401,3 @@ func getEventsByOperator(events monitorapi.Intervals) map[string]monitorapi.Inte
}
return eventsByClusterOperator
}

func testOperatorState(interestingCondition configv1.ClusterStatusConditionType, eventIntervals monitorapi.Intervals, e2eEventIntervals monitorapi.Intervals) []string {
failures := []string{}

for _, eventInterval := range eventIntervals {
// ignore non-interval eventInterval intervals
if eventInterval.From == eventInterval.To {
continue
}

condition := monitorapi.GetOperatorConditionStatus(eventInterval)
if condition == nil {
continue
}

if condition.Type != interestingCondition {
continue
}

// if there was any switch, it was wrong/unexpected at some point
failures = append(failures, fmt.Sprintf("%v", eventInterval))

overlappingE2EIntervals := operatorstateanalyzer.FindOverlap(e2eEventIntervals, eventInterval.From, eventInterval.From)
concurrentE2E := []string{}
for _, overlap := range overlappingE2EIntervals {
if overlap.Level == monitorapi.Info {
continue
}
e2eTest, ok := monitorapi.E2ETestFromLocator(overlap.StructuredLocator)
if !ok {
continue
}
concurrentE2E = append(concurrentE2E, fmt.Sprintf("%v", e2eTest))
}

if len(concurrentE2E) > 0 {
failures = append(failures, fmt.Sprintf("%d tests failed during this blip (%v to %v): %v", len(concurrentE2E), eventInterval.From, eventInterval.From, strings.Join(concurrentE2E, "\n")))
}
}
return failures
}