diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index a9c15f72e122..25f7789a3ea4 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -223,10 +223,6 @@ var KnownEventsBugs = []KnownProblem{ Regexp: regexp.MustCompile("ns/openshift-etcd-operator namespace/openshift-etcd-operator -.*rpc error: code = Canceled desc = grpc: the client connection is closing.*"), BZ: "https://bugzilla.redhat.com/show_bug.cgi?id=2006975", }, - { - Regexp: regexp.MustCompile("reason/TopologyAwareHintsDisabled"), - BZ: "https://issues.redhat.com/browse/OCPBUGS-13366", - }, { Regexp: regexp.MustCompile("ns/.*reason/.*APICheckFailed.*503.*"), BZ: "https://bugzilla.redhat.com/show_bug.cgi?id=2017435", diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events.go index 941b264212ca..8947dd27b087 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events.go @@ -6,6 +6,7 @@ import ( "regexp" "strconv" "strings" + "time" "github.com/sirupsen/logrus" e2e "k8s.io/kubernetes/test/e2e/framework" @@ -235,6 +236,13 @@ func (d duplicateEventsEvaluator) testDuplicatedEvents(testName string, flakeOnl }) logrus.Infof("found %d NodeUpdate intervals", len(nodeUpdateIntervals)) + type timeRange struct { + from time.Time + to time.Time + } + buildTopologyHintAllowedTimeRanges := true + topologyHintAllowedTimeRanges := []*timeRange{} + displayToCount := map[string]*pathologicalEvents{} for _, event := range events { // TODO: port to use structured message reason once kube event intervals are ported over @@ -252,9 +260,73 @@ func (d duplicateEventsEvaluator) testDuplicatedEvents(testName string, flakeOnl continue } } + if strings.Contains(event.Message, "reason/TopologyAwareHintsDisabled") { + // Build the allowed time range only once + if buildTopologyHintAllowedTimeRanges { + taintManagerTestIntervals := events.Filter(func(eventInterval monitorapi.Interval) bool { + return eventInterval.Source == monitorapi.SourceE2ETest && + strings.Contains(eventInterval.StructuredLocator.Keys[monitorapi.LocatorE2ETestKey], "NoExecuteTaintManager") + }) + // Start the allowed time range from time range of the tests. But events lag behind the tests since the tests do not wait + // until all dns pods are properly scheduled and reach ready state. So we will need to expand the allowed time range after. + for _, test := range taintManagerTestIntervals { + topologyHintAllowedTimeRanges = append(topologyHintAllowedTimeRanges, &timeRange{from: test.From, to: test.To}) + logrus.WithField("from", test.From).WithField("to", test.To).Infof("found time range for test: %s", testName) + } + dnsUpdateIntervals := events.Filter(func(eventInterval monitorapi.Interval) bool { + return eventInterval.Source == monitorapi.SourcePodState && + (eventInterval.StructuredLocator.Type == monitorapi.LocatorTypePod || eventInterval.StructuredLocator.Type == monitorapi.LocatorTypeContainer) && + eventInterval.StructuredLocator.Keys[monitorapi.LocatorNamespaceKey] == "openshift-dns" && + eventInterval.StructuredMessage.Annotations[monitorapi.AnnotationConstructed] == monitorapi.ConstructionOwnerPodLifecycle + }) + + // Now expand the allowed time range until the replacement dns pod gets ready + for _, r := range topologyHintAllowedTimeRanges { + var lastReadyTime time.Time + count := 0 + for _, interval := range dnsUpdateIntervals { + if interval.From.Before(r.from) { + continue + } + // If there is a GracefulDelete of dns-default pod, we will have to wait until the replacement dns container becomes ready + if interval.StructuredMessage.Reason == monitorapi.PodReasonGracefulDeleteStarted && + strings.Contains(interval.StructuredLocator.Keys[monitorapi.LocatorPodKey], "dns-default") { + count++ + } + if interval.StructuredMessage.Reason == monitorapi.ContainerReasonReady && + interval.StructuredLocator.Keys[monitorapi.LocatorContainerKey] == "dns" && count > 0 { + lastReadyTime = interval.From + count-- + } + if interval.From.After(r.to) && count == 0 { + if lastReadyTime.After(r.to) { + r.to = lastReadyTime + } + break + } + } + } + // Log final adjusted time ranges + for _, test := range taintManagerTestIntervals { + logrus.WithField("from", test.From).WithField("to", test.To).Infof("adjusted time range for test: %s", testName) + } + buildTopologyHintAllowedTimeRanges = false + } + // Filter out TopologyAwareHintsDisabled events within allowed time range + var allowed bool + for _, r := range topologyHintAllowedTimeRanges { + if r.from.Before(event.From) && r.to.After(event.To) { + logrus.Infof("%s was found to fall into the allowed time range %+v, ignoring pathological event as we expect these during NoExecuteTaintManager test", event, r) + allowed = true + break + } + } + if allowed { + continue + } + } eventDisplayMessage, times := GetTimesAnEventHappened(fmt.Sprintf("%s - %s", event.Locator, event.Message)) if times > DuplicateEventThreshold { - // If we marked this message earlier in recordAddOrUpdateEvent as interesting/true, we know it matched one of // the existing patterns or one of the AllowedRepeatedEventFns functions returned true. if strings.Contains(eventDisplayMessage, InterestingMark) { diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go index c5e17e0fbcd7..e148e359bc83 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go @@ -558,3 +558,173 @@ func TestMakeProbeTestEventsGroup(t *testing.T) { }) } } + +func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) { + evaluator := duplicateEventsEvaluator{ + allowedRepeatedEventPatterns: AllowedRepeatedEventPatterns, + knownRepeatedEventsBugs: []KnownProblem{}, + } + from := time.Unix(872827200, 0).In(time.UTC) + to := time.Unix(872827200, 0).In(time.UTC) + + tests := []struct { + name string + namespace string + platform v1.PlatformType + topology v1.TopologyMode + intervals []monitorapi.Interval + expectedMessage string + }{ + { + // This is ignored because the node is tainted by test + name: "ignore TopologyAwareHintsDisabled before dns container ready", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + StructuredLocator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeE2ETest, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorE2ETestKey: "[sig-node] NoExecuteTaintManager Single Pod [Serial] doesn't evict pod with tolerations from tainted nodes [Skipped:SingleReplicaTopology] [Suite:openshift/conformance/serial] [Suite:k8s]", + }, + }, + StructuredMessage: monitorapi.Message{}, + }, + Source: monitorapi.SourceE2ETest, + From: from.Add(-10 * time.Minute), + To: to.Add(10 * time.Minute), + }, + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + StructuredLocator: monitorapi.Locator{ + Type: monitorapi.LocatorTypePod, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorNamespaceKey: "openshift-dns", + monitorapi.LocatorPodKey: "dns-default-jq2qn", + }, + }, + StructuredMessage: monitorapi.Message{ + Reason: monitorapi.PodReasonGracefulDeleteStarted, + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationConstructed: "pod-lifecycle-constructor", + monitorapi.AnnotationReason: "GracefulDelete", + }, + }, + }, + Source: monitorapi.SourcePodState, + From: from.Add(-5 * time.Minute), + To: to.Add(1 * time.Minute), + }, + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + Message: "pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 (23 times)", + }, + From: from.Add(11 * time.Minute), + To: to.Add(12 * time.Minute), + }, + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + StructuredLocator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeContainer, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorNamespaceKey: "openshift-dns", + monitorapi.LocatorContainerKey: "dns", + monitorapi.LocatorPodKey: "dns-default-jq2qn", + }, + }, + StructuredMessage: monitorapi.Message{ + Reason: monitorapi.ContainerReasonReady, + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationConstructed: "pod-lifecycle-constructor", + monitorapi.AnnotationReason: "Ready", + }, + }, + }, + Source: monitorapi.SourcePodState, + From: from.Add(15 * time.Minute), + To: to.Add(16 * time.Minute), + }, + }, + namespace: "openshift-dns", + expectedMessage: "", + }, + { + // This is not ignored because there is no dns ready following + name: "fire TopologyAwareHintsDisabled when there is no dns container ready", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + StructuredLocator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeE2ETest, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorE2ETestKey: "[sig-node] NoExecuteTaintManager Single Pod [Serial] doesn't evict pod with tolerations from tainted nodes [Skipped:SingleReplicaTopology] [Suite:openshift/conformance/serial] [Suite:k8s]", + }, + }, + StructuredMessage: monitorapi.Message{}, + }, + Source: monitorapi.SourceE2ETest, + From: from.Add(-10 * time.Minute), + To: to.Add(10 * time.Minute), + }, + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + StructuredLocator: monitorapi.Locator{ + Type: monitorapi.LocatorTypePod, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorNamespaceKey: "openshift-dns", + monitorapi.LocatorPodKey: "dns-default-jq2qn", + }, + }, + StructuredMessage: monitorapi.Message{ + Reason: monitorapi.PodReasonGracefulDeleteStarted, + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationConstructed: "pod-lifecycle-constructor", + monitorapi.AnnotationReason: "GracefulDelete", + }, + }, + }, + Source: monitorapi.SourcePodState, + From: from.Add(-5 * time.Minute), + To: to.Add(1 * time.Minute), + }, + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + Locator: "ns/openshift-dns service/dns-default hmsg/ade328ddf3", + Message: "pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 (23 times)", + }, + From: from.Add(11 * time.Minute), + To: to.Add(12 * time.Minute), + }, + }, + namespace: "openshift-dns", + expectedMessage: "1 events happened too frequently\n\nevent happened 23 times, something is wrong: ns/openshift-dns service/dns-default hmsg/ade328ddf3 - pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 From: 04:11:00Z To: 04:12:00Z result=reject ", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + + events := monitorapi.Intervals(test.intervals) + + testName := "events should not repeat" + junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false) + jUnitName := getJUnitName(testName, test.namespace) + for _, junit := range junits { + if (junit.Name == jUnitName) && (test.expectedMessage != "") { + assert.Equal(t, test.expectedMessage, junit.FailureOutput.Output) + } else { + if !assert.Nil(t, junit.FailureOutput, "expected success but got failure output") { + t.Logf(junit.FailureOutput.Output) + } + } + } + + }) + } +} diff --git a/pkg/monitortests/testframework/watchevents/event_test.go b/pkg/monitortests/testframework/watchevents/event_test.go index e7422ec036c0..448fac72a0ad 100644 --- a/pkg/monitortests/testframework/watchevents/event_test.go +++ b/pkg/monitortests/testframework/watchevents/event_test.go @@ -98,25 +98,26 @@ func Test_recordAddOrUpdateEvent(t *testing.T) { expectedMessage: "pathological/true interesting/true reason/SomethingHappened Readiness probe failed (40 times)", }, { - name: "allowed pathological event with known bug", + name: "allowed pathological event with known bug (BZ 2000234)", args: args{ ctx: context.TODO(), m: monitor.NewRecorder(), kubeEvent: &corev1.Event{ Count: 40, - Reason: "TopologyAwareHintsDisabled", + Reason: "ns/openshift-etcd pod/etcd-quorum-guard-42 node/worker-42 - reason/Unhealthy", InvolvedObject: corev1.ObjectReference{ Kind: "Pod", - Namespace: "any", - Name: "any", + Namespace: "openshift-etcd", + Name: "etcd-quorum-guard-42", }, - Message: "irrelevant", + Message: "Readiness probe failed:", LastTimestamp: metav1.Now(), }, significantlyBeforeNow: now.UTC().Add(-15 * time.Minute), }, - expectedLocator: "ns/any pod/any hmsg/e13faa98ab", - expectedMessage: "pathological/true interesting/true reason/TopologyAwareHintsDisabled irrelevant (40 times)", + // hmsg in expectedLocator is the hash of the entire expectedMessage except the number of times + expectedLocator: "ns/openshift-etcd pod/etcd-quorum-guard-42 hmsg/9100aa725d", + expectedMessage: "pathological/true interesting/true reason/ns/openshift-etcd pod/etcd-quorum-guard-42 node/worker-42 - reason/Unhealthy Readiness probe failed: (40 times)", }, } for _, tt := range tests {