Merge pull request #28386 from xueqzhan/revert-28381-revert-28233-1699276048783

Revert "TRT-1339: Revert #28233 "ignore repeated TopologyAwareHintsDisabled events""
openshift-merge-bot[bot] committed Nov 10, 2023
2 parents 9aa824d + 89829e1 commit fa60863
Showing 4 changed files with 251 additions and 12 deletions.
@@ -223,10 +223,6 @@ var KnownEventsBugs = []KnownProblem{
Regexp: regexp.MustCompile("ns/openshift-etcd-operator namespace/openshift-etcd-operator -.*rpc error: code = Canceled desc = grpc: the client connection is closing.*"),
BZ: "https://bugzilla.redhat.com/show_bug.cgi?id=2006975",
},
{
Regexp: regexp.MustCompile("reason/TopologyAwareHintsDisabled"),
BZ: "https://issues.redhat.com/browse/OCPBUGS-13366",
},
{
Regexp: regexp.MustCompile("ns/.*reason/.*APICheckFailed.*503.*"),
BZ: "https://bugzilla.redhat.com/show_bug.cgi?id=2017435",
@@ -6,6 +6,7 @@ import (
"regexp"
"strconv"
"strings"
"time"

"github.com/sirupsen/logrus"
e2e "k8s.io/kubernetes/test/e2e/framework"
@@ -235,6 +236,13 @@ func (d duplicateEventsEvaluator) testDuplicatedEvents(testName string, flakeOnl
})
logrus.Infof("found %d NodeUpdate intervals", len(nodeUpdateIntervals))

type timeRange struct {
from time.Time
to time.Time
}
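// Allowed ranges for repeated TopologyAwareHintsDisabled events. They are built lazily, on the
// first such event seen below, from the NoExecuteTaintManager e2e test intervals and then
// widened using the openshift-dns pod lifecycle intervals.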
buildTopologyHintAllowedTimeRanges := true
topologyHintAllowedTimeRanges := []*timeRange{}

displayToCount := map[string]*pathologicalEvents{}
for _, event := range events {
// TODO: port to use structured message reason once kube event intervals are ported over
@@ -252,9 +260,73 @@ func (d duplicateEventsEvaluator) testDuplicatedEvents(testName string, flakeOnl
continue
}
}
if strings.Contains(event.Message, "reason/TopologyAwareHintsDisabled") {
// Build the allowed time range only once
if buildTopologyHintAllowedTimeRanges {
taintManagerTestIntervals := events.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Source == monitorapi.SourceE2ETest &&
strings.Contains(eventInterval.StructuredLocator.Keys[monitorapi.LocatorE2ETestKey], "NoExecuteTaintManager")
})
// Start the allowed time ranges from the time ranges of the tests. But events lag behind the tests, since the tests do not wait
// until all dns pods are properly scheduled and reach the ready state, so we will need to expand the allowed time ranges afterwards.
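// Hypothetical example: a NoExecuteTaintManager test runs 10:00-10:10, a dns-default pod is
// gracefully deleted at 10:08, and its replacement dns container only reports Ready at 10:14;
// the allowed range then grows from [10:00, 10:10] to [10:00, 10:14].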
for _, test := range taintManagerTestIntervals {
topologyHintAllowedTimeRanges = append(topologyHintAllowedTimeRanges, &timeRange{from: test.From, to: test.To})
logrus.WithField("from", test.From).WithField("to", test.To).Infof("found time range for test: %s", testName)
}
dnsUpdateIntervals := events.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Source == monitorapi.SourcePodState &&
(eventInterval.StructuredLocator.Type == monitorapi.LocatorTypePod || eventInterval.StructuredLocator.Type == monitorapi.LocatorTypeContainer) &&
eventInterval.StructuredLocator.Keys[monitorapi.LocatorNamespaceKey] == "openshift-dns" &&
eventInterval.StructuredMessage.Annotations[monitorapi.AnnotationConstructed] == monitorapi.ConstructionOwnerPodLifecycle
})

// Now expand the allowed time range until the replacement dns pod gets ready
for _, r := range topologyHintAllowedTimeRanges {
var lastReadyTime time.Time
count := 0
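// count tracks dns-default pods that started a graceful delete inside this range but whose
// replacement dns container has not yet reported Ready.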
for _, interval := range dnsUpdateIntervals {
if interval.From.Before(r.from) {
continue
}
// If there is a GracefulDelete of dns-default pod, we will have to wait until the replacement dns container becomes ready
if interval.StructuredMessage.Reason == monitorapi.PodReasonGracefulDeleteStarted &&
strings.Contains(interval.StructuredLocator.Keys[monitorapi.LocatorPodKey], "dns-default") {
count++
}
if interval.StructuredMessage.Reason == monitorapi.ContainerReasonReady &&
interval.StructuredLocator.Keys[monitorapi.LocatorContainerKey] == "dns" && count > 0 {
lastReadyTime = interval.From
count--
}
if interval.From.After(r.to) && count == 0 {
if lastReadyTime.After(r.to) {
r.to = lastReadyTime
}
break
}
}
}
// Log final adjusted time ranges
for _, r := range topologyHintAllowedTimeRanges {
logrus.WithField("from", r.from).WithField("to", r.to).Infof("adjusted time range for test: %s", testName)
}
buildTopologyHintAllowedTimeRanges = false
}
// Filter out TopologyAwareHintsDisabled events within allowed time range
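// An event is ignored only when it falls strictly inside one of the ranges
// (range start before event.From and range end after event.To).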
var allowed bool
for _, r := range topologyHintAllowedTimeRanges {
if r.from.Before(event.From) && r.to.After(event.To) {
logrus.Infof("%s was found to fall into the allowed time range %+v, ignoring pathological event as we expect these during NoExecuteTaintManager test", event, r)
allowed = true
break
}
}
if allowed {
continue
}
}
eventDisplayMessage, times := GetTimesAnEventHappened(fmt.Sprintf("%s - %s", event.Locator, event.Message))
if times > DuplicateEventThreshold {

// If we marked this message earlier in recordAddOrUpdateEvent as interesting/true, we know it matched one of
// the existing patterns or one of the AllowedRepeatedEventFns functions returned true.
if strings.Contains(eventDisplayMessage, InterestingMark) {
@@ -558,3 +558,173 @@ func TestMakeProbeTestEventsGroup(t *testing.T) {
})
}
}

func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
evaluator := duplicateEventsEvaluator{
allowedRepeatedEventPatterns: AllowedRepeatedEventPatterns,
knownRepeatedEventsBugs: []KnownProblem{},
}
from := time.Unix(872827200, 0).In(time.UTC)
to := time.Unix(872827200, 0).In(time.UTC)

tests := []struct {
name string
namespace string
platform v1.PlatformType
topology v1.TopologyMode
intervals []monitorapi.Interval
expectedMessage string
}{
{
// This is ignored because the node is tainted by the test
name: "ignore TopologyAwareHintsDisabled before dns container ready",
intervals: []monitorapi.Interval{
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypeE2ETest,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorE2ETestKey: "[sig-node] NoExecuteTaintManager Single Pod [Serial] doesn't evict pod with tolerations from tainted nodes [Skipped:SingleReplicaTopology] [Suite:openshift/conformance/serial] [Suite:k8s]",
},
},
StructuredMessage: monitorapi.Message{},
},
Source: monitorapi.SourceE2ETest,
From: from.Add(-10 * time.Minute),
To: to.Add(10 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypePod,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorNamespaceKey: "openshift-dns",
monitorapi.LocatorPodKey: "dns-default-jq2qn",
},
},
StructuredMessage: monitorapi.Message{
Reason: monitorapi.PodReasonGracefulDeleteStarted,
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationConstructed: "pod-lifecycle-constructor",
monitorapi.AnnotationReason: "GracefulDelete",
},
},
},
Source: monitorapi.SourcePodState,
From: from.Add(-5 * time.Minute),
To: to.Add(1 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Message: "pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 (23 times)",
},
From: from.Add(11 * time.Minute),
To: to.Add(12 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypeContainer,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorNamespaceKey: "openshift-dns",
monitorapi.LocatorContainerKey: "dns",
monitorapi.LocatorPodKey: "dns-default-jq2qn",
},
},
StructuredMessage: monitorapi.Message{
Reason: monitorapi.ContainerReasonReady,
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationConstructed: "pod-lifecycle-constructor",
monitorapi.AnnotationReason: "Ready",
},
},
},
Source: monitorapi.SourcePodState,
From: from.Add(15 * time.Minute),
To: to.Add(16 * time.Minute),
},
},
namespace: "openshift-dns",
expectedMessage: "",
},
{
// This is not ignored because no dns container becomes ready afterwards
name: "fire TopologyAwareHintsDisabled when there is no dns container ready",
intervals: []monitorapi.Interval{
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypeE2ETest,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorE2ETestKey: "[sig-node] NoExecuteTaintManager Single Pod [Serial] doesn't evict pod with tolerations from tainted nodes [Skipped:SingleReplicaTopology] [Suite:openshift/conformance/serial] [Suite:k8s]",
},
},
StructuredMessage: monitorapi.Message{},
},
Source: monitorapi.SourceE2ETest,
From: from.Add(-10 * time.Minute),
To: to.Add(10 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypePod,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorNamespaceKey: "openshift-dns",
monitorapi.LocatorPodKey: "dns-default-jq2qn",
},
},
StructuredMessage: monitorapi.Message{
Reason: monitorapi.PodReasonGracefulDeleteStarted,
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationConstructed: "pod-lifecycle-constructor",
monitorapi.AnnotationReason: "GracefulDelete",
},
},
},
Source: monitorapi.SourcePodState,
From: from.Add(-5 * time.Minute),
To: to.Add(1 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: "ns/openshift-dns service/dns-default hmsg/ade328ddf3",
Message: "pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 (23 times)",
},
From: from.Add(11 * time.Minute),
To: to.Add(12 * time.Minute),
},
},
namespace: "openshift-dns",
expectedMessage: "1 events happened too frequently\n\nevent happened 23 times, something is wrong: ns/openshift-dns service/dns-default hmsg/ade328ddf3 - pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 From: 04:11:00Z To: 04:12:00Z result=reject ",
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {

events := monitorapi.Intervals(test.intervals)

testName := "events should not repeat"
junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false)
jUnitName := getJUnitName(testName, test.namespace)
for _, junit := range junits {
if (junit.Name == jUnitName) && (test.expectedMessage != "") {
assert.Equal(t, test.expectedMessage, junit.FailureOutput.Output)
} else {
if !assert.Nil(t, junit.FailureOutput, "expected success but got failure output") {
t.Logf(junit.FailureOutput.Output)
}
}
}

})
}
}
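
A minimal way to exercise just this new test locally (the package path below is assumed from the library layout and may need adjusting):

    go test ./pkg/monitortestlibrary/pathologicaleventlibrary/... -run TestPathologicalEventsTopologyAwareHintsDisabled -v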
15 changes: 8 additions & 7 deletions pkg/monitortests/testframework/watchevents/event_test.go
@@ -98,25 +98,26 @@ func Test_recordAddOrUpdateEvent(t *testing.T) {
expectedMessage: "pathological/true interesting/true reason/SomethingHappened Readiness probe failed (40 times)",
},
{
name: "allowed pathological event with known bug",
name: "allowed pathological event with known bug (BZ 2000234)",
args: args{
ctx: context.TODO(),
m: monitor.NewRecorder(),
kubeEvent: &corev1.Event{
Count: 40,
Reason: "TopologyAwareHintsDisabled",
Reason: "ns/openshift-etcd pod/etcd-quorum-guard-42 node/worker-42 - reason/Unhealthy",
InvolvedObject: corev1.ObjectReference{
Kind: "Pod",
Namespace: "any",
Name: "any",
Namespace: "openshift-etcd",
Name: "etcd-quorum-guard-42",
},
Message: "irrelevant",
Message: "Readiness probe failed:",
LastTimestamp: metav1.Now(),
},
significantlyBeforeNow: now.UTC().Add(-15 * time.Minute),
},
expectedLocator: "ns/any pod/any hmsg/e13faa98ab",
expectedMessage: "pathological/true interesting/true reason/TopologyAwareHintsDisabled irrelevant (40 times)",
// hmsg in expectedLocator is the hash of the entire expectedMessage except the number of times
expectedLocator: "ns/openshift-etcd pod/etcd-quorum-guard-42 hmsg/9100aa725d",
expectedMessage: "pathological/true interesting/true reason/ns/openshift-etcd pod/etcd-quorum-guard-42 node/worker-42 - reason/Unhealthy Readiness probe failed: (40 times)",
},
}
for _, tt := range tests {
