Merge pull request #28386 from xueqzhan/revert-28381-revert-28233-1699276048783

Revert "TRT-1339: Revert #28233 "ignore repeated TopologyAwareHintsDisabled events""
openshift-merge-bot[bot] committed Nov 10, 2023
2 parents 9aa824d + 89829e1 commit fa60863
Showing 4 changed files with 251 additions and 12 deletions.
@@ -223,10 +223,6 @@ var KnownEventsBugs = []KnownProblem{
Regexp: regexp.MustCompile("ns/openshift-etcd-operator namespace/openshift-etcd-operator -.*rpc error: code = Canceled desc = grpc: the client connection is closing.*"),
BZ: "https://bugzilla.redhat.com/show_bug.cgi?id=2006975",
},
{
Regexp: regexp.MustCompile("reason/TopologyAwareHintsDisabled"),
BZ: "https://issues.redhat.com/browse/OCPBUGS-13366",
},
{
Regexp: regexp.MustCompile("ns/.*reason/.*APICheckFailed.*503.*"),
BZ: "https://bugzilla.redhat.com/show_bug.cgi?id=2017435",
@@ -6,6 +6,7 @@ import (
"regexp"
"strconv"
"strings"
"time"

"github.com/sirupsen/logrus"
e2e "k8s.io/kubernetes/test/e2e/framework"
@@ -235,6 +236,13 @@ func (d duplicateEventsEvaluator) testDuplicatedEvents(testName string, flakeOnl
})
logrus.Infof("found %d NodeUpdate intervals", len(nodeUpdateIntervals))

type timeRange struct {
from time.Time
to time.Time
}
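// Allowed ranges for repeated TopologyAwareHintsDisabled events. They are built lazily, on the
// first such event seen below, from the NoExecuteTaintManager e2e test intervals and then
// widened using the openshift-dns pod lifecycle intervals.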
buildTopologyHintAllowedTimeRanges := true
topologyHintAllowedTimeRanges := []*timeRange{}

displayToCount := map[string]*pathologicalEvents{}
for _, event := range events {
// TODO: port to use structured message reason once kube event intervals are ported over
@@ -252,9 +260,73 @@ func (d duplicateEventsEvaluator) testDuplicatedEvents(testName string, flakeOnl
continue
}
}
if strings.Contains(event.Message, "reason/TopologyAwareHintsDisabled") {
// Build the allowed time range only once
if buildTopologyHintAllowedTimeRanges {
taintManagerTestIntervals := events.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Source == monitorapi.SourceE2ETest &&
strings.Contains(eventInterval.StructuredLocator.Keys[monitorapi.LocatorE2ETestKey], "NoExecuteTaintManager")
})
// Start the allowed time ranges from the time ranges of the tests. But events lag behind the tests, since the tests do not wait
// until all dns pods are properly scheduled and reach the ready state, so we will need to expand the allowed time ranges afterwards.
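// Hypothetical example: a NoExecuteTaintManager test runs 10:00-10:10, a dns-default pod is
// gracefully deleted at 10:08, and its replacement dns container only reports Ready at 10:14;
// the allowed range then grows from [10:00, 10:10] to [10:00, 10:14].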
for _, test := range taintManagerTestIntervals {
topologyHintAllowedTimeRanges = append(topologyHintAllowedTimeRanges, &timeRange{from: test.From, to: test.To})
logrus.WithField("from", test.From).WithField("to", test.To).Infof("found time range for test: %s", testName)
}
dnsUpdateIntervals := events.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Source == monitorapi.SourcePodState &&
(eventInterval.StructuredLocator.Type == monitorapi.LocatorTypePod || eventInterval.StructuredLocator.Type == monitorapi.LocatorTypeContainer) &&
eventInterval.StructuredLocator.Keys[monitorapi.LocatorNamespaceKey] == "openshift-dns" &&
eventInterval.StructuredMessage.Annotations[monitorapi.AnnotationConstructed] == monitorapi.ConstructionOwnerPodLifecycle
})

// Now expand the allowed time range until the replacement dns pod gets ready
for _, r := range topologyHintAllowedTimeRanges {
var lastReadyTime time.Time
count := 0
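// count tracks dns-default pods that started a graceful delete inside this range but whose
// replacement dns container has not yet reported Ready.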
for _, interval := range dnsUpdateIntervals {
if interval.From.Before(r.from) {
continue
}
// If there is a GracefulDelete of dns-default pod, we will have to wait until the replacement dns container becomes ready
if interval.StructuredMessage.Reason == monitorapi.PodReasonGracefulDeleteStarted &&
strings.Contains(interval.StructuredLocator.Keys[monitorapi.LocatorPodKey], "dns-default") {
count++
}
if interval.StructuredMessage.Reason == monitorapi.ContainerReasonReady &&
interval.StructuredLocator.Keys[monitorapi.LocatorContainerKey] == "dns" && count > 0 {
lastReadyTime = interval.From
count--
}
if interval.From.After(r.to) && count == 0 {
if lastReadyTime.After(r.to) {
r.to = lastReadyTime
}
break
}
}
}
// Log final adjusted time ranges
for _, r := range topologyHintAllowedTimeRanges {
logrus.WithField("from", r.from).WithField("to", r.to).Infof("adjusted time range for test: %s", testName)
}
buildTopologyHintAllowedTimeRanges = false
}
// Filter out TopologyAwareHintsDisabled events within allowed time range
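// An event is ignored only when it falls strictly inside one of the ranges
// (range start before event.From and range end after event.To).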
var allowed bool
for _, r := range topologyHintAllowedTimeRanges {
if r.from.Before(event.From) && r.to.After(event.To) {
logrus.Infof("%s was found to fall into the allowed time range %+v, ignoring pathological event as we expect these during NoExecuteTaintManager test", event, r)
allowed = true
break
}
}
if allowed {
continue
}
}
eventDisplayMessage, times := GetTimesAnEventHappened(fmt.Sprintf("%s - %s", event.Locator, event.Message))
if times > DuplicateEventThreshold {

// If we marked this message earlier in recordAddOrUpdateEvent as interesting/true, we know it matched one of
// the existing patterns or one of the AllowedRepeatedEventFns functions returned true.
if strings.Contains(eventDisplayMessage, InterestingMark) {
@@ -558,3 +558,173 @@ func TestMakeProbeTestEventsGroup(t *testing.T) {
})
}
}

func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
evaluator := duplicateEventsEvaluator{
allowedRepeatedEventPatterns: AllowedRepeatedEventPatterns,
knownRepeatedEventsBugs: []KnownProblem{},
}
from := time.Unix(872827200, 0).In(time.UTC)
to := time.Unix(872827200, 0).In(time.UTC)

tests := []struct {
name string
namespace string
platform v1.PlatformType
topology v1.TopologyMode
intervals []monitorapi.Interval
expectedMessage string
}{
{
// This is ignored because the node is tainted by the test
name: "ignore TopologyAwareHintsDisabled before dns container ready",
intervals: []monitorapi.Interval{
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypeE2ETest,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorE2ETestKey: "[sig-node] NoExecuteTaintManager Single Pod [Serial] doesn't evict pod with tolerations from tainted nodes [Skipped:SingleReplicaTopology] [Suite:openshift/conformance/serial] [Suite:k8s]",
},
},
StructuredMessage: monitorapi.Message{},
},
Source: monitorapi.SourceE2ETest,
From: from.Add(-10 * time.Minute),
To: to.Add(10 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypePod,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorNamespaceKey: "openshift-dns",
monitorapi.LocatorPodKey: "dns-default-jq2qn",
},
},
StructuredMessage: monitorapi.Message{
Reason: monitorapi.PodReasonGracefulDeleteStarted,
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationConstructed: "pod-lifecycle-constructor",
monitorapi.AnnotationReason: "GracefulDelete",
},
},
},
Source: monitorapi.SourcePodState,
From: from.Add(-5 * time.Minute),
To: to.Add(1 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Message: "pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 (23 times)",
},
From: from.Add(11 * time.Minute),
To: to.Add(12 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypeContainer,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorNamespaceKey: "openshift-dns",
monitorapi.LocatorContainerKey: "dns",
monitorapi.LocatorPodKey: "dns-default-jq2qn",
},
},
StructuredMessage: monitorapi.Message{
Reason: monitorapi.ContainerReasonReady,
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationConstructed: "pod-lifecycle-constructor",
monitorapi.AnnotationReason: "Ready",
},
},
},
Source: monitorapi.SourcePodState,
From: from.Add(15 * time.Minute),
To: to.Add(16 * time.Minute),
},
},
namespace: "openshift-dns",
expectedMessage: "",
},
{
// This is not ignored because no dns container becomes ready afterwards
name: "fire TopologyAwareHintsDisabled when there is no dns container ready",
intervals: []monitorapi.Interval{
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypeE2ETest,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorE2ETestKey: "[sig-node] NoExecuteTaintManager Single Pod [Serial] doesn't evict pod with tolerations from tainted nodes [Skipped:SingleReplicaTopology] [Suite:openshift/conformance/serial] [Suite:k8s]",
},
},
StructuredMessage: monitorapi.Message{},
},
Source: monitorapi.SourceE2ETest,
From: from.Add(-10 * time.Minute),
To: to.Add(10 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
StructuredLocator: monitorapi.Locator{
Type: monitorapi.LocatorTypePod,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorNamespaceKey: "openshift-dns",
monitorapi.LocatorPodKey: "dns-default-jq2qn",
},
},
StructuredMessage: monitorapi.Message{
Reason: monitorapi.PodReasonGracefulDeleteStarted,
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationConstructed: "pod-lifecycle-constructor",
monitorapi.AnnotationReason: "GracefulDelete",
},
},
},
Source: monitorapi.SourcePodState,
From: from.Add(-5 * time.Minute),
To: to.Add(1 * time.Minute),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: "ns/openshift-dns service/dns-default hmsg/ade328ddf3",
Message: "pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 (23 times)",
},
From: from.Add(11 * time.Minute),
To: to.Add(12 * time.Minute),
},
},
namespace: "openshift-dns",
expectedMessage: "1 events happened too frequently\n\nevent happened 23 times, something is wrong: ns/openshift-dns service/dns-default hmsg/ade328ddf3 - pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 From: 04:11:00Z To: 04:12:00Z result=reject ",
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {

events := monitorapi.Intervals(test.intervals)

testName := "events should not repeat"
junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false)
jUnitName := getJUnitName(testName, test.namespace)
for _, junit := range junits {
if (junit.Name == jUnitName) && (test.expectedMessage != "") {
assert.Equal(t, test.expectedMessage, junit.FailureOutput.Output)
} else {
if !assert.Nil(t, junit.FailureOutput, "expected success but got failure output") {
t.Logf(junit.FailureOutput.Output)
}
}
}

})
}
}
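
A minimal way to exercise just this new test locally (the package path below is assumed from the library layout and may need adjusting):

    go test ./pkg/monitortestlibrary/pathologicaleventlibrary/... -run TestPathologicalEventsTopologyAwareHintsDisabled -v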
15 changes: 8 additions & 7 deletions pkg/monitortests/testframework/watchevents/event_test.go
@@ -98,25 +98,26 @@ func Test_recordAddOrUpdateEvent(t *testing.T) {
expectedMessage: "pathological/true interesting/true reason/SomethingHappened Readiness probe failed (40 times)",
},
{
name: "allowed pathological event with known bug",
name: "allowed pathological event with known bug (BZ 2000234)",
args: args{
ctx: context.TODO(),
m: monitor.NewRecorder(),
kubeEvent: &corev1.Event{
Count: 40,
Reason: "TopologyAwareHintsDisabled",
Reason: "ns/openshift-etcd pod/etcd-quorum-guard-42 node/worker-42 - reason/Unhealthy",
InvolvedObject: corev1.ObjectReference{
Kind: "Pod",
Namespace: "any",
Name: "any",
Namespace: "openshift-etcd",
Name: "etcd-quorum-guard-42",
},
Message: "irrelevant",
Message: "Readiness probe failed:",
LastTimestamp: metav1.Now(),
},
significantlyBeforeNow: now.UTC().Add(-15 * time.Minute),
},
expectedLocator: "ns/any pod/any hmsg/e13faa98ab",
expectedMessage: "pathological/true interesting/true reason/TopologyAwareHintsDisabled irrelevant (40 times)",
// hmsg in expectedLocator is the hash of the entire expectedMessage except the number of times
expectedLocator: "ns/openshift-etcd pod/etcd-quorum-guard-42 hmsg/9100aa725d",
expectedMessage: "pathological/true interesting/true reason/ns/openshift-etcd pod/etcd-quorum-guard-42 node/worker-42 - reason/Unhealthy Readiness probe failed: (40 times)",
},
}
for _, tt := range tests {
