Flake failed sandboxes from bug in new guard pods
more info in https://bugzilla.redhat.com/show_bug.cgi?id=2038481

Essentially, two new guard pods are started in 4.10+ and are incorrectly
restarted on a cordoned node. When that node is rebooted, those pods fail
to set up their sandboxes right away because the underlying network config
file is not yet present. This change can be reverted once the PR that
fixes the root cause is merged:
  openshift/library-go#1287

Signed-off-by: Jamo Luhrsen <jluhrsen@gmail.com>
jluhrsen authored and deads2k committed Jan 21, 2022
1 parent c70b8a9 commit 333d913
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions pkg/synthetictests/networking.go
@@ -70,12 +70,16 @@ func testPodSandboxCreation(events monitorapi.Intervals) []*junitapi.JUnitTestCase
        }
    } else {
        timeBetweenDeleteAndFailure := event.From.Sub(*deletionTime)
        nodeRebootTime := getNodeRebootTime(events, monitorapi.LocatorParts(event.Locator)["node"])
        switch {
        case timeBetweenDeleteAndFailure < 1*time.Second:
            // nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know
        case timeBetweenDeleteAndFailure < 5*time.Second:
            // within five seconds, it ought to be long enough to know, but it's close enough to flake and not fail
            flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator, timeBetweenDeleteAndFailure.Seconds(), event.Message))
        case nodeRebootTime != nil && nodeRebootTime.After(*deletionTime) && strings.Contains(event.Locator, "guard"):
            flakes = append(flakes, fmt.Sprintf("the deletion time %v came before a reboot at %v and error is coming "+
                "from a guard pod. see https://bugzilla.redhat.com/show_bug.cgi?id=2038481", deletionTime, nodeRebootTime))
        case deletionTime.Before(event.From):
            // something went wrong. More than five seconds after the pod was deleted, the CNI is trying to set up pod sandboxes and can't
            failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator, timeBetweenDeleteAndFailure.Seconds(), event.Message))
@@ -184,3 +188,14 @@ func getPodDeletionTime(events monitorapi.Intervals, podLocator string) *time.Time
    }
    return nil
}

func getNodeRebootTime(events monitorapi.Intervals, node string) *time.Time {
    for _, event := range events {
        eventNodeName, _ := monitorapi.NodeFromLocator(event.Locator)
        if eventNodeName == node && strings.Contains(event.Message, "reason/Rebooted") {
            return &event.From
        }
    }
    return nil
}
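
To make the new classification easier to follow, here is a small self-contained Go sketch of the same decision. The simpleEvent type and classifySandboxFailure function are illustrative stand-ins, not the monitorapi types used by the real test; only the timing and reboot logic mirrors the diff above.

package main

import (
    "fmt"
    "strings"
    "time"
)

// simpleEvent is a stand-in for a monitor interval: where the failure
// happened, what it said, and when it started.
type simpleEvent struct {
    Locator string
    Message string
    From    time.Time
}

// classifySandboxFailure mirrors the decision in the diff above: sandbox
// failures close to the pod deletion are tolerated or flaked, and a failure
// from a guard pod whose node rebooted after the deletion is only a flake
// (https://bugzilla.redhat.com/show_bug.cgi?id=2038481).
func classifySandboxFailure(ev simpleEvent, deletionTime time.Time, nodeRebootTime *time.Time) string {
    delta := ev.From.Sub(deletionTime)
    switch {
    case delta < 1*time.Second:
        return "ok"
    case delta < 5*time.Second:
        return "flake"
    case nodeRebootTime != nil && nodeRebootTime.After(deletionTime) && strings.Contains(ev.Locator, "guard"):
        // guard pod restarted across a reboot: known bug, don't fail the job
        return "flake"
    default:
        return "failure"
    }
}

func main() {
    deleted := time.Now()
    reboot := deleted.Add(30 * time.Second)
    ev := simpleEvent{
        Locator: "ns/openshift-etcd pod/etcd-guard-node-a node/node-a",
        Message: "failed to create pod sandbox",
        From:    deleted.Add(45 * time.Second),
    }
    // Without the reboot information this event is a hard failure;
    // with it, the event is downgraded to a flake.
    fmt.Println(classifySandboxFailure(ev, deleted, nil))     // failure
    fmt.Println(classifySandboxFailure(ev, deleted, &reboot)) // flake
}

In the second call the sandbox failure happens well past the five-second window, but because the node rebooted after the pod deletion and the pod is a guard pod, it is recorded as a flake rather than a failure, which is exactly what the new case in testPodSandboxCreation does.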
