Flake failed sandboxes from bug in new guard pods
more info in https://bugzilla.redhat.com/show_bug.cgi?id=2038481

Essentially, two new guard pods are started in 4.10+ and are incorrectly
restarted on a cordoned node. When that node is rebooted, those pods fail
to set up their sandboxes right away because the underlying network config
file is not yet present. This change can be reverted once the PR that
fixes the root cause is merged:
  openshift/library-go#1287

Signed-off-by: Jamo Luhrsen <jluhrsen@gmail.com>
jluhrsen authored and deads2k committed Jan 21, 2022
1 parent c70b8a9 commit 333d913
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions pkg/synthetictests/networking.go
@@ -70,12 +70,16 @@ func testPodSandboxCreation(events monitorapi.Intervals) []*junitapi.JUnitTestCase
        }
    } else {
        timeBetweenDeleteAndFailure := event.From.Sub(*deletionTime)
        nodeRebootTime := getNodeRebootTime(events, monitorapi.LocatorParts(event.Locator)["node"])
        switch {
        case timeBetweenDeleteAndFailure < 1*time.Second:
            // nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know
        case timeBetweenDeleteAndFailure < 5*time.Second:
            // within five seconds, it ought to be long enough to know, but it's close enough to flake and not fail
            flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator, timeBetweenDeleteAndFailure.Seconds(), event.Message))
        case nodeRebootTime != nil && nodeRebootTime.After(*deletionTime) && strings.Contains(event.Locator, "guard"):
            flakes = append(flakes, fmt.Sprintf("the deletion time %v came before a reboot at %v and error is coming "+
                "from a guard pod. see https://bugzilla.redhat.com/show_bug.cgi?id=2038481", deletionTime, nodeRebootTime))
        case deletionTime.Before(event.From):
            // something went wrong. More than five seconds after the pod was deleted, the CNI is trying to set up pod sandboxes and can't
            failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator, timeBetweenDeleteAndFailure.Seconds(), event.Message))
@@ -184,3 +188,14 @@ func getPodDeletionTime(events monitorapi.Intervals, podLocator string) *time.Time
    }
    return nil
}

func getNodeRebootTime(events monitorapi.Intervals, node string) *time.Time {
    for _, event := range events {
        eventNodeName, _ := monitorapi.NodeFromLocator(event.Locator)
        if eventNodeName == node && strings.Contains(event.Message, "reason/Rebooted") {
            return &event.From
        }
    }
    return nil
}
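
To make the new classification easier to follow, here is a small self-contained Go sketch of the same decision. The simpleEvent type and classifySandboxFailure function are illustrative stand-ins, not the monitorapi types used by the real test; only the timing and reboot logic mirrors the diff above.

package main

import (
    "fmt"
    "strings"
    "time"
)

// simpleEvent is a stand-in for a monitor interval: where the failure
// happened, what it said, and when it started.
type simpleEvent struct {
    Locator string
    Message string
    From    time.Time
}

// classifySandboxFailure mirrors the decision in the diff above: sandbox
// failures close to the pod deletion are tolerated or flaked, and a failure
// from a guard pod whose node rebooted after the deletion is only a flake
// (https://bugzilla.redhat.com/show_bug.cgi?id=2038481).
func classifySandboxFailure(ev simpleEvent, deletionTime time.Time, nodeRebootTime *time.Time) string {
    delta := ev.From.Sub(deletionTime)
    switch {
    case delta < 1*time.Second:
        return "ok"
    case delta < 5*time.Second:
        return "flake"
    case nodeRebootTime != nil && nodeRebootTime.After(deletionTime) && strings.Contains(ev.Locator, "guard"):
        // guard pod restarted across a reboot: known bug, don't fail the job
        return "flake"
    default:
        return "failure"
    }
}

func main() {
    deleted := time.Now()
    reboot := deleted.Add(30 * time.Second)
    ev := simpleEvent{
        Locator: "ns/openshift-etcd pod/etcd-guard-node-a node/node-a",
        Message: "failed to create pod sandbox",
        From:    deleted.Add(45 * time.Second),
    }
    // Without the reboot information this event is a hard failure;
    // with it, the event is downgraded to a flake.
    fmt.Println(classifySandboxFailure(ev, deleted, nil))     // failure
    fmt.Println(classifySandboxFailure(ev, deleted, &reboot)) // flake
}

In the second call the sandbox failure happens well past the five-second window, but because the node rebooted after the pod deletion and the pod is a guard pod, it is recorded as a flake rather than a failure, which is exactly what the new case in testPodSandboxCreation does.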
