Merge pull request kubernetes#106544 from ehashman/fix-flake-restart
Deflake "Kubelet should correctly account for terminated pods after restart"
k8s-ci-robot committed Nov 20, 2021
2 parents 9a1d901 + 6ddf86d commit 21d3acc
Showing 1 changed file with 11 additions and 5 deletions.

test/e2e_node/restart_test.go
@@ -73,7 +73,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 	podCount            = 100
 	podCreationInterval = 100 * time.Millisecond
 	recoverTimeout      = 5 * time.Minute
-	startTimeout        = 5 * time.Minute
+	startTimeout        = 3 * time.Minute
 	// restartCount is chosen so even with minPods we exhaust the default
 	// allocation of a /24.
 	minPods = 50
@@ -150,6 +150,10 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 			if numCpus < 1 {
 				e2eskipper.Skipf("insufficient CPU available for kubelet restart test")
 			}
+			if numCpus > 18 {
+				// 950m * 19 = 18.05 CPUs -> not enough to block the scheduling of another 950m pod
+				e2eskipper.Skipf("test will return false positives on a machine with >18 cores")
+			}
 
 			// create as many restartNever pods as there are allocatable CPU
 			// nodes; if they are not correctly accounted for as terminated
@@ -161,7 +165,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 				pod.Spec.RestartPolicy = "Never"
 				pod.Spec.Containers[0].Command = []string{"echo", "hi"}
 				pod.Spec.Containers[0].Resources.Limits = v1.ResourceList{
-					v1.ResourceCPU: resource.MustParse("1"),
+					v1.ResourceCPU: resource.MustParse("950m"), // leave a little room for other workloads
 				}
 			}
 			createBatchPodWithRateControl(f, restartNeverPods, podCreationInterval)
@@ -199,9 +203,11 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 			// restart may think these old pods are consuming CPU and we
 			// will get an OutOfCpu error.
 			ginkgo.By("verifying restartNever pods succeed and restartAlways pods stay running")
-			postRestartRunningPods := waitForPods(f, numAllPods, recoverTimeout)
-			if len(postRestartRunningPods) < numAllPods {
-				framework.Failf("less pods are running after node restart, got %d but expected %d", len(postRestartRunningPods), numAllPods)
+			for start := time.Now(); time.Since(start) < startTimeout; time.Sleep(10 * time.Second) {
+				postRestartRunningPods := waitForPods(f, numAllPods, recoverTimeout)
+				if len(postRestartRunningPods) < numAllPods {
+					framework.Failf("less pods are running after node restart, got %d but expected %d", len(postRestartRunningPods), numAllPods)
+				}
 			}
 		})
 	})
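
Why the 950m request and the >18-core skip in the hunks above fit together: the test creates one restartNever pod per allocatable CPU, each requesting 950m, and relies on the remaining headroom (50m per core) being too small to schedule one more 950m pod. A minimal standalone sketch of that arithmetic (not part of the commit; the 1000m-allocatable-per-core figure is an assumption for illustration):

package main

import "fmt"

func main() {
	const podMilli = 950 // CPU request of each restartNever pod, per the diff
	for _, cores := range []int{2, 18, 19} {
		allocatable := cores * 1000    // assume 1000m allocatable per core
		used := cores * podMilli       // one 950m pod per allocatable CPU
		headroom := allocatable - used // 50m of slack per core
		blocked := headroom < podMilli // can another 950m pod still fit?
		fmt.Printf("%2d cores: headroom=%4dm, extra 950m pod blocked=%v\n",
			cores, headroom, blocked)
	}
	// At 19 cores the headroom reaches 950m, so the extra pod schedules even
	// when terminated pods are miscounted -- hence the skip for numCpus > 18.
}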
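The replacement check in the last hunk is a soak loop rather than a one-shot assertion: Go runs a for-loop's post statement after every iteration, so time.Sleep re-arms the check roughly every 10 seconds until startTimeout elapses. A self-contained sketch of the idiom (helper names here are hypothetical stand-ins, not the e2e framework's API, and the window is shortened for demonstration):

package main

import (
	"fmt"
	"time"
)

// podsStillRunning stands in for the test's waitForPods count comparison.
func podsStillRunning() bool { return true }

func main() {
	const window = 30 * time.Second // the test uses startTimeout (3 minutes)
	// The sleep in the post statement runs after each pass, so the condition
	// is re-evaluated every 10 seconds for the whole window.
	for start := time.Now(); time.Since(start) < window; time.Sleep(10 * time.Second) {
		if !podsStillRunning() {
			fmt.Println("pod count regressed after restart")
			return
		}
	}
	fmt.Println("pod count held steady for the full window")
}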
