Skip to content

Commit

Permalink
e2e: check if shared CPUs survive Kubelet restart
Browse files Browse the repository at this point in the history
Signed-off-by: Talor Itzhak <titzhak@redhat.com>
  • Loading branch information
Tal-or committed Jan 2, 2024
1 parent 9bbd08d commit 12ffc0d
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 27 deletions.
48 changes: 48 additions & 0 deletions test/e2e/performanceprofile/functests/11_mixedcpus/mixedcpus.go
Expand Up @@ -40,6 +40,7 @@ const (
crioRuntimeConfigFile = "/etc/crio/crio.conf.d/99-runtimes.conf"
kubeletMixedCPUsConfigFile = "/etc/kubernetes/openshift-workload-mixed-cpus"
sharedCpusResource = "workload.openshift.io/enable-shared-cpus"
kubeletRestartCommandMCD = "chroot /rootfs systemctl restart kubelet"
)

var _ = Describe("Mixedcpus", Ordered, func() {
Expand Down Expand Up @@ -197,6 +198,43 @@ var _ = Describe("Mixedcpus", Ordered, func() {
Expect(shared.Equals(ppShared)).To(BeTrue(), "OPENSHIFT_SHARED_CPUS value not equal to what configure in the performance profile."+
"OPENSHIFT_SHARED_CPUS=%s spec.cpu.shared=%s", shared.String(), ppShared.String())
})
// Verifies that shared CPUs assigned to a pod survive a Kubelet restart:
// create a pod requesting the shared-cpus workload resource, confirm the
// shared cpuset appears in the pod's cgroup, restart Kubelet on the pod's
// node, then confirm the cgroup cpuset still contains the shared cpus.
It("should contains the shared cpus after Kubelet restarts", func() {
// Request one exclusive CPU, some memory, and one unit of the
// shared-cpus resource (workload.openshift.io/enable-shared-cpus).
rl := &corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("100Mi"),
sharedCpusResource: resource.MustParse("1"),
}
// Equal requests and limits => Guaranteed QoS, required for exclusive CPUs.
p, err := createPod(ctx, testclient.Client, testutils.NamespaceTesting,
withRequests(rl),
withLimits(rl),
withRuntime(components.GetComponentName(profile.Name, components.ComponentNamePrefix)))
Expect(err).ToNot(HaveOccurred())
// Read the container's cgroup configuration from the node.
cfg, err := getter.GetConfig(ctx, testclient.K8sClient, p, "")
Expect(err).ToNot(HaveOccurred())
cgroupCpuSet, err := cpuset.Parse(cfg.CPUSet)
Expect(err).ToNot(HaveOccurred())
// Sanity check before the restart: the profile's shared cpus must
// already be present in the pod's cgroup cpuset.
shared, _ := cpuset.Parse(string(*profile.Spec.CPU.Shared))
Expect(cgroupCpuSet.Intersection(shared).List()).ToNot(BeEmpty(), "shared cpus are not in the pod cgroups; pod=%q, cgroupscpuset=%q sharedcpuset=%q",
fmt.Sprintf("%s/%s", p.Namespace, p.Name), cgroupCpuSet.String(), shared.String())

// Resolve the node the pod landed on so we can restart its Kubelet.
node := &corev1.Node{}
err = testclient.Client.Get(ctx, client.ObjectKey{Name: p.Spec.NodeName}, node)
Expect(err).ToNot(HaveOccurred())

cmd := kubeletRestartCmd()
// The command would fail since it aborts all the pods during restart
_, _ = nodes.ExecCommandOnNode(cmd, node)
// check that the node is ready after we restart Kubelet
nodes.WaitForReadyOrFail("post restart", node.Name, 20*time.Minute, 3*time.Second)

By("verifying that shared cpus are in the container's cgroup after kubelet restart")
// Re-read the cgroup configuration post-restart; Kubelet must have
// restored the shared cpus into the container's cpuset.
cfg, err = getter.GetConfig(ctx, testclient.K8sClient, p, "")
Expect(err).ToNot(HaveOccurred())
cgroupCpuSet, err = cpuset.Parse(cfg.CPUSet)
Expect(err).ToNot(HaveOccurred())
Expect(cgroupCpuSet.Intersection(shared).List()).ToNot(BeEmpty(), "shared cpus are not in the pod cgroups; pod=%q, cgroupscpuset=%q sharedcpuset=%q",
fmt.Sprintf("%s/%s", p.Namespace, p.Name), cgroupCpuSet.String(), shared.String())
})
})
})

Expand Down Expand Up @@ -260,6 +298,16 @@ func isFileExistCmd(absoluteFileName string) []string {
}
}

// kubeletRestartCmd returns the argv used to restart the Kubelet on the
// host from inside a privileged pod: chroot into the host's root
// filesystem and invoke systemctl through a shell.
func kubeletRestartCmd() []string {
	const hostRootFs = "/rootfs"
	argv := []string{"chroot", hostRootFs}
	return append(argv, "/bin/bash", "-c", "systemctl restart kubelet")
}

func createPod(ctx context.Context, c client.Client, ns string, opts ...func(pod *corev1.Pod)) (*corev1.Pod, error) {
p := pods.GetTestPod()
p.Namespace = ns
Expand Down
29 changes: 2 additions & 27 deletions test/e2e/performanceprofile/functests/9_reboot/devices.go
Expand Up @@ -127,7 +127,7 @@ var _ = Describe("[disruptive][node][kubelet][devicemanager] Device management t
// because the apiserver is going down as well.
// we intentionally use a generous timeout.
// On Bare metal reboot can take a while.
waitForNodeReadyOrFail("post reboot", targetNode, 20*time.Minute, 3*time.Second)
testnodes.WaitForReadyOrFail("post reboot", targetNode, 20*time.Minute, 3*time.Second)

// are we really sure? we can't predict if we will have state flapping,
// we can't predict if pods go back to containercreating and ideally we
Expand Down Expand Up @@ -202,7 +202,7 @@ var _ = Describe("[disruptive][node][kubelet][devicemanager] Device management t
// phase3: the kubelet restart
runCommandOnNodeThroughMCD(node, "kubelet restart", kubeletRestartCommandMCD)

waitForNodeReadyOrFail("post restart", targetNode, 20*time.Minute, 3*time.Second)
testnodes.WaitForReadyOrFail("post restart", targetNode, 20*time.Minute, 3*time.Second)

// are we really sure? we can't predict if we will have state flapping,
// we can't predict if pods go back to containercreating and ideally we
Expand Down Expand Up @@ -359,22 +359,6 @@ func waitForNodeToReportResourcesOrFail(tag, nodeName, resourceName string, time
return allocatableDevs
}

// waitForNodeReadyOrFail polls the node named nodeName until its NodeReady
// condition reports True, failing the current test if timeout elapses.
// tag is a free-form label (e.g. "post reboot", "post restart") used to
// contextualize log lines and the failure message.
func waitForNodeReadyOrFail(tag, nodeName string, timeout, polling time.Duration) {
	testlog.Infof("%s: waiting for node %q: to be ready", tag, nodeName)
	EventuallyWithOffset(1, func() (bool, error) {
		node, err := testnodes.GetByName(nodeName)
		if err != nil {
			// intentionally tolerate error: the apiserver may be briefly
			// unreachable while the node comes back; keep polling.
			testlog.Infof("wait for node %q ready: %v", nodeName, err)
			return false, nil
		}
		ready := isNodeReady(*node)
		testlog.Infof("node %q ready=%v", nodeName, ready)
		return ready, nil
		// Use the caller-supplied tag in the failure message instead of the
		// previously hardcoded "post reboot", which was wrong for the
		// "post restart" caller.
	}).WithTimeout(timeout).WithPolling(polling).Should(BeTrue(), "%s: cannot get readiness status for node %q", tag, nodeName)
	testlog.Infof("%s: node %q: reported ready", tag, nodeName)
}

func runCommandOnNodeThroughMCD(node *corev1.Node, description, command string) (string, error) {
testlog.Infof("node %q: before %s", node.Name, description)
out, err := testnodes.ExecCommandOnMachineConfigDaemon(node, []string{"sh", "-c", command})
Expand All @@ -391,15 +375,6 @@ func countDeviceAllocatable(node *corev1.Node, resourceName string) int64 {
return val.Value()
}

// isNodeReady reports whether the node's NodeReady condition is True.
// If the condition is absent from the node's status, the node is
// considered not ready.
func isNodeReady(node corev1.Node) bool {
	for _, cond := range node.Status.Conditions {
		if cond.Type != corev1.NodeReady {
			continue
		}
		return cond.Status == corev1.ConditionTrue
	}
	return false
}

func isUnexpectedAdmissionError(pod corev1.Pod, resourceName string) bool {
if pod.Status.Phase != corev1.PodFailed {
return false
Expand Down
25 changes: 25 additions & 0 deletions test/e2e/performanceprofile/functests/utils/nodes/nodes.go
Expand Up @@ -553,3 +553,28 @@ func GetCgroupFs(node *corev1.Node) (string, error) {
cgroupFs := strings.TrimSpace(string(version))
return cgroupFs, nil
}

// WaitForReadyOrFail polls the node named nodeName until its NodeReady
// condition reports True, failing the current test if timeout elapses.
// tag is a free-form label (e.g. "post reboot", "post restart") used to
// contextualize log lines and the failure message.
func WaitForReadyOrFail(tag, nodeName string, timeout, polling time.Duration) {
	testlog.Infof("%s: waiting for node %q: to be ready", tag, nodeName)
	EventuallyWithOffset(1, func() (bool, error) {
		node, err := GetByName(nodeName)
		if err != nil {
			// intentionally tolerate error: the apiserver may be briefly
			// unreachable while the node comes back; keep polling.
			testlog.Infof("wait for node %q ready: %v", nodeName, err)
			return false, nil
		}
		ready := isNodeReady(*node)
		testlog.Infof("node %q ready=%v", nodeName, ready)
		return ready, nil
		// Use the caller-supplied tag in the failure message instead of the
		// previously hardcoded "post reboot", which was wrong for the
		// "post restart" caller.
	}).WithTimeout(timeout).WithPolling(polling).Should(BeTrue(), "%s: cannot get readiness status for node %q", tag, nodeName)
	testlog.Infof("%s: node %q: reported ready", tag, nodeName)
}

// isNodeReady reports whether the NodeReady condition on the given node
// has status True. A node missing the condition entirely is treated as
// not ready.
func isNodeReady(node corev1.Node) bool {
	conditions := node.Status.Conditions
	for i := range conditions {
		if conditions[i].Type == corev1.NodeReady {
			return conditions[i].Status == corev1.ConditionTrue
		}
	}
	return false
}

0 comments on commit 12ffc0d

Please sign in to comment.