Bug 2105123: tuned: disable irqbalance (#396)

* test: perfprof: utils: make sure to unquote cpus

Make sure to unquote the cpumask output to prevent false negatives

Signed-off-by: Francesco Romani <fromani@redhat.com>

* perfprof: utils: robustness fixes

Add test cases and log enhancements that emerged during local testing.

Signed-off-by: Francesco Romani <fromani@redhat.com>

* perfprof: tuned: disable the irqbalance plugin

The tuned irqbalance plugin clears the irqbalance banned CPUs list
when tuned starts. The list is then managed dynamically by the runtime
handlers.

On node restart, the tuned pod can be started AFTER the workload pods
(neither kubelet nor kubernetes offers ordering guarantees when recovering
the node state); clearing the banned CPUs list while pods are running
compromises the IRQ isolation guarantees. The same holds true if the
NTO pod restarts for whatever reason.

To prevent this disruption, we disable the irqbalance plugin entirely.
Another component in the stack must now clear the irqbalance cpu ban
list once per node reboot (see the sketch below for an illustration).

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2105123
Signed-off-by: Francesco Romani <fromani@redhat.com>
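
Aside (not part of this change; the helper name, path handling and guard mechanism
are hypothetical): assuming the component owns /etc/sysconfig/irqbalance, "clearing
the irqbalance cpu ban list once per node reboot" could look roughly like this Go
sketch:

package main

import (
	"fmt"
	"os"
	"regexp"
)

// clearIrqBalanceBannedCPUs rewrites IRQBALANCE_BANNED_CPUS in the irqbalance
// sysconfig file with an empty value. Illustrative sketch only: a real component
// would guard this with a stamp file (e.g. under /run) so that it runs at most
// once per node boot.
func clearIrqBalanceBannedCPUs(confPath string) error {
	data, err := os.ReadFile(confPath)
	if err != nil {
		return err
	}
	re := regexp.MustCompile(`(?m)^IRQBALANCE_BANNED_CPUS=.*$`)
	cleaned := re.ReplaceAll(data, []byte(`IRQBALANCE_BANNED_CPUS=""`))
	return os.WriteFile(confPath, cleaned, 0o644)
}

func main() {
	if err := clearIrqBalanceBannedCPUs("/etc/sysconfig/irqbalance"); err != nil {
		fmt.Fprintln(os.Stderr, "failed to clear the irqbalance CPU ban list:", err)
		os.Exit(1)
	}
}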

* e2e: perfprof: add tests for cpu ban list handling

Add a test to verify that a tuned restart will not clear
the irqbalance cpu ban list; preventing that clearing is the key
reason why we disabled the irqbalance tuned plugin earlier.

Note there is no guarantee that any component in the stack
will reset the irqbalance cpu ban list exactly once.

crio unconditionally restores the list from a snapshot taken the
first time the server runs, which is likely but not guaranteed
to be correct.

There's no way to declare or check the content of the value
crio would restore.

Signed-off-by: Francesco Romani <fromani@redhat.com>

* perfprof: functests: utils: fix cpu mask padding

Testing the irqbalance handling on tuned restart highlighted
a crio bug in handling cpu affinity masks with an odd number of hex digits.
We expect this bug to have little impact on production environments,
because it's unlikely they will have a number of CPUs that is a
multiple of 4 but not a multiple of 8; still, we filed
cri-o/cri-o#6145

In order to have a backportable change, we fix our utility code
to deal with incorrect padding.
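
For illustration only (this is not the actual utility fix and the helper name is
made up), coping with mask chunks that are not zero-padded to 8 hex digits can be
done by normalizing the mask before parsing it:

package main

import (
	"fmt"
	"strings"
)

// normalizeCPUMask left-pads every comma-separated chunk of a CPU affinity mask
// with zeroes up to 8 hex digits (32 bits per chunk), so that strings such as
// "e0e00000,18" and "e0e00000,00000018" describe the same CPU set.
// Illustrative sketch only.
func normalizeCPUMask(mask string) string {
	chunks := strings.Split(strings.TrimSpace(mask), ",")
	for i, chunk := range chunks {
		if pad := 8 - len(chunk); pad > 0 {
			chunks[i] = strings.Repeat("0", pad) + chunk
		}
	}
	return strings.Join(chunks, ",")
}

func main() {
	fmt.Println(normalizeCPUMask("e0e00000,18")) // prints "e0e00000,00000018"
}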

Signed-off-by: Francesco Romani <fromani@redhat.com>

Signed-off-by: Francesco Romani <fromani@redhat.com>
ffromani committed Aug 17, 2022
1 parent 0441499 commit c5cf0bd
Showing 8 changed files with 321 additions and 14 deletions.
6 changes: 4 additions & 2 deletions assets/performanceprofile/tuned/openshift-node-performance
@@ -38,8 +38,10 @@ transparent_hugepages=never

{{if not .GloballyDisableIrqLoadBalancing}}
[irqbalance]
#> Override the value set by cpu-partitioning with an empty one
banned_cpus=""
# Disable the plugin entirely, which was enabled by the parent profile `cpu-partitioning`.
# It can be racy if TuneD restarts for whatever reason.
#> cpu-partitioning
enabled=false
{{end}}

[scheduler]
@@ -109,8 +109,7 @@ func NewNodePerformance(profile *performancev2.PerformanceProfile) (*tunedv1.Tun
templateArgs[templateAdditionalArgs] = strings.Join(profile.Spec.AdditionalKernelArgs, cmdlineDelimiter)
}

if profile.Spec.GloballyDisableIrqLoadBalancing != nil &&
*profile.Spec.GloballyDisableIrqLoadBalancing == true {
if IsIRQBalancingGloballyDisabled(profile) {
templateArgs[templateGloballyDisableIrqLoadBalancing] = strconv.FormatBool(true)
}

@@ -218,3 +217,7 @@ func getProfileData(tunedTemplate string, data interface{}) (string, error) {
}
return profile.String(), nil
}

func IsIRQBalancingGloballyDisabled(profile *performancev2.PerformanceProfile) bool {
return profile.Spec.GloballyDisableIrqLoadBalancing != nil && *profile.Spec.GloballyDisableIrqLoadBalancing
}
@@ -140,7 +140,7 @@ func CPUMaskToCPUSet(cpuMask string) (cpuset.CPUSet, error) {
}
mask, err := strconv.ParseUint(chunk, 16, bitsInWord)
if err != nil {
return cpuset.NewCPUSet(), fmt.Errorf("failed to parse the CPU mask %q: %v", cpuMask, err)
return cpuset.NewCPUSet(), fmt.Errorf("failed to parse the CPU mask %q (chunk %q): %v", cpuMask, chunk, err)
}
for j := 0; j < bitsInWord; j++ {
if mask&1 == 1 {
@@ -16,6 +16,7 @@ var cpuListToMask = []listToMask{
{"0", "00000001"},
{"2-3", "0000000c"},
{"3,4,53-55,61-63", "e0e00000,00000018"},
{"1,3-7", "000000fa"},
{"0-127", "ffffffff,ffffffff,ffffffff,ffffffff"},
{"0-255", "ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff"},
}
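
As a quick sanity check of the new test vector (an illustrative aside, not part of
the diff), CPUs 1 and 3-7 indeed encode to the mask 000000fa:

package main

import "fmt"

func main() {
	// CPUs 1 and 3-7 set bits 1, 3, 4, 5, 6 and 7 of the 32-bit mask word:
	// 2 + 8 + 16 + 32 + 64 + 128 = 250 = 0xfa.
	var mask uint32
	for _, cpu := range []uint{1, 3, 4, 5, 6, 7} {
		mask |= 1 << cpu
	}
	fmt.Printf("%08x\n", mask) // prints "000000fa"
}
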
@@ -781,7 +781,7 @@ func promotePodToGuaranteed(pod *corev1.Pod) *corev1.Pod {
return pod
}

func getTestPodWithAnnotations(annotations map[string]string, cpus int) *corev1.Pod {
func getTestPodWithProfileAndAnnotations(perfProf *performancev2.PerformanceProfile, annotations map[string]string, cpus int) *corev1.Pod {
testpod := pods.GetTestPod()
if len(annotations) > 0 {
testpod.Annotations = annotations
@@ -801,8 +801,17 @@ func getTestPodWithAnnotations(annotations map[string]string, cpus int) *corev1.
},
}

runtimeClassName := components.GetComponentName(profile.Name, components.ComponentNamePrefix)
testpod.Spec.RuntimeClassName = &runtimeClassName
if perfProf != nil {
runtimeClassName := components.GetComponentName(perfProf.Name, components.ComponentNamePrefix)
testpod.Spec.RuntimeClassName = &runtimeClassName
}

return testpod
}

func getTestPodWithAnnotations(annotations map[string]string, cpus int) *corev1.Pod {
testpod := getTestPodWithProfileAndAnnotations(profile, annotations, cpus)

testpod.Spec.NodeSelector = map[string]string{testutils.LabelHostname: workerRTNode.Name}

return testpod
247 changes: 247 additions & 0 deletions test/e2e/performanceprofile/functests/1_performance/irqbalance.go
@@ -0,0 +1,247 @@
package __performance

import (
"bufio"
"context"
"encoding/json"
"fmt"
"os"
"strings"
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"

machineconfigv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"

performancev2 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/performanceprofile/v2"
"github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/controller/performanceprofile/components"
"github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/controller/performanceprofile/components/tuned"
testutils "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils"
testclient "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/client"
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/discovery"
testlog "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/log"
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/mcps"
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/nodes"
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/pods"
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/profiles"
"github.com/openshift/cluster-node-tuning-operator/test/e2e/util"
"github.com/openshift/cluster-node-tuning-operator/test/framework"
)

var (
cs = framework.NewClientSet()
)

var _ = Describe("[performance] Checking IRQBalance settings", func() {
var workerRTNodes []corev1.Node
var targetNode *corev1.Node
var profile *performancev2.PerformanceProfile
var performanceMCP string
var err error

BeforeEach(func() {
if discovery.Enabled() && testutils.ProfileNotFound {
Skip("Discovery mode enabled, performance profile not found")
}

workerRTNodes, err = nodes.GetByLabels(testutils.NodeSelectorLabels)
Expect(err).ToNot(HaveOccurred())
profile, err = profiles.GetByNodeLabels(testutils.NodeSelectorLabels)
Expect(err).ToNot(HaveOccurred())
performanceMCP, err = mcps.GetByProfile(profile)
Expect(err).ToNot(HaveOccurred())

// Verify that worker and performance MCP have updated state equals to true
for _, mcpName := range []string{testutils.RoleWorker, performanceMCP} {
mcps.WaitForCondition(mcpName, machineconfigv1.MachineConfigPoolUpdated, corev1.ConditionTrue)
}

nodeIdx := pickNodeIdx(workerRTNodes)
targetNode = &workerRTNodes[nodeIdx]
By(fmt.Sprintf("verifying worker node %q", targetNode.Name))
})

Context("Verify irqbalance configuration handling", func() {

It("Should not overwrite the banned CPU set on tuned restart", func() {
if profile.Status.RuntimeClass == nil {
Skip("runtime class not generated")
}

if tuned.IsIRQBalancingGloballyDisabled(profile) {
Skip("this test needs dynamic IRQ balancing")
}

Expect(targetNode).ToNot(BeNil(), "missing target node")

irqAffBegin, err := getIrqDefaultSMPAffinity(targetNode)
Expect(err).ToNot(HaveOccurred(), "failed to extract the default IRQ affinity from node %q", targetNode.Name)
testlog.Infof("IRQ Default affinity on %q when test begins: {%s}", targetNode.Name, irqAffBegin)

bannedCPUs, err := getIrqBalanceBannedCPUs(targetNode)
Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name)
testlog.Infof("banned CPUs on %q when test begins: {%s}", targetNode.Name, bannedCPUs.String())

smpAffinitySet, err := nodes.GetDefaultSmpAffinitySet(targetNode)
Expect(err).ToNot(HaveOccurred(), "failed to get default smp affinity")

onlineCPUsSet, err := nodes.GetOnlineCPUsSet(targetNode)
Expect(err).ToNot(HaveOccurred(), "failed to get Online CPUs list")

// expect irqbalance not to have run in the system already, i.e. start from pristine conditions.
// This is not a hard requirement, just the easier state to manage and check
Expect(smpAffinitySet.Equals(onlineCPUsSet)).To(BeTrue(), "found default_smp_affinity %v, expected %v - IRQBalance already run?", smpAffinitySet, onlineCPUsSet)

cpuRequest := 2 // minimum amount to be reasonably sure we're SMT-aligned
annotations := map[string]string{
"irq-load-balancing.crio.io": "disable",
"cpu-quota.crio.io": "disable",
}
testpod := getTestPodWithProfileAndAnnotations(profile, annotations, cpuRequest)
testpod.Spec.NodeName = targetNode.Name

data, _ := json.Marshal(testpod)
testlog.Infof("using testpod:\n%s", string(data))

err = testclient.Client.Create(context.TODO(), testpod)
Expect(err).ToNot(HaveOccurred())
defer func() {
if testpod != nil {
testlog.Infof("deleting pod %q", testpod.Name)
deleteTestPod(testpod)
}
bannedCPUs, err := getIrqBalanceBannedCPUs(targetNode)
Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name)

testlog.Infof("banned CPUs on %q when test ends: {%s}", targetNode.Name, bannedCPUs.String())

irqAffBegin, err := getIrqDefaultSMPAffinity(targetNode)
Expect(err).ToNot(HaveOccurred(), "failed to extract the default IRQ affinity from node %q", targetNode.Name)

testlog.Infof("IRQ Default affinity on %q when test ends: {%s}", targetNode.Name, irqAffBegin)
}()

err = pods.WaitForCondition(testpod, corev1.PodReady, corev1.ConditionTrue, 10*time.Minute)
logEventsForPod(testpod)
Expect(err).ToNot(HaveOccurred())

// now we have something in the IRQBalance cpu list. Let's make sure the restart doesn't overwrite this data.
postCreateBannedCPUs, err := getIrqBalanceBannedCPUs(targetNode)
Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name)
testlog.Infof("banned CPUs on %q just before the tuned restart: {%s}", targetNode.Name, postCreateBannedCPUs.String())

Expect(postCreateBannedCPUs.IsEmpty()).To(BeFalse(), "banned CPUs %v should not be empty on node %q", postCreateBannedCPUs, targetNode.Name)

By(fmt.Sprintf("getting a TuneD Pod running on node %s", targetNode.Name))
pod, err := util.GetTunedForNode(cs, targetNode)
Expect(err).NotTo(HaveOccurred())

By(fmt.Sprintf("causing a restart of the tuned pod (deleting the pod) on %s", targetNode.Name))
_, _, err = util.ExecAndLogCommand("oc", "delete", "pod", "--wait=true", "-n", pod.Namespace, pod.Name)
Expect(err).NotTo(HaveOccurred())

Eventually(func() error {
By(fmt.Sprintf("getting again a TuneD Pod running on node %s", targetNode.Name))
pod, err = util.GetTunedForNode(cs, targetNode)
if err != nil {
return err
}

By(fmt.Sprintf("waiting for the TuneD daemon running on node %s", targetNode.Name))
_, err = util.WaitForCmdInPod(5*time.Second, 5*time.Minute, pod, "test", "-e", "/run/tuned/tuned.pid")
return err
}).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).ShouldNot(HaveOccurred())

By(fmt.Sprintf("re-verifying worker node %q after TuneD restart", targetNode.Name))
postRestartBannedCPUs, err := getIrqBalanceBannedCPUs(targetNode)
Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name)
testlog.Infof("banned CPUs on %q after the tuned restart: {%s}", targetNode.Name, postRestartBannedCPUs.String())

Expect(postRestartBannedCPUs.ToSlice()).To(Equal(postCreateBannedCPUs.ToSlice()), "banned CPUs %v changed post tuned restart on node %q", postRestartBannedCPUs.ToSlice(), targetNode.Name)
})
})
})

// nodes.BannedCPUs fails (!!!) if the current banned list is empty because, deep down, ExecCommandOnNode expects non-empty stdout.
// In turn, we do this to at least have a chance to detect failed commands vs commands that failed to execute (we had this issue in
// the not-so-distant past: legit command output lost somewhere in the communication). Fixing ExecCommandOnNode isn't trivial and
// requires close attention. For the time being we reimplement a form of nodes.BannedCPUs which can handle an empty ban list.
func getIrqBalanceBannedCPUs(node *corev1.Node) (cpuset.CPUSet, error) {
cmd := []string{"cat", "/rootfs/etc/sysconfig/irqbalance"}
conf, err := nodes.ExecCommandOnNode(cmd, node)
if err != nil {
return cpuset.NewCPUSet(), err
}

keyValue := findIrqBalanceBannedCPUsVarFromConf(conf)
if len(keyValue) == 0 {
// can happen: everything commented out (default if no tuning ever)
testlog.Warningf("cannot find the CPU ban list in the configuration (\n%s)\n", conf)
return cpuset.NewCPUSet(), nil
}

testlog.Infof("banned CPUs setting: %q", keyValue)

items := strings.FieldsFunc(keyValue, func(c rune) bool {
return c == '='
})
if len(items) != 2 {
return cpuset.NewCPUSet(), fmt.Errorf("malformed CPU ban list in the configuration")
}

bannedCPUs := unquote(strings.TrimSpace(items[1]))
testlog.Infof("banned CPUs: %q", bannedCPUs)

banned, err := components.CPUMaskToCPUSet(bannedCPUs)
if err != nil {
return cpuset.NewCPUSet(), fmt.Errorf("failed to parse the banned CPUs: %v", err)
}

return banned, nil
}

func getIrqDefaultSMPAffinity(node *corev1.Node) (string, error) {
cmd := []string{"cat", "/rootfs/proc/irq/default_smp_affinity"}
return nodes.ExecCommandOnNode(cmd, node)
}

func findIrqBalanceBannedCPUsVarFromConf(conf string) string {
scanner := bufio.NewScanner(strings.NewReader(conf))
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "#") {
continue
}
if !strings.HasPrefix(line, "IRQBALANCE_BANNED_CPUS") {
continue
}
return line
}
return ""
}

func pickNodeIdx(nodes []corev1.Node) int {
name, ok := os.LookupEnv("E2E_PAO_TARGET_NODE")
if !ok {
return 0 // "random" default
}
for idx := range nodes {
if nodes[idx].Name == name {
testlog.Infof("node %q found among candidates, picking", name)
return idx
}
}
testlog.Infof("node %q not found among candidates, fall back to random one", name)
return 0 // "safe" default
}

func unquote(s string) string {
q := "\""
s = strings.TrimPrefix(s, q)
s = strings.TrimSuffix(s, q)
return s
}
