Merge pull request kubernetes#118764 from iholder101/Swap/burstableQoS-impl

Add full cgroup v2 swap support with automatically calculated swap limit for LimitedSwap and Burstable QoS Pods
k8s-ci-robot committed Jul 18, 2023
2 parents 1e086cc + 4321d8c commit da2fdf8
Showing 10 changed files with 830 additions and 168 deletions.
9 changes: 9 additions & 0 deletions hack/local-up-cluster.sh
@@ -47,6 +47,8 @@ CGROUP_DRIVER=${CGROUP_DRIVER:-""}
CGROUP_ROOT=${CGROUP_ROOT:-""}
# owner of client certs, default to current user if not specified
USER=${USER:-$(whoami)}
# if true, limited swap is used instead of unlimited swap (the default)
LIMITED_SWAP=${LIMITED_SWAP:-""}

# required for cni installation
CNI_CONFIG_DIR=${CNI_CONFIG_DIR:-/etc/cni/net.d}
@@ -832,6 +834,13 @@ tracing:
EOF
fi

if [[ "$LIMITED_SWAP" == "true" ]]; then
cat <<EOF >> "${TMP_DIR}"/kubelet.yaml
memorySwap:
swapBehavior: LimitedSwap
EOF
fi
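For example, running the script as `LIMITED_SWAP=true hack/local-up-cluster.sh` appends the `memorySwap` stanza above to the generated kubelet.yaml, switching the kubelet from the default UnlimitedSwap behavior to LimitedSwap.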

{
# authentication
echo "authentication:"
5 changes: 3 additions & 2 deletions pkg/features/kube_features.go
@@ -596,8 +596,9 @@ const (
// Allow pods to fail over to a different node in case of non-graceful node shutdown
NodeOutOfServiceVolumeDetach featuregate.Feature = "NodeOutOfServiceVolumeDetach"

// owner: @ehashman
// owner: @iholder101
// alpha: v1.22
// beta1: v1.28. For more info, please look at the KEP: https://kep.k8s.io/2400.
//
// Permits kubelet to run with swap enabled
NodeSwap featuregate.Feature = "NodeSwap"
@@ -1074,7 +1075,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS

NodeOutOfServiceVolumeDetach: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31

NodeSwap: {Default: false, PreRelease: featuregate.Alpha},
NodeSwap: {Default: false, PreRelease: featuregate.Beta},

PDBUnhealthyPodEvictionPolicy: {Default: true, PreRelease: featuregate.Beta},

11 changes: 6 additions & 5 deletions pkg/kubelet/cm/cgroup_manager_linux.go
@@ -45,11 +45,12 @@ import (
const (
// systemdSuffix is the cgroup name suffix for systemd
systemdSuffix string = ".slice"
// MemoryMin is memory.min for cgroup v2
MemoryMin string = "memory.min"
// MemoryHigh is memory.high for cgroup v2
MemoryHigh string = "memory.high"
Cgroup2MaxCpuLimit string = "max"
// Cgroup2MemoryMin is memory.min for cgroup v2
Cgroup2MemoryMin string = "memory.min"
// Cgroup2MemoryHigh is memory.high for cgroup v2
Cgroup2MemoryHigh string = "memory.high"
Cgroup2MaxCpuLimit string = "max"
Cgroup2MaxSwapFilename string = "memory.swap.max"
)

var RootCgroupName = CgroupName([]string{})
2 changes: 1 addition & 1 deletion pkg/kubelet/cm/helpers_linux.go
@@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
}
if memoryMin > 0 {
result.Unified = map[string]string{
MemoryMin: strconv.FormatInt(memoryMin, 10),
Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
}
}
}
2 changes: 1 addition & 1 deletion pkg/kubelet/cm/node_container_manager_linux.go
@@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
if rp.Unified == nil {
rp.Unified = make(map[string]string)
}
rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
}
}

4 changes: 2 additions & 2 deletions pkg/kubelet/cm/qos_container_manager_linux.go
@@ -292,15 +292,15 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10)
configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
}

if guaranteedMin > 0 {
if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
}
}
118 changes: 104 additions & 14 deletions pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
@@ -20,6 +20,9 @@ limitations under the License.
package kuberuntime

import (
"fmt"
cadvisorv1 "github.com/google/cadvisor/info/v1"
kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"math"
"os"
"strconv"
@@ -46,7 +49,7 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config
enforceMemoryQoS := false
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
isCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
@@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,

lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)

if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
switch m.memorySwapBehavior {
case kubelettypes.UnlimitedSwap:
// -1 = unlimited swap
lcr.MemorySwapLimitInBytes = -1
case kubelettypes.LimitedSwap:
fallthrough
swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
default:
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
// Some swapping is still possible.
// Note that if memory limit is 0, memory swap limit is ignored.
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
}
} else {
swapConfigurationHelper.ConfigureNoSwap(lcr)
}

// Set memory.min and memory.high to enforce MemoryQoS
@@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
memoryRequest := container.Resources.Requests.Memory().Value()
memoryLimit := container.Resources.Limits.Memory().Value()
if memoryRequest != 0 {
unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
}

// Guaranteed pods, by their QoS definition, require that memory request equals memory limit and CPU request equals CPU limit.
@@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
}
}
if memoryHigh != 0 && memoryHigh > memoryRequest {
unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
}
}
if len(unified) > 0 {
@@ -171,7 +170,7 @@ func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, cont
enforceMemoryQoS := false
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
isCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
return &runtimeapi.ContainerResources{
@@ -216,7 +215,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit
}

// runc requires cgroupv2 for unified mode
if libcontainercgroups.IsCgroup2UnifiedMode() {
if isCgroup2UnifiedMode() {
resources.Unified = map[string]string{
// Ask the kernel to kill all processes in the container cgroup in case of OOM.
// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
@@ -298,3 +297,94 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k
}
return cStatusResources
}

// Note: this function variable exists so that unit tests can mock the cgroup version
// by assigning a mocked function to it. Without it, the cgroup version would depend
// solely on the environment running the test.
var isCgroup2UnifiedMode = func() bool {
return libcontainercgroups.IsCgroup2UnifiedMode()
}
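As a minimal sketch of how a test in this package might use that seam (the test name is hypothetical and not part of this commit; it assumes the package's existing `runtimeapi` and `cadvisorv1` imports plus the standard `testing` package):

```go
func TestConfigureNoSwapOnCgroupV1(t *testing.T) {
	// Restore the real implementation so later tests see the host's cgroup mode.
	defer func(orig func() bool) { isCgroup2UnifiedMode = orig }(isCgroup2UnifiedMode)

	// Pretend the node runs cgroup v1, regardless of the test environment.
	isCgroup2UnifiedMode = func() bool { return false }

	lcr := &runtimeapi.LinuxContainerResources{MemoryLimitInBytes: 1 << 30}
	newSwapConfigurationHelper(cadvisorv1.MachineInfo{}).ConfigureNoSwap(lcr)

	// On cgroup v1, ConfigureNoSwap caps memory+swap at the memory limit,
	// leaving zero swap available above it.
	if lcr.MemorySwapLimitInBytes != lcr.MemoryLimitInBytes {
		t.Errorf("swap limit = %d, want %d", lcr.MemorySwapLimitInBytes, lcr.MemoryLimitInBytes)
	}
}
```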

type swapConfigurationHelper struct {
machineInfo cadvisorv1.MachineInfo
}

func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
return &swapConfigurationHelper{machineInfo: machineInfo}
}

func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
podQos := kubeapiqos.GetPodQOS(pod)
containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero()
memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0

if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit {
m.ConfigureNoSwap(lcr)
return
}

containerMemoryRequest := container.Resources.Requests.Memory()
swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))

if err != nil {
klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
m.ConfigureNoSwap(lcr)
return
}

m.configureSwap(lcr, swapLimit)
}

func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
if !isCgroup2UnifiedMode() {
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
// Some swapping is still possible.
// Note that if memory limit is 0, memory swap limit is ignored.
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
return
}

m.configureSwap(lcr, 0)
}

func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) {
if !isCgroup2UnifiedMode() {
m.ConfigureNoSwap(lcr)
return
}

if lcr.Unified == nil {
lcr.Unified = map[string]string{}
}

lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max"
}
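Writing the literal string "max" to memory.swap.max is cgroup v2's sentinel for "no limit", which is why ConfigureUnlimitedSwap sets that value rather than a byte count.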

func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
if !isCgroup2UnifiedMode() {
klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
return
}

if lcr.Unified == nil {
lcr.Unified = map[string]string{}
}

lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory)
}

// The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>.
// For more info, please look at the following KEP: https://kep.k8s.io/2400
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
if nodeTotalMemory <= 0 {
return 0, fmt.Errorf("total node memory is 0")
}
if containerMemoryRequest > nodeTotalMemory {
return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
}

containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable)

return int64(swapAllocation), nil
}
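To make the proportional split concrete, here is a standalone sketch with hypothetical numbers (not taken from the commit): on a node with 32 GiB of memory and 8 GiB of swap, a Burstable container requesting 4 GiB is granted (4/32) × 8 GiB = 1 GiB of swap.

```go
package main

import "fmt"

// Mirrors calcSwapForBurstablePods: swap = (request / nodeMemory) * nodeSwap.
func main() {
	const (
		gib                 = int64(1) << 30
		containerMemRequest = 4 * gib  // container memory request
		nodeTotalMemory     = 32 * gib // node memory capacity
		totalPodsSwapAvail  = 8 * gib  // node swap capacity
	)

	proportion := float64(containerMemRequest) / float64(nodeTotalMemory)
	swapLimit := int64(proportion * float64(totalPodsSwapAvail))

	fmt.Printf("swap limit: %d bytes (%.2f GiB)\n", swapLimit, float64(swapLimit)/float64(gib))
	// Prints: swap limit: 1073741824 bytes (1.00 GiB)
}
```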
