[PATCH] Allow CFS period to be tuned per-container
Adds a monzo.com/cpu-period resource, which allows tuning the period of
time over which the kernel tracks CPU throttling. In upstream Kubernetes
versions pre-1.12, this is not tunable and is hardcoded to the kernel
default (100ms).

We originally introduced this after seeing long GC pauses clustered
around 100ms [1], which was eventually traced to CFS throttling.
Essentially, for very latency-sensitive & bursty workloads (like HTTP
microservices!) it's recommended to set the CFS quota period lower. We
mostly set ours at 5ms across the board. See [2] and [3] for further
discussion in the Kubernetes repository.

This is fixed in upstream 1.12 via a slightly different path [4]: the
period is now tunable via a kubelet CLI flag. Because the flag applies
node-wide rather than per-container, it doesn't give us such
fine-grained control, but we can still set it and optimise for the vast
majority of our workloads.

[1] golang/go#19378
[2] kubernetes#51135
[3] kubernetes#67577
[4] kubernetes#63437
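
As an illustration of the interface (a sketch, not code from this commit; it compiles only against this fork's tree, where v1.ResourceCPUPeriodUsec is defined), a container could request a 5ms period alongside its CPU limit:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Sketch only: a container asking for a 5ms CFS period via the
	// monzo.com/cpu-period resource (v1.ResourceCPUPeriodUsec in this fork).
	c := v1.Container{
		Resources: v1.ResourceRequirements{
			Limits: v1.ResourceList{
				v1.ResourceCPU:           resource.MustParse("200m"),
				v1.ResourceCPUPeriodUsec: resource.MustParse("5000"), // usec
			},
		},
	}
	fmt.Println(c.Resources.Limits)
}
```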
milesbxf committed Mar 19, 2019
1 parent 4e209c9 commit a4c1c69
Showing 6 changed files with 214 additions and 46 deletions.
25 changes: 15 additions & 10 deletions pkg/kubelet/cm/helpers_linux.go
@@ -25,7 +25,7 @@ import (

libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"

"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/api/v1/resource"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
@@ -39,12 +39,13 @@ const (
MilliCPUToCPU = 1000

// 100000 is equivalent to 100ms
QuotaPeriod = 100000
MinQuotaPeriod = 1000
DefaultQuotaPeriod uint64 = 100000
MinQuotaPeriod int64 = 1000
)

// MilliCPUToQuota converts milliCPU to CFS quota and period values.
func MilliCPUToQuota(milliCPU int64) (quota int64, period uint64) {
// MilliCPUToQuota takes milliCPU (along with a CFS period, in usec) and returns
// a CFS quota value
func MilliCPUToQuota(milliCPU, period int64) (quota int64) {
// CFS quota is measured in two values:
// - cfs_period_us=100ms (the amount of time to measure usage across)
// - cfs_quota=20ms (the amount of cpu time allowed to be used across a period)
@@ -55,11 +56,8 @@ func MilliCPUToQuota(milliCPU int64) (quota int64, period uint64) {
return
}

// we set the period to 100ms by default
period = QuotaPeriod

// we then convert your milliCPU to a value normalized over a period
quota = (milliCPU * QuotaPeriod) / MilliCPUToCPU
quota = (milliCPU * period) / MilliCPUToCPU

// quota needs to be a minimum of 1ms.
if quota < MinQuotaPeriod {
@@ -109,20 +107,27 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool) *ResourceConfig {

cpuRequests := int64(0)
cpuLimits := int64(0)
cpuPeriod := DefaultQuotaPeriod
memoryLimits := int64(0)
if request, found := reqs[v1.ResourceCPU]; found {
cpuRequests = request.MilliValue()
}
if limit, found := limits[v1.ResourceCPU]; found {
cpuLimits = limit.MilliValue()
}
if limit, found := limits[v1.ResourceCPUPeriodUsec]; found {
cpuPeriod = uint64(limit.Value())
}
if limit, found := limits[v1.ResourceMemory]; found {
memoryLimits = limit.Value()
}

// convert to CFS values
cpuShares := MilliCPUToShares(cpuRequests)
cpuQuota, cpuPeriod := MilliCPUToQuota(cpuLimits)

// TODO: possibility of uint64 -> int64 overflow; we assume that users of
// monzo.com/cpu-period won't put in stupidly large numbers
cpuQuota := MilliCPUToQuota(cpuLimits, int64(cpuPeriod))

// track if limits were applied for each resource.
memoryLimitsDeclared := true
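To make the new conversion concrete, here is a standalone sketch (an illustration, not part of the commit) mirroring the patched MilliCPUToQuota; note how a shorter period shrinks the quota proportionally while the fraction of CPU stays the same:

```go
package main

import "fmt"

// milliCPUToQuota mirrors the patched MilliCPUToQuota above:
// quota = milliCPU * period / 1000, clamped to a 1ms (1000us) minimum.
func milliCPUToQuota(milliCPU, period int64) int64 {
	if milliCPU == 0 {
		return 0
	}
	quota := (milliCPU * period) / 1000
	if quota < 1000 {
		quota = 1000
	}
	return quota
}

func main() {
	// Default 100ms period: a 200m limit may burn 20ms of CPU per period.
	fmt.Println(milliCPUToQuota(200, 100000)) // 20000
	// 5ms period (what Monzo mostly uses): still 20% of one CPU, but the
	// throttle window is 20x shorter, so individual stalls are much briefer.
	fmt.Println(milliCPUToQuota(200, 5000)) // 1000
}
```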
120 changes: 89 additions & 31 deletions pkg/kubelet/cm/helpers_linux_test.go
@@ -22,9 +22,10 @@ import (
"reflect"
"testing"

"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"strconv"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)

// getResourceList returns a ResourceList with the
@@ -54,12 +55,15 @@ func TestResourceConfigForPod(t *testing.T) {
memoryQuantity := resource.MustParse("200Mi")
burstableMemory := memoryQuantity.Value()
burstablePartialShares := MilliCPUToShares(200)
burstableQuota, burstablePeriod := MilliCPUToQuota(200)
burstablePeriod := int64(DefaultQuotaPeriod)
burstableQuota := MilliCPUToQuota(200, burstablePeriod)
guaranteedShares := MilliCPUToShares(100)
guaranteedQuota, guaranteedPeriod := MilliCPUToQuota(100)
guaranteedPeriod := int64(10000)
guaranteedQuota := MilliCPUToQuota(100, guaranteedPeriod)
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()

testCases := map[string]struct {
pod *v1.Pod
expected *ResourceConfig
@@ -70,7 +74,10 @@
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("", ""), getResourceList("", "")),
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{},
Limits: v1.ResourceList{},
},
},
},
},
@@ -83,7 +90,13 @@
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("100Mi"),
},
Limits: v1.ResourceList{},
},
},
},
},
@@ -96,7 +109,16 @@
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("100Mi"),
},
Limits: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("200m"),
v1.ResourceMemory: resource.MustParse("200Mi"),
},
},
},
},
},
@@ -122,10 +144,25 @@
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("100Mi"),
},
Limits: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("200m"),
v1.ResourceMemory: resource.MustParse("200Mi"),
},
},
},
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("100Mi"),
},
Limits: v1.ResourceList{},
},
},
},
},
@@ -138,7 +175,18 @@
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("100Mi"),
v1.ResourceCPUPeriodUsec: resource.MustParse("10000"),
},
Limits: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("100Mi"),
v1.ResourceCPUPeriodUsec: resource.MustParse("10000"),
},
},
},
},
},
@@ -179,55 +227,65 @@

func TestMilliCPUToQuota(t *testing.T) {
testCases := []struct {
input int64
cpu int64
period int64
quota int64
period uint64
}{
{
input: int64(0),
cpu: int64(0),
period: int64(100000),
quota: int64(0),
period: uint64(0),
},
{
input: int64(5),
cpu: int64(5),
period: int64(100000),
quota: int64(1000),
period: uint64(100000),
},
{
input: int64(9),
cpu: int64(9),
period: int64(100000),
quota: int64(1000),
period: uint64(100000),
},
{
input: int64(10),
cpu: int64(10),
period: int64(100000),
quota: int64(1000),
period: uint64(100000),
},
{
input: int64(200),
cpu: int64(200),
period: int64(100000),
quota: int64(20000),
period: uint64(100000),
},
{
input: int64(500),
cpu: int64(500),
period: int64(100000),
quota: int64(50000),
period: uint64(100000),
},
{
input: int64(1000),
cpu: int64(1000),
period: int64(100000),
quota: int64(100000),
period: uint64(100000),
},
{
input: int64(1500),
cpu: int64(1500),
period: int64(100000),
quota: int64(150000),
period: uint64(100000),
},
{
cpu: int64(1500),
period: int64(10000),
quota: int64(15000),
},
{
cpu: int64(250),
period: int64(5000),
quota: int64(1250),
},
}
for _, testCase := range testCases {
quota, period := MilliCPUToQuota(testCase.input)
if quota != testCase.quota || period != testCase.period {
t.Errorf("Input %v, expected quota %v period %v, but got quota %v period %v", testCase.input, testCase.quota, testCase.period, quota, period)
quota := MilliCPUToQuota(testCase.cpu, testCase.period)
if quota != testCase.quota {
t.Errorf("Input (cpu=%d, period=%d), expected quota=%d but got quota=%d", testCase.cpu, testCase.period, testCase.quota, quota)
}
}
}
4 changes: 3 additions & 1 deletion pkg/kubelet/kubelet_node_status.go
@@ -27,7 +27,7 @@ import (
"time"

"github.com/golang/glog"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -575,6 +575,7 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI)
node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi")
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(kl.maxPods), resource.DecimalSI)
node.Status.Capacity[v1.ResourceCPUPeriodUsec] = *resource.NewQuantity(int64(kl.maxPods)*10000, resource.DecimalSI)
glog.Errorf("Error getting machine info: %v", err)
} else {
node.Status.NodeInfo.MachineID = info.MachineID
@@ -680,6 +681,7 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Allocatable[v1.ResourceMemory] = allocatableMemory
}
}
node.Status.Allocatable[v1.ResourceCPUPeriodUsec] = *resource.NewQuantity(int64(kl.maxPods)*10000, resource.DecimalSI)
}

// Set versioninfo for the node.
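The two lines above advertise maxPods * 10000 usec of monzo.com/cpu-period as node capacity/allocatable, i.e. enough for every pod to claim a 10ms period on average. A trivial sketch of that arithmetic (assuming kl.maxPods = 110, the common kubelet default; not code from this commit):

```go
package main

import "fmt"

func main() {
	// Assumption: maxPods = 110 (the usual kubelet default).
	maxPods := int64(110)
	// Each node advertises maxPods * 10000 usec of monzo.com/cpu-period.
	fmt.Println(maxPods * 10000) // 1100000 usec of schedulable "period"
}
```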
51 changes: 50 additions & 1 deletion pkg/kubelet/kuberuntime/helpers.go
@@ -23,14 +23,25 @@ import (
"strings"

"github.com/golang/glog"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/features"
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)

const (
// Taken from lmctfy https://github.com/google/lmctfy/blob/master/lmctfy/controllers/cpu_controller.cc
minShares = 2
sharesPerCPU = 1024
milliCPUToCPU = 1000

// 100000 is equivalent to 100ms
defaultQuotaPeriod int64 = 100000
minQuotaPeriod int64 = 1000
)

type podsByID []*kubecontainer.Pod

func (b podsByID) Len() int { return len(b) }
@@ -148,6 +159,44 @@ func isContainerFailed(status *kubecontainer.ContainerStatus) bool {
return false
}

// milliCPUToShares converts milliCPU to CPU shares
func milliCPUToShares(milliCPU int64) int64 {
if milliCPU == 0 {
// Return 2 here to really match kernel default for zero milliCPU.
return minShares
}
// Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU, but factored to improve rounding.
shares := (milliCPU * sharesPerCPU) / milliCPUToCPU
if shares < minShares {
return minShares
}
return shares
}

// milliCPUToQuota takes milliCPU (along with a CFS period, in usec) and returns
// a CFS quota value
func milliCPUToQuota(milliCPU, period int64) (quota int64) {
// CFS quota is measured in two values:
// - cfs_period_us=100ms (the amount of time to measure usage across)
// - cfs_quota=20ms (the amount of cpu time allowed to be used across a period)
// so in the above example, you are limited to 20% of a single CPU
// for multi-cpu environments, you just scale equivalent amounts

if milliCPU == 0 {
return
}

// we then convert your milliCPU to a value normalized over a period
quota = (milliCPU * period) / milliCPUToCPU

// quota needs to be a minimum of 1ms.
if quota < minQuotaPeriod {
quota = minQuotaPeriod
}

return
}

// getStableKey generates a key (string) to uniquely identify a
// (pod, container) tuple. The key should include the content of the
// container, so that any change to the container generates a new key.
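And for completeness, the shares conversion added above, reproduced as a runnable standalone sketch (same formula as milliCPUToShares, my own illustration rather than the commit's code):

```go
package main

import "fmt"

// Mirrors milliCPUToShares above: shares = milliCPU * 1024 / 1000,
// floored at 2, the kernel's minimum for cpu.shares.
func milliCPUToShares(milliCPU int64) int64 {
	const (
		minShares     = 2
		sharesPerCPU  = 1024
		milliCPUToCPU = 1000
	)
	if milliCPU == 0 {
		return minShares
	}
	shares := (milliCPU * sharesPerCPU) / milliCPUToCPU
	if shares < minShares {
		return minShares
	}
	return shares
}

func main() {
	fmt.Println(milliCPUToShares(0))    // 2 (kernel minimum)
	fmt.Println(milliCPUToShares(250))  // 256
	fmt.Println(milliCPUToShares(1500)) // 1536
}
```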
