forked from kubernetes/kubernetes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
nodeoutofdisk.go
269 lines (222 loc) · 10.2 KB
/
nodeoutofdisk.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package node
import (
"encoding/json"
"fmt"
"time"
cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
// Size units and tuning knobs shared by the NodeOutOfDisk e2e test below.
const (
// mb is the number of bytes in one mebibyte.
mb = 1024 * 1024
// gb is the number of bytes in one gibibyte.
gb = 1024 * mb
// TODO(madhusudancs): find a way to query kubelet's disk space manager to obtain this value. 256MB
// is the default that is set today. This test might break if the default value changes. This value
// can be configured by setting the "low-diskspace-threshold-mb" flag while starting a kubelet.
// However, kubelets are started as part of the cluster start up, once, before any e2e test is run,
// and remain unchanged until all the tests are run and the cluster is brought down. Changing the
// flag value affects all the e2e tests. So we are hard-coding this value for now.
lowDiskSpaceThreshold uint64 = 256 * mb
// nodeOODTimeOut is how long the helpers wait for a node's NodeOutOfDisk condition to reach
// the expected status after filling or recovering its disk.
nodeOODTimeOut = 5 * time.Minute
// numNodeOODPods is the total number of pods the test creates; all but the last one are
// expected to run on the unfilled node.
numNodeOODPods = 3
)
// Plan:
// 1. Fill disk space on all nodes except one. One node is left out so that we can schedule pods
// on that node. Arbitrarily choose that node to be node with index 0. This makes this a disruptive test.
// 2. Get the CPU capacity on unfilled node.
// 3. Divide the available CPU into one less than the number of pods we want to schedule. We want
// to schedule 3 pods, so divide CPU capacity by 2.
// 4. Request the divided CPU for each pod.
// 5. Observe that 2 of the pods schedule onto the node whose disk is not full, and the remaining
// pod stays pending and does not schedule onto the nodes whose disks are full nor the node
// with the other two pods, since there is not enough free CPU capacity there.
// 6. Recover disk space from one of the nodes whose disk space was previously filled. Arbitrarily
// choose that node to be node with index 1.
// 7. Observe that the pod in pending status schedules on that node.
//
// Flaky issue #20015. We have no clear path for how to test this functionality in a non-flaky way.
// NodeOutOfDisk verifies that pods are not scheduled onto nodes reporting the NodeOutOfDisk
// condition, and that a pending pod gets scheduled once one of those nodes recovers disk space.
var _ = SIGDescribe("NodeOutOfDisk [Serial] [Flaky] [Disruptive]", func() {
	var c clientset.Interface
	var unfilledNodeName, recoveredNodeName string
	f := framework.NewDefaultFramework("node-outofdisk")

	// Fill the disk of every ready schedulable node except the first one, which is kept
	// free so that pods can still be scheduled onto it.
	BeforeEach(func() {
		c = f.ClientSet

		// NOTE(review): the test is currently disabled via this unconditional skip; the
		// setup below is presumably unreachable until the skip is removed.
		framework.Skipf("test is broken. #40249")

		nodelist := framework.GetReadySchedulableNodesOrDie(c)

		// Skip this test on small clusters. No need to fail since it is not a use
		// case that any cluster of small size needs to support.
		framework.SkipUnlessNodeCountIsAtLeast(2)

		unfilledNodeName = nodelist.Items[0].Name
		// Index into the slice rather than taking the address of the range variable.
		filled := nodelist.Items[1:]
		for i := range filled {
			fillDiskSpace(c, &filled[i])
		}
	})

	// Recover disk space on every node that was filled and has not already been recovered
	// by the test body.
	AfterEach(func() {
		nodelist := framework.GetReadySchedulableNodesOrDie(c)
		Expect(len(nodelist.Items)).ToNot(BeZero())
		for i := range nodelist.Items {
			node := &nodelist.Items[i]
			if unfilledNodeName == node.Name || recoveredNodeName == node.Name {
				continue
			}
			recoverDiskSpace(c, node)
		}
	})

	It("runs out of disk space", func() {
		unfilledNode, err := c.Core().Nodes().Get(unfilledNodeName, metav1.GetOptions{})
		framework.ExpectNoError(err)

		By(fmt.Sprintf("Calculating CPU availability on node %s", unfilledNode.Name))
		milliCPU, err := availCpu(c, unfilledNode)
		framework.ExpectNoError(err)

		// Per pod CPU should be just enough to fit only (numNodeOODPods - 1) pods on the given
		// node. We compute this value by dividing the available CPU capacity on the node by
		// (numNodeOODPods - 1) and subtracting ϵ from it. We arbitrarily choose ϵ to be 1%
		// of the available CPU per pod, i.e. 0.01 * milliCPU/(numNodeOODPods-1). Instead of
		// subtracting 1% from the value, we directly use 0.99 as the multiplier.
		podCPU := int64(float64(milliCPU/(numNodeOODPods-1)) * 0.99)

		ns := f.Namespace.Name
		podClient := c.Core().Pods(ns)

		By("Creating pods and waiting for all but one pods to be scheduled")

		for i := 0; i < numNodeOODPods-1; i++ {
			name := fmt.Sprintf("pod-node-outofdisk-%d", i)
			createOutOfDiskPod(c, ns, name, podCPU)

			framework.ExpectNoError(f.WaitForPodRunning(name))
			pod, err := podClient.Get(name, metav1.GetOptions{})
			framework.ExpectNoError(err)
			Expect(pod.Spec.NodeName).To(Equal(unfilledNodeName))
		}

		pendingPodName := fmt.Sprintf("pod-node-outofdisk-%d", numNodeOODPods-1)
		createOutOfDiskPod(c, ns, pendingPodName, podCPU)

		By(fmt.Sprintf("Finding a failed scheduler event for pod %s", pendingPodName))
		// The selector does not change between polls, so build it once.
		selector := fields.Set{
			"involvedObject.kind":      "Pod",
			"involvedObject.name":      pendingPodName,
			"involvedObject.namespace": ns,
			"source":                   v1.DefaultSchedulerName,
			"reason":                   "FailedScheduling",
		}.AsSelector().String()
		options := metav1.ListOptions{FieldSelector: selector}
		// The poll result must be checked: previously a timeout was silently ignored and the
		// test carried on without ever having seen the pod fail to schedule.
		err = wait.Poll(2*time.Second, 5*time.Minute, func() (bool, error) {
			schedEvents, err := c.Core().Events(ns).List(options)
			if err != nil {
				return false, err
			}
			return len(schedEvents.Items) > 0, nil
		})
		framework.ExpectNoError(err, "did not find a FailedScheduling event for pod %s", pendingPodName)

		nodelist := framework.GetReadySchedulableNodesOrDie(c)
		Expect(len(nodelist.Items)).To(BeNumerically(">", 1))

		nodeToRecover := nodelist.Items[1]
		Expect(nodeToRecover.Name).ToNot(Equal(unfilledNodeName))

		recoverDiskSpace(c, &nodeToRecover)
		// Record the recovered node so that AfterEach does not try to recover it again.
		recoveredNodeName = nodeToRecover.Name

		By(fmt.Sprintf("Verifying that pod %s schedules on node %s", pendingPodName, recoveredNodeName))
		framework.ExpectNoError(f.WaitForPodRunning(pendingPodName))
		pendingPod, err := podClient.Get(pendingPodName, metav1.GetOptions{})
		framework.ExpectNoError(err)
		Expect(pendingPod.Spec.NodeName).To(Equal(recoveredNodeName))
	})
})
// createOutOfDiskPod creates a single-container "pause" pod called name in namespace ns whose
// container requests milliCPU millicores of CPU. The spec fails if the pod cannot be created.
func createOutOfDiskPod(c clientset.Interface, ns, name string, milliCPU int64) {
	// Request enough CPU to fit only two pods on a given node.
	cpuRequest := v1.ResourceList{
		v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
	}

	pauseContainer := v1.Container{
		Name:      "pause",
		Image:     framework.GetPauseImageName(c),
		Resources: v1.ResourceRequirements{Requests: cpuRequest},
	}

	spec := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: name},
		Spec:       v1.PodSpec{Containers: []v1.Container{pauseContainer}},
	}

	_, err := c.Core().Pods(ns).Create(spec)
	framework.ExpectNoError(err)
}
// availCpu returns the CPU, in millicores, still unrequested on the given node: the node's CPU
// capacity (Status.Capacity) minus the CPU requests of every container in Spec.Containers of
// every pod, in any namespace, whose spec.nodeName matches the node. An error is returned if
// the pods cannot be listed.
func availCpu(c clientset.Interface, node *v1.Node) (int64, error) {
	listOpts := metav1.ListOptions{
		FieldSelector: fields.Set{"spec.nodeName": node.Name}.AsSelector().String(),
	}
	podList, err := c.Core().Pods(metav1.NamespaceAll).List(listOpts)
	if err != nil {
		return 0, fmt.Errorf("failed to retrieve all the pods on node %s: %v", node.Name, err)
	}

	// Sum up every container's CPU request first, then take it off the capacity.
	var requested int64
	for i := range podList.Items {
		containers := podList.Items[i].Spec.Containers
		for j := range containers {
			requested += containers[j].Resources.Requests.Cpu().MilliValue()
		}
	}
	return node.Status.Capacity.Cpu().MilliValue() - requested, nil
}
// availSize returns the available disk space, in bytes, on a given node by querying node stats
// which is in turn obtained internally from cadvisor. The value is taken from the first
// filesystem entry of the last stats sample in the response.
//
// An error is returned if the stats request fails, if the response cannot be decoded, or if
// the response contains no stats sample or no filesystem entry to read the value from.
func availSize(c clientset.Interface, node *v1.Node) (uint64, error) {
	statsResource := fmt.Sprintf("api/v1/proxy/nodes/%s/stats/", node.Name)
	framework.Logf("Querying stats for node %s using url %s", node.Name, statsResource)
	res, err := c.Core().RESTClient().Get().AbsPath(statsResource).Timeout(time.Minute).Do().Raw()
	if err != nil {
		return 0, fmt.Errorf("error querying cAdvisor API: %v", err)
	}

	ci := cadvisorapi.ContainerInfo{}
	if err := json.Unmarshal(res, &ci); err != nil {
		return 0, fmt.Errorf("couldn't unmarshal container info: %v", err)
	}

	// Guard the indexing below: an empty or partial response must surface as an error to the
	// caller rather than as an index-out-of-range or nil-pointer panic.
	if len(ci.Stats) == 0 {
		return 0, fmt.Errorf("no container stats returned for node %s", node.Name)
	}
	latest := ci.Stats[len(ci.Stats)-1]
	if latest == nil || len(latest.Filesystem) == 0 {
		return 0, fmt.Errorf("no filesystem stats returned for node %s", node.Name)
	}
	return latest.Filesystem[0].Available, nil
}
// fillDiskSpace fills the available disk space on a given node by creating a large file. The disk
// space on the node is filled in such a way that the available space after filling the disk is just
// below the lowDiskSpaceThreshold mark (100MB below it).
//
// If the node already has no more than that amount available, no file is created. The function
// then waits up to nodeOODTimeOut for the node to report the NodeOutOfDisk condition and verifies
// that the available space is below lowDiskSpaceThreshold. Any failure fails the running spec.
func fillDiskSpace(c clientset.Interface, node *v1.Node) {
	// Free space to leave on the node once the fill file has been created.
	const targetAvail = lowDiskSpaceThreshold - 100*mb

	avail, err := availSize(c, node)
	framework.ExpectNoError(err, "Node %s: couldn't obtain available disk size %v", node.Name, err)
	framework.Logf("Node %s: disk space available %d bytes", node.Name, avail)

	// Only compute the fill size when it is positive. The unsigned subtraction would otherwise
	// wrap around to an enormous value and be passed to fallocate.
	if avail > targetAvail {
		fillSize := avail - targetAvail
		By(fmt.Sprintf("Node %s: creating a file of size %d bytes to fill the available disk space", node.Name, fillSize))

		cmd := fmt.Sprintf("fallocate -l %d test.img", fillSize)
		framework.ExpectNoError(framework.IssueSSHCommand(cmd, framework.TestContext.Provider, node))
	} else {
		framework.Logf("Node %s: available disk space is already at most %d bytes, not creating a fill file", node.Name, uint64(targetAvail))
	}

	ood := framework.WaitForNodeToBe(c, node.Name, v1.NodeOutOfDisk, true, nodeOODTimeOut)
	Expect(ood).To(BeTrue(), "Node %s did not run out of disk within %v", node.Name, nodeOODTimeOut)

	// Check the error of the second query too: on failure avail would be 0 and the assertion
	// below would pass without having verified anything.
	avail, err = availSize(c, node)
	framework.ExpectNoError(err, "Node %s: couldn't obtain available disk size %v", node.Name, err)
	framework.Logf("Node %s: disk space available %d bytes", node.Name, avail)
	Expect(avail < lowDiskSpaceThreshold).To(BeTrue())
}
// recoverDiskSpace frees the disk space taken up by the fill file on the given node by issuing
// "rm -f test.img" on it, then waits up to nodeOODTimeOut for the node's NodeOutOfDisk condition
// to become false. The running spec fails if the command or the wait does not succeed.
func recoverDiskSpace(c clientset.Interface, node *v1.Node) {
	By(fmt.Sprintf("Recovering disk space on node %s", node.Name))

	err := framework.IssueSSHCommand("rm -f test.img", framework.TestContext.Provider, node)
	framework.ExpectNoError(err)

	recovered := framework.WaitForNodeToBe(c, node.Name, v1.NodeOutOfDisk, false, nodeOODTimeOut)
	Expect(recovered).To(BeTrue(), "Node %s's out of disk condition status did not change to false within %v", node.Name, nodeOODTimeOut)
}