predicates.go
/*
Copyright 2014 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package predicates

import (
	"fmt"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
	"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"

	"github.com/golang/glog"
)

// NodeInfo abstracts how a predicate looks up a node by name.
type NodeInfo interface {
	GetNodeInfo(nodeID string) (*api.Node, error)
}

// StaticNodeInfo implements NodeInfo on top of a fixed, in-memory NodeList.
type StaticNodeInfo struct {
	*api.NodeList
}

// GetNodeInfo returns the node with the given name from the wrapped NodeList.
func (nodes StaticNodeInfo) GetNodeInfo(nodeID string) (*api.Node, error) {
	for ix := range nodes.Items {
		if nodes.Items[ix].Name == nodeID {
			return &nodes.Items[ix], nil
		}
	}
	return nil, fmt.Errorf("failed to find node: %s, %#v", nodeID, nodes)
}

// ClientNodeInfo implements NodeInfo by querying the API server.
type ClientNodeInfo struct {
	*client.Client
}

// GetNodeInfo fetches the node with the given name from the API server.
func (nodes ClientNodeInfo) GetNodeInfo(nodeID string) (*api.Node, error) {
	return nodes.Nodes().Get(nodeID)
}
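
// exampleStaticNodeInfo is an illustrative sketch, not part of the original file:
// it shows how StaticNodeInfo wraps a fixed NodeList so predicates can resolve a
// node name without an API call. The node name and label values are hypothetical,
// and the api.ObjectMeta literal is an assumption about the api package.
func exampleStaticNodeInfo() {
	info := StaticNodeInfo{&api.NodeList{
		Items: []api.Node{
			{ObjectMeta: api.ObjectMeta{Name: "node-1", Labels: map[string]string{"zone": "us-central1-a"}}},
		},
	}}
	node, err := info.GetNodeInfo("node-1")
	if err != nil {
		glog.Errorf("node lookup failed: %v", err)
		return
	}
	glog.Infof("resolved node %s with labels %v", node.Name, node.Labels)
}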

// isVolumeConflict reports whether the given volume clashes with a volume already
// requested by the given pod. Two GCE persistent disk volumes conflict when they
// reference the same PD and at least one of them is writable; two AWS EBS volumes
// conflict whenever they reference the same volume ID.
func isVolumeConflict(volume api.Volume, pod *api.Pod) bool {
	if volume.GCEPersistentDisk != nil {
		disk := volume.GCEPersistentDisk
		manifest := &(pod.Spec)
		for ix := range manifest.Volumes {
			if manifest.Volumes[ix].GCEPersistentDisk != nil &&
				manifest.Volumes[ix].GCEPersistentDisk.PDName == disk.PDName &&
				!(manifest.Volumes[ix].GCEPersistentDisk.ReadOnly && disk.ReadOnly) {
				return true
			}
		}
	}
	if volume.AWSElasticBlockStore != nil {
		volumeID := volume.AWSElasticBlockStore.VolumeID
		manifest := &(pod.Spec)
		for ix := range manifest.Volumes {
			if manifest.Volumes[ix].AWSElasticBlockStore != nil &&
				manifest.Volumes[ix].AWSElasticBlockStore.VolumeID == volumeID {
				return true
			}
		}
	}
	return false
}

// NoDiskConflict evaluates whether a pod can fit based on the volumes it requests and those that
// are already mounted. Some types of volumes are mounted directly onto node machines; for now
// these mounts are exclusive, so if such a volume is already mounted on a node, another pod
// requesting it can't be scheduled there. This is GCE PD and AWS EBS specific for now.
// TODO: migrate this into some per-volume specific code?
func NoDiskConflict(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
	manifest := &(pod.Spec)
	for ix := range manifest.Volumes {
		for podIx := range existingPods {
			if isVolumeConflict(manifest.Volumes[ix], existingPods[podIx]) {
				return false, nil
			}
		}
	}
	return true, nil
}
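
// exampleNoDiskConflict is an illustrative sketch, not part of the original file:
// two pods requesting the same GCE persistent disk read-write conflict, so the
// candidate pod does not fit on the node. The disk name is hypothetical, and the
// api.VolumeSource / api.GCEPersistentDiskVolumeSource literal names are
// assumptions about the api package.
func exampleNoDiskConflict() {
	pdVolume := api.Volume{
		Name: "data",
		VolumeSource: api.VolumeSource{
			GCEPersistentDisk: &api.GCEPersistentDiskVolumeSource{PDName: "example-pd"},
		},
	}
	existing := &api.Pod{Spec: api.PodSpec{Volumes: []api.Volume{pdVolume}}}
	candidate := &api.Pod{Spec: api.PodSpec{Volumes: []api.Volume{pdVolume}}}
	fits, _ := NoDiskConflict(candidate, []*api.Pod{existing}, "node-1")
	fmt.Printf("pod fits: %v\n", fits) // false: the PD is already mounted read-write
}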

// ResourceFit holds the NodeInfo needed to check whether a pod's resource
// requests fit on a node.
type ResourceFit struct {
	info NodeInfo
}

type resourceRequest struct {
	milliCPU int64
	memory   int64
}

// getResourceRequest sums the CPU (in millicores) and memory limits of all
// containers in the pod.
func getResourceRequest(pod *api.Pod) resourceRequest {
	result := resourceRequest{}
	for ix := range pod.Spec.Containers {
		limits := pod.Spec.Containers[ix].Resources.Limits
		result.memory += limits.Memory().Value()
		result.milliCPU += limits.Cpu().MilliValue()
	}
	return result
}

// CheckPodsExceedingCapacity walks the given pods in order and splits them into those
// that fit within the remaining capacity and those that do not. A capacity of zero for
// CPU or memory is treated as unlimited for that resource.
func CheckPodsExceedingCapacity(pods []*api.Pod, capacity api.ResourceList) (fitting []*api.Pod, notFitting []*api.Pod) {
	totalMilliCPU := capacity.Cpu().MilliValue()
	totalMemory := capacity.Memory().Value()
	milliCPURequested := int64(0)
	memoryRequested := int64(0)
	for _, pod := range pods {
		podRequest := getResourceRequest(pod)
		fitsCPU := totalMilliCPU == 0 || (totalMilliCPU-milliCPURequested) >= podRequest.milliCPU
		fitsMemory := totalMemory == 0 || (totalMemory-memoryRequested) >= podRequest.memory
		if !fitsCPU || !fitsMemory {
			// the pod doesn't fit
			notFitting = append(notFitting, pod)
			continue
		}
		// the pod fits
		milliCPURequested += podRequest.milliCPU
		memoryRequested += podRequest.memory
		fitting = append(fitting, pod)
	}
	return
}
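
// exampleCapacityCheck is an illustrative sketch, not part of the original file:
// given a node's capacity, CheckPodsExceedingCapacity splits the candidate pods,
// in list order, into those that still fit and those whose CPU or memory limits
// would exceed what remains.
func exampleCapacityCheck(capacity api.ResourceList, pods []*api.Pod) {
	fitting, notFitting := CheckPodsExceedingCapacity(pods, capacity)
	glog.Infof("%d pods fit, %d pods exceed capacity", len(fitting), len(notFitting))
}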

// PodFitsResources calculates fit based on requested, rather than used, resources.
func (r *ResourceFit) PodFitsResources(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
	podRequest := getResourceRequest(pod)
	info, err := r.info.GetNodeInfo(node)
	if err != nil {
		return false, err
	}
	if podRequest.milliCPU == 0 && podRequest.memory == 0 {
		return int64(len(existingPods)) < info.Status.Capacity.Pods().Value(), nil
	}
	// work on a copy so the caller's slice is never mutated
	pods := append([]*api.Pod{}, existingPods...)
	pods = append(pods, pod)
	_, exceeding := CheckPodsExceedingCapacity(pods, info.Status.Capacity)
	if len(exceeding) > 0 || int64(len(pods)) > info.Status.Capacity.Pods().Value() {
		glog.V(4).Infof("Cannot schedule Pod %v, because Node %v is full, running %v out of %v Pods.", pod, node, len(pods)-1, info.Status.Capacity.Pods().Value())
		return false, nil
	}
	glog.V(4).Infof("Schedule Pod %v on Node %v is allowed, Node is running only %v out of %v Pods.", pod, node, len(pods)-1, info.Status.Capacity.Pods().Value())
	return true, nil
}

// NewResourceFitPredicate returns a FitPredicate that checks resource fit using the given NodeInfo.
func NewResourceFitPredicate(info NodeInfo) algorithm.FitPredicate {
	fit := &ResourceFit{
		info: info,
	}
	return fit.PodFitsResources
}

// NewSelectorMatchPredicate returns a FitPredicate that checks the pod's node selector
// against node labels using the given NodeInfo.
func NewSelectorMatchPredicate(info NodeInfo) algorithm.FitPredicate {
	selector := &NodeSelector{
		info: info,
	}
	return selector.PodSelectorMatches
}
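
// exampleFitPredicateWiring is an illustrative sketch, not part of the original file:
// it shows how a scheduler configuration would typically assemble these predicates,
// here backed by a ClientNodeInfo so node lookups go through the API server. The
// client value is assumed to be provided by the caller.
func exampleFitPredicateWiring(c *client.Client) []algorithm.FitPredicate {
	info := ClientNodeInfo{c}
	return []algorithm.FitPredicate{
		NewResourceFitPredicate(info),
		NewSelectorMatchPredicate(info),
		PodFitsPorts,
		NoDiskConflict,
	}
}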

// PodMatchesNodeLabels checks the pod's NodeSelector against the node's labels; a pod
// with an empty selector matches any node.
func PodMatchesNodeLabels(pod *api.Pod, node *api.Node) bool {
	if len(pod.Spec.NodeSelector) == 0 {
		return true
	}
	selector := labels.SelectorFromSet(pod.Spec.NodeSelector)
	return selector.Matches(labels.Set(node.Labels))
}

// NodeSelector holds the NodeInfo needed to check a pod's node selector against a node.
type NodeSelector struct {
	info NodeInfo
}

// PodSelectorMatches reports whether the pod's NodeSelector matches the labels of the given node.
func (n *NodeSelector) PodSelectorMatches(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
	minion, err := n.info.GetNodeInfo(node)
	if err != nil {
		return false, err
	}
	return PodMatchesNodeLabels(pod, minion), nil
}
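
// examplePodMatchesNodeLabels is an illustrative sketch, not part of the original file:
// the pod's selector must be a subset of the node's labels to match. The label keys are
// hypothetical, and the api.ObjectMeta literal is an assumption about the api package.
func examplePodMatchesNodeLabels() {
	pod := &api.Pod{Spec: api.PodSpec{NodeSelector: map[string]string{"disktype": "ssd"}}}
	node := &api.Node{ObjectMeta: api.ObjectMeta{Labels: map[string]string{"disktype": "ssd", "zone": "us-central1-a"}}}
	fmt.Printf("selector matches: %v\n", PodMatchesNodeLabels(pod, node)) // true
}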

// PodFitsHost checks whether the pod is pinned to a specific node; a pod with no
// NodeName set fits any node.
func PodFitsHost(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
	if len(pod.Spec.NodeName) == 0 {
		return true, nil
	}
	return pod.Spec.NodeName == node, nil
}

// NodeLabelChecker checks nodes for the presence (or absence) of a set of labels.
type NodeLabelChecker struct {
	info     NodeInfo
	labels   []string
	presence bool
}

// NewNodeLabelPredicate returns a FitPredicate that requires the given labels to be
// present on (presence=true) or absent from (presence=false) a node.
func NewNodeLabelPredicate(info NodeInfo, labels []string, presence bool) algorithm.FitPredicate {
	labelChecker := &NodeLabelChecker{
		info:     info,
		labels:   labels,
		presence: presence,
	}
	return labelChecker.CheckNodeLabelPresence
}

// CheckNodeLabelPresence checks whether all of the specified labels exist on a minion, regardless of their value.
// If "presence" is false, it returns false if any of the requested labels is present on the minion,
// otherwise it returns true.
// If "presence" is true, it returns false if any of the requested labels is missing from the minion,
// otherwise it returns true.
//
// Consider the case where minions are placed in regions/zones/racks and these are identified by labels.
// In some cases, it is required that only minions that are part of ANY of the defined regions/zones/racks be selected.
//
// Alternately, eliminating minions that have a certain label, regardless of value, is also useful.
// A minion may have a label with "retiring" as the key and the date as the value,
// and it may be desirable to avoid scheduling new pods on this minion.
func (n *NodeLabelChecker) CheckNodeLabelPresence(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
	var exists bool
	minion, err := n.info.GetNodeInfo(node)
	if err != nil {
		return false, err
	}
	minionLabels := labels.Set(minion.Labels)
	for _, label := range n.labels {
		exists = minionLabels.Has(label)
		if (exists && !n.presence) || (!exists && n.presence) {
			return false, nil
		}
	}
	return true, nil
}
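
// exampleRetiringNodeFilter is an illustrative sketch, not part of the original file:
// with presence set to false, the returned predicate rejects any node that carries the
// hypothetical "retiring" label, whatever its value, matching the scenario described above.
func exampleRetiringNodeFilter(info NodeInfo) algorithm.FitPredicate {
	return NewNodeLabelPredicate(info, []string{"retiring"}, false)
}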

// ServiceAffinity groups the listers and labels needed to co-locate pods of the
// same service on minions that share the given label values.
type ServiceAffinity struct {
	podLister     algorithm.PodLister
	serviceLister algorithm.ServiceLister
	nodeInfo      NodeInfo
	labels        []string
}

// NewServiceAffinityPredicate returns a FitPredicate that implements service affinity
// over the given set of labels.
func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister algorithm.ServiceLister, nodeInfo NodeInfo, labels []string) algorithm.FitPredicate {
	affinity := &ServiceAffinity{
		podLister:     podLister,
		serviceLister: serviceLister,
		nodeInfo:      nodeInfo,
		labels:        labels,
	}
	return affinity.CheckServiceAffinity
}

// CheckServiceAffinity ensures that only minions that match the specified labels are considered for scheduling.
// The set of labels to be considered is provided to the struct (ServiceAffinity).
// The pod is checked for the labels, and any missing labels are then looked up on the minion
// that hosts the service pods (peers) for the given pod.
//
// We add an implicit selector requiring some particular value V for label L to a pod, if:
// - L is listed in the ServiceAffinity object that is passed into the function
// - the pod does not have any NodeSelector for L
// - some other pod from the same service is already scheduled onto a minion that has value V for label L
func (s *ServiceAffinity) CheckServiceAffinity(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
	var affinitySelector labels.Selector

	// check if the pod being scheduled has the affinity labels specified in its NodeSelector
	affinityLabels := map[string]string{}
	nodeSelector := labels.Set(pod.Spec.NodeSelector)
	labelsExist := true
	for _, l := range s.labels {
		if nodeSelector.Has(l) {
			affinityLabels[l] = nodeSelector.Get(l)
		} else {
			// the current pod does not specify all the labels, look in the existing service pods
			labelsExist = false
		}
	}

	// skip looking at other pods in the service if the current pod defines all the required affinity labels
	if !labelsExist {
		services, err := s.serviceLister.GetPodServices(pod)
		if err == nil {
			// just use the first service and get the other pods within the service
			// TODO: a separate predicate can be created that tries to handle all services for the pod
			selector := labels.SelectorFromSet(services[0].Spec.Selector)
			servicePods, err := s.podLister.List(selector)
			if err != nil {
				return false, err
			}
			// consider only the pods that belong to the same namespace
			nsServicePods := []*api.Pod{}
			for _, nsPod := range servicePods {
				if nsPod.Namespace == pod.Namespace {
					nsServicePods = append(nsServicePods, nsPod)
				}
			}
			if len(nsServicePods) > 0 {
				// consider any service pod and fetch the minion it is hosted on
				otherMinion, err := s.nodeInfo.GetNodeInfo(nsServicePods[0].Spec.NodeName)
				if err != nil {
					return false, err
				}
				for _, l := range s.labels {
					// if the pod being scheduled has the label value specified, do not override it
					if _, exists := affinityLabels[l]; exists {
						continue
					}
					if labels.Set(otherMinion.Labels).Has(l) {
						affinityLabels[l] = labels.Set(otherMinion.Labels).Get(l)
					}
				}
			}
		}
	}

	// if there are no existing pods in the service, consider all minions
	if len(affinityLabels) == 0 {
		affinitySelector = labels.Everything()
	} else {
		affinitySelector = labels.Set(affinityLabels).AsSelector()
	}

	minion, err := s.nodeInfo.GetNodeInfo(node)
	if err != nil {
		return false, err
	}

	// check if the minion matches the selector
	return affinitySelector.Matches(labels.Set(minion.Labels)), nil
}
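
// exampleZoneAffinity is an illustrative sketch, not part of the original file: with the
// hypothetical "zone" label, pods of the same service are only scheduled onto minions whose
// "zone" value matches that of the minion hosting an already-scheduled peer.
func exampleZoneAffinity(pods algorithm.PodLister, services algorithm.ServiceLister, info NodeInfo) algorithm.FitPredicate {
	return NewServiceAffinityPredicate(pods, services, info, []string{"zone"})
}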

// PodFitsPorts checks that none of the host ports requested by the pod are already
// in use by pods running on the node.
func PodFitsPorts(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
	existingPorts := getUsedPorts(existingPods...)
	wantPorts := getUsedPorts(pod)
	for wport := range wantPorts {
		if wport == 0 {
			continue
		}
		if existingPorts[wport] {
			return false, nil
		}
	}
	return true, nil
}

// getUsedPorts collects the host ports requested by the containers of the given pods.
func getUsedPorts(pods ...*api.Pod) map[int]bool {
	ports := make(map[int]bool)
	for _, pod := range pods {
		for _, container := range pod.Spec.Containers {
			for _, podPort := range container.Ports {
				ports[podPort.HostPort] = true
			}
		}
	}
	return ports
}
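
// examplePodFitsPorts is an illustrative sketch, not part of the original file:
// both pods ask for host port 8080, so the candidate does not fit next to the
// existing pod. The port number is arbitrary, and the api.Container /
// api.ContainerPort literal names are assumptions about the api package.
func examplePodFitsPorts() {
	withPort := api.PodSpec{Containers: []api.Container{
		{Ports: []api.ContainerPort{{HostPort: 8080}}},
	}}
	existing := &api.Pod{Spec: withPort}
	candidate := &api.Pod{Spec: withPort}
	fits, _ := PodFitsPorts(candidate, []*api.Pod{existing}, "node-1")
	fmt.Printf("pod fits: %v\n", fits) // false: host port 8080 is already taken
}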

// filterNonRunningPods drops pods that have already terminated (phase Succeeded or
// Failed), since they no longer consume node resources.
func filterNonRunningPods(pods []*api.Pod) []*api.Pod {
	if len(pods) == 0 {
		return pods
	}
	result := []*api.Pod{}
	for _, pod := range pods {
		if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed {
			continue
		}
		result = append(result, pod)
	}
	return result
}

// MapPodsToMachines obtains a list of pods and pivots that list into a map where the keys are host names
// and the values are the list of pods running on that host.
func MapPodsToMachines(lister algorithm.PodLister) (map[string][]*api.Pod, error) {
	machineToPods := map[string][]*api.Pod{}
	// TODO: perform more targeted query...
	pods, err := lister.List(labels.Everything())
	if err != nil {
		return map[string][]*api.Pod{}, err
	}
	pods = filterNonRunningPods(pods)
	for _, scheduledPod := range pods {
		host := scheduledPod.Spec.NodeName
		machineToPods[host] = append(machineToPods[host], scheduledPod)
	}
	return machineToPods, nil
}
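
// exampleMapPodsToMachines is an illustrative sketch, not part of the original file:
// a scheduler typically builds the host-to-pods map once per scheduling cycle and
// passes each node's slice to the fit predicates as existingPods. The node name is
// hypothetical.
func exampleMapPodsToMachines(lister algorithm.PodLister, pod *api.Pod) {
	machineToPods, err := MapPodsToMachines(lister)
	if err != nil {
		glog.Errorf("listing pods failed: %v", err)
		return
	}
	fits, err := PodFitsHost(pod, machineToPods["node-1"], "node-1")
	if err != nil {
		glog.Errorf("predicate failed: %v", err)
		return
	}
	glog.Infof("pod %s fits on node-1: %v", pod.Name, fits)
}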