forked from kubernetes/kubernetes
-
Notifications
You must be signed in to change notification settings - Fork 1
/
scalejob.go
161 lines (143 loc) · 5.47 KB
/
scalejob.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scale
import (
"fmt"
"strconv"
"time"
batch "k8s.io/api/batch/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
batchclient "k8s.io/client-go/kubernetes/typed/batch/v1"
)
// ScalePrecondition is a deprecated precondition
type ScalePrecondition struct {
Size int
ResourceVersion string
}
// RetryParams is a deprecated retry struct
type RetryParams struct {
Interval, Timeout time.Duration
}
// PreconditionError is a deprecated error
type PreconditionError struct {
Precondition string
ExpectedValue string
ActualValue string
}
func (pe PreconditionError) Error() string {
return fmt.Sprintf("Expected %s to be %s, was %s", pe.Precondition, pe.ExpectedValue, pe.ActualValue)
}
// ScaleCondition is a closure around Scale that facilitates retries via util.wait
func scaleCondition(r *JobPsuedoScaler, precondition *ScalePrecondition, namespace, name string, count uint, updatedResourceVersion *string) wait.ConditionFunc {
return func() (bool, error) {
rv, err := r.ScaleSimple(namespace, name, precondition, count)
if updatedResourceVersion != nil {
*updatedResourceVersion = rv
}
// Retry only on update conflicts.
if errors.IsConflict(err) {
return false, nil
}
if err != nil {
return false, err
}
return true, nil
}
}
// JobPsuedoScaler is a deprecated scale-similar thing that doesn't obey scale semantics
type JobPsuedoScaler struct {
JobsClient batchclient.JobsGetter
}
// ScaleSimple is responsible for updating job's parallelism. It returns the
// resourceVersion of the job if the update is successful.
func (scaler *JobPsuedoScaler) ScaleSimple(namespace, name string, preconditions *ScalePrecondition, newSize uint) (string, error) {
job, err := scaler.JobsClient.Jobs(namespace).Get(name, metav1.GetOptions{})
if err != nil {
return "", err
}
if preconditions != nil {
if err := validateJob(job, preconditions); err != nil {
return "", err
}
}
parallelism := int32(newSize)
job.Spec.Parallelism = ¶llelism
updatedJob, err := scaler.JobsClient.Jobs(namespace).Update(job)
if err != nil {
return "", err
}
return updatedJob.ObjectMeta.ResourceVersion, nil
}
// Scale updates a Job to a new size, with optional precondition check (if preconditions is not nil),
// optional retries (if retry is not nil), and then optionally waits for parallelism to reach desired
// number, which can be less than requested based on job's current progress.
func (scaler *JobPsuedoScaler) Scale(namespace, name string, newSize uint, preconditions *ScalePrecondition, retry, waitForReplicas *RetryParams) error {
if preconditions == nil {
preconditions = &ScalePrecondition{-1, ""}
}
if retry == nil {
// Make it try only once, immediately
retry = &RetryParams{Interval: time.Millisecond, Timeout: time.Millisecond}
}
cond := scaleCondition(scaler, preconditions, namespace, name, newSize, nil)
if err := wait.PollImmediate(retry.Interval, retry.Timeout, cond); err != nil {
return err
}
if waitForReplicas != nil {
job, err := scaler.JobsClient.Jobs(namespace).Get(name, metav1.GetOptions{})
if err != nil {
return err
}
err = wait.PollImmediate(waitForReplicas.Interval, waitForReplicas.Timeout, jobHasDesiredParallelism(scaler.JobsClient, job))
if err == wait.ErrWaitTimeout {
return fmt.Errorf("timed out waiting for %q to be synced", name)
}
return err
}
return nil
}
// JobHasDesiredParallelism returns a condition that will be true if the desired parallelism count
// for a job equals the current active counts or is less by an appropriate successful/unsuccessful count.
func jobHasDesiredParallelism(jobClient batchclient.JobsGetter, job *batch.Job) wait.ConditionFunc {
return func() (bool, error) {
job, err := jobClient.Jobs(job.Namespace).Get(job.Name, metav1.GetOptions{})
if err != nil {
return false, err
}
// desired parallelism can be either the exact number, in which case return immediately
if job.Status.Active == *job.Spec.Parallelism {
return true, nil
}
if job.Spec.Completions == nil {
// A job without specified completions needs to wait for Active to reach Parallelism.
return false, nil
}
// otherwise count successful
progress := *job.Spec.Completions - job.Status.Active - job.Status.Succeeded
return progress <= 0, nil
}
}
func validateJob(job *batch.Job, precondition *ScalePrecondition) error {
if precondition.Size != -1 && job.Spec.Parallelism == nil {
return PreconditionError{"parallelism", strconv.Itoa(precondition.Size), "nil"}
}
if precondition.Size != -1 && int(*job.Spec.Parallelism) != precondition.Size {
return PreconditionError{"parallelism", strconv.Itoa(precondition.Size), strconv.Itoa(int(*job.Spec.Parallelism))}
}
if len(precondition.ResourceVersion) != 0 && job.ResourceVersion != precondition.ResourceVersion {
return PreconditionError{"resource version", precondition.ResourceVersion, job.ResourceVersion}
}
return nil
}