-
Notifications
You must be signed in to change notification settings - Fork 14
/
config.go
237 lines (192 loc) · 8.14 KB
/
config.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
package plugin
import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"os"

	"golang.org/x/exp/slices"

	"k8s.io/apimachinery/pkg/api/resource"

	vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
	"github.com/neondatabase/autoscaling/pkg/api"
)
//////////////////
// CONFIG TYPES //
//////////////////
// Config is the top-level configuration for the scheduler plugin, parsed from the JSON config
// file (see ReadConfig) and validated by (*Config).validate().
type Config struct {
	// NodeConfig defines our policies around node resources and scoring
	NodeConfig nodeConfig `json:"nodeConfig"`
	// SchedulerName informs the scheduler of its name, so that it can identify pods that a previous
	// version handled.
	SchedulerName string `json:"schedulerName"`
	// EventQueueWorkers sets the number of worker threads responsible for handling items from the
	// event queue.
	EventQueueWorkers int `json:"eventQueueWorkers"`
	// RandomizeScores, if true, will cause the scheduler to score a node with a random number in
	// the range [minScore + 1, trueScore], instead of the trueScore
	RandomizeScores bool `json:"randomizeScores"`
	// MigrationDeletionRetrySeconds gives the duration, in seconds, we should wait between retrying
	// a failed attempt to delete a VirtualMachineMigration that's finished.
	MigrationDeletionRetrySeconds uint `json:"migrationDeletionRetrySeconds"`
	// DoMigration, if provided, allows VM migration to be disabled
	//
	// This flag is intended to be temporary, just until NeonVM supports migrations and we can
	// re-enable it.
	DoMigration *bool `json:"doMigration"`
	// K8sNodeGroupLabel, if provided, gives the label to use when recording k8s node groups in the
	// metrics (like for autoscaling_plugin_node_{cpu,mem}_resources_current)
	K8sNodeGroupLabel string `json:"k8sNodeGroupLabel"`
	// K8sAvailabilityZoneLabel, if provided, gives the label to use when recording nodes'
	// availability zones in the metrics (like for autoscaling_plugin_node_{cpu,mem}_resources_current)
	K8sAvailabilityZoneLabel string `json:"k8sAvailabilityZoneLabel"`
	// IgnoreNamespaces, if provided, gives a list of namespaces that the plugin should completely
	// ignore, as if pods from those namespaces do not exist.
	//
	// This is specifically designed for our "overprovisioning" namespace, which creates paused pods
	// to trigger cluster-autoscaler.
	//
	// The only exception to this rule is during Filter method calls, where we do still count the
	// resources from such pods. The reason to do that is so that these overprovisioning pods can be
	// evicted, which will allow cluster-autoscaler to trigger scale-up.
	IgnoreNamespaces []string `json:"ignoreNamespaces"`
	// DumpState, if provided, enables a server to dump internal state
	DumpState *dumpStateConfig `json:"dumpState"`
	// JSONString is the JSON string that was used to generate this config struct
	JSONString string `json:"-"`
}
// nodeConfig defines the per-resource limits (CPU and memory) and the scoring curve used when
// ranking nodes.
type nodeConfig struct {
	Cpu    resourceConfig `json:"cpu"`
	Memory resourceConfig `json:"memory"`

	// Details about node scoring:
	// See also: https://www.desmos.com/calculator/wg8s0yn63s
	// In the desmos, the value f(x,s) gives the score (from 0 to 1) of a node that's x amount full
	// (where x is a fraction from 0 to 1), with a total size that is equal to the maximum size node
	// times s (i.e. s (or: "scale") gives the ratio between this node's size and the biggest one).

	// MinUsageScore gives the ratio of the score at the minimum usage (i.e. 0) relative to the
	// score at the midpoint, which will have the maximum.
	//
	// This corresponds to y₀ in the desmos link above.
	MinUsageScore float64 `json:"minUsageScore"`
	// MaxUsageScore gives the ratio of the score at the maximum usage (i.e. full) relative to the
	// score at the midpoint, which will have the maximum.
	//
	// This corresponds to y₁ in the desmos link above.
	MaxUsageScore float64 `json:"maxUsageScore"`
	// ScorePeak gives the fraction at which the "target" or highest score should be, with the score
	// sloping down on either side towards MinUsageScore at 0 and MaxUsageScore at 1.
	//
	// This corresponds to xₚ in the desmos link.
	ScorePeak float64 `json:"scorePeak"`
}
// resourceConfig configures the amount of a particular resource we're willing to allocate to VMs,
// both the soft limit (Watermark) and the hard limit (via System)
type resourceConfig struct {
	// Watermark is the fraction of non-system resource allocation above which we should be
	// migrating VMs away to reduce usage
	//
	// Valid values are in the half-open interval (0, 1]; see (*resourceConfig).validate().
	//
	// If empty, the watermark is set as equal to the "hard" limit from system resources.
	//
	// The word "watermark" was originally used by @zoete as a temporary stand-in term during a
	// meeting, and so it has intentionally been made permanent to spite the concept of "temporary" 😛
	Watermark float32 `json:"watermark,omitempty"`
}
// migrationEnabled reports whether VM migration should be performed. Migration defaults to
// enabled when the DoMigration flag is absent from the config.
func (c *Config) migrationEnabled() bool {
	if c.DoMigration != nil {
		return *c.DoMigration
	}
	return true
}
///////////////////////
// CONFIG VALIDATION //
///////////////////////
// if the returned error is not nil, the string is a JSON path to the invalid value
func (c *Config) validate() (string, error) {
	// Note: checks run in a fixed order; the first failure determines the reported path.
	if subPath, subErr := c.NodeConfig.validate(); subErr != nil {
		return "nodeConfig." + subPath, subErr
	}
	if c.SchedulerName == "" {
		return "schedulerName", errors.New("string cannot be empty")
	}
	if c.EventQueueWorkers <= 0 {
		return "eventQueueWorkers", errors.New("value must be > 0")
	}
	if ds := c.DumpState; ds != nil {
		if subPath, subErr := ds.validate(); subErr != nil {
			return "dumpState." + subPath, subErr
		}
	}
	if c.MigrationDeletionRetrySeconds == 0 {
		return "migrationDeletionRetrySeconds", errors.New("value must be > 0")
	}
	return "", nil
}
// validate checks the nodeConfig for invalid values, returning the JSON path of the first bad
// field alongside the error.
func (c *nodeConfig) validate() (string, error) {
	if subPath, subErr := c.Cpu.validate(); subErr != nil {
		return "cpu." + subPath, subErr
	}
	if subPath, subErr := c.Memory.validate(); subErr != nil {
		return "memory." + subPath, subErr
	}

	// Each scoring parameter must lie in the closed interval [0, 1].
	inUnitRange := func(v float64) bool { return 0 <= v && v <= 1 }
	switch {
	case !inUnitRange(c.MinUsageScore):
		return "minUsageScore", errors.New("value must be between 0 and 1, inclusive")
	case !inUnitRange(c.MaxUsageScore):
		return "maxUsageScore", errors.New("value must be between 0 and 1, inclusive")
	case !inUnitRange(c.ScorePeak):
		return "scorePeak", errors.New("value must be between 0 and 1, inclusive")
	}
	return "", nil
}
// validate checks that the Watermark fraction lies in (0, 1], returning the JSON path of the
// offending field alongside the error otherwise.
func (c *resourceConfig) validate() (string, error) {
	switch {
	case c.Watermark <= 0.0:
		return "watermark", errors.New("value must be > 0")
	case c.Watermark > 1.0:
		return "watermark", errors.New("value must be <= 1")
	default:
		return "", nil
	}
}
////////////////////
// CONFIG READING //
////////////////////

// DefaultConfigPath is the default filesystem path of the scheduler plugin's JSON config file
// (presumably mounted into the container — confirm against the deployment manifests).
const DefaultConfigPath = "/etc/scheduler-plugin-config/autoscale-enforcer-config.json"
// ReadConfig reads the JSON config file at the given path, decodes it into a Config (rejecting
// unknown fields so typos fail loudly), stores the raw JSON in Config.JSONString, and validates
// the result.
//
// On validation failure, the error message includes the JSON path of the invalid value.
func ReadConfig(path string) (*Config, error) {
	// Read the whole file up front so the raw bytes can be both decoded and preserved in
	// Config.JSONString, as that field's documentation promises.
	contents, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("error reading config file %q: %w", path, err)
	}

	var config Config
	jsonDecoder := json.NewDecoder(bytes.NewReader(contents))
	jsonDecoder.DisallowUnknownFields()
	if err := jsonDecoder.Decode(&config); err != nil {
		return nil, fmt.Errorf("error decoding JSON config in %q: %w", path, err)
	}
	config.JSONString = string(contents)

	// On failure, validate() returns the JSON path of the invalid value.
	if path, err := config.validate(); err != nil {
		return nil, fmt.Errorf("invalid config at %s: %w", path, err)
	}
	return &config, nil
}
//////////////////////////////////////
// HELPER METHODS FOR USING CONFIGS //
//////////////////////////////////////
// ignoredNamespace returns whether items in the namespace should be treated as if they don't exist
func (c *Config) ignoredNamespace(namespace string) bool {
	for _, ignored := range c.IgnoreNamespaces {
		if ignored == namespace {
			return true
		}
	}
	return false
}
// vCpuLimits builds the initial CPU resource state for a node from its total CPU quantity,
// setting the watermark to the configured fraction of the total. All usage-tracking fields
// start at zero.
func (c *nodeConfig) vCpuLimits(total *resource.Quantity) nodeResourceState[vmapi.MilliCPU] {
	totalMilli := total.MilliValue()
	return nodeResourceState[vmapi.MilliCPU]{
		Total: vmapi.MilliCPU(totalMilli),
		// Computed in float32 and truncated to a whole number of milli-CPUs.
		Watermark:            vmapi.MilliCPU(c.Cpu.Watermark * float32(totalMilli)),
		Reserved:             0,
		Buffer:               0,
		CapacityPressure:     0,
		PressureAccountedFor: 0,
	}
}
// memoryLimits builds the initial memory resource state for a node from its total memory
// quantity, setting the watermark to the configured fraction of the total. All usage-tracking
// fields start at zero.
func (c *nodeConfig) memoryLimits(total *resource.Quantity) nodeResourceState[api.Bytes] {
	totalBytes := total.Value()
	return nodeResourceState[api.Bytes]{
		Total: api.Bytes(totalBytes),
		// Computed in float32 and truncated to a whole number of bytes; float32 precision
		// may round the watermark slightly for very large totals.
		Watermark:            api.Bytes(c.Memory.Watermark * float32(totalBytes)),
		Reserved:             0,
		Buffer:               0,
		CapacityPressure:     0,
		PressureAccountedFor: 0,
	}
}