/
status.go
245 lines (204 loc) · 9.52 KB
/
status.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
/*
Copyright 2022 Red Hat, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controlplanemachineset
import (
"context"
"fmt"
"github.com/go-logr/logr"
machinev1 "github.com/openshift/api/machine/v1"
"github.com/openshift/cluster-control-plane-machine-set-operator/pkg/machineproviders"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)
const (
	// maxContinuousErrors is how many identical errors must occur back to back before an error condition is
	// reported on the ControlPlaneMachineSet status.
	//
	// Rationale for 15: the default backoff is 5ms*2^x, with x being the count of consecutive errors. The time
	// between the 1st and the 15th error (14 errors) would therefore be roughly 2x5ms*2^14 = 163840ms, i.e. about
	// 2.7 minutes. Because the status is updated on the first error, that update is itself an extra event, so
	// only 13 errors are really backed off. The effective time between the 1st and the 15th error is therefore
	// roughly 2x5ms*2^13 = 81920ms, i.e. about 1.4 minutes.
	maxContinuousErrors = 15

	// updatingStatus is the log message emitted when the ControlPlaneMachineSet status is written.
	updatingStatus = "Updating control plane machine set status"

	// notUpdatingStatus is the log message emitted when the ControlPlaneMachineSet status needs no write.
	notUpdatingStatus = "No update to control plane machine set status required"
)
// updateControlPlaneMachineSetStatus writes the status of the ControlPlaneMachineSet back to the API once
// reconciliation has finished.
//
// The patch data derived from patchBase is used only to decide whether anything changed and as log context:
// when it is the empty JSON object ("{}") nothing is written. Otherwise the status is sent with a status
// Update call (not a Patch call) and the patch data is logged afterwards.
//
// An error is returned when the patch data cannot be computed or when the status update fails.
func (r *ControlPlaneMachineSetReconciler) updateControlPlaneMachineSetStatus(ctx context.Context, logger logr.Logger, cpms *machinev1.ControlPlaneMachineSet, patchBase client.Patch) error {
	rawPatch, err := patchBase.Data(cpms)
	if err != nil {
		return fmt.Errorf("cannot calculate patch data from control plane machine set object: %w", err)
	}

	patch := string(rawPatch)
	verbose := logger.V(3)

	// An empty patch means the status already matches; skip the write.
	if patch == "{}" {
		verbose.Info(notUpdatingStatus)

		return nil
	}

	err = r.Status().Update(ctx, cpms)
	if err != nil {
		return fmt.Errorf("failed to sync status for control plane machine set object: %w", err)
	}

	verbose.Info(updatingStatus, "data", patch)

	return nil
}
// reconcileStatusWithMachineInfo recomputes the status of the ControlPlaneMachineSet from the machine
// information supplied per index, then refreshes the conditions via setConditions.
//
// ObservedGeneration is set to the current Generation and the replica counters are derived as follows:
//   - Replicas: every machine present, across all indexes.
//   - ReadyReplicas: the subset of Replicas reporting Ready.
//   - UpdatedReplicas: the subset of ReadyReplicas that do not need an update.
//   - UnavailableReplicas: the number of indexes without a single Ready machine. An index with no machines
//     at all counts as one, and an index with several Ready machines never counts more than once towards
//     availability.
//
// The observed values are logged, and any error from setting the conditions is wrapped and returned.
func reconcileStatusWithMachineInfo(logger logr.Logger, cpms *machinev1.ControlPlaneMachineSet, machineInfosByIndex map[int32][]machineproviders.MachineInfo) error {
	var total, ready, updated, unavailable int32

	for _, infos := range machineInfosByIndex {
		indexHasReady := false

		for _, info := range infos {
			total++

			if !info.Ready {
				continue
			}

			ready++
			indexHasReady = true

			if !info.NeedsUpdate {
				updated++
			}
		}

		// An index is unavailable when it has no machines or when none of its machines are Ready;
		// both cases leave indexHasReady false.
		if !indexHasReady {
			unavailable++
		}
	}

	status := &cpms.Status
	status.ObservedGeneration = cpms.Generation
	status.Replicas = total
	status.ReadyReplicas = ready
	status.UnavailableReplicas = unavailable
	status.UpdatedReplicas = updated

	logger.Info("Observed Machine Configuration",
		"observedGeneration", status.ObservedGeneration,
		"replicas", status.Replicas,
		"readyReplicas", status.ReadyReplicas,
		"updatedReplicas", status.UpdatedReplicas,
		"unavailableReplicas", status.UnavailableReplicas,
	)

	if err := setConditions(cpms); err != nil {
		return fmt.Errorf("could not set control plane machine set conditions: %w", err)
	}

	return nil
}
// setConditions refreshes the Available, Degraded and Progressing conditions on the ControlPlaneMachineSet,
// in that order, from its current spec and status.
//
// Available and Degraded are written before Progressing is computed, so they remain set on the object even
// when computing the Progressing condition fails and an error is returned.
func setConditions(cpms *machinev1.ControlPlaneMachineSet) error {
	conditions := &cpms.Status.Conditions

	meta.SetStatusCondition(conditions, getAvailableCondition(cpms))
	meta.SetStatusCondition(conditions, getDegradedCondition(cpms))

	progressing, err := getProgressingCondition(cpms)
	if err != nil {
		return fmt.Errorf("could not set progressing condition: %w", err)
	}

	meta.SetStatusCondition(conditions, progressing)

	return nil
}
// getAvailableCondition computes the Available condition from the current ControlPlaneMachineSet status.
// The condition is True when Status.UnavailableReplicas is zero; otherwise it is False and the message
// reports how many available replicas are missing.
func getAvailableCondition(cpms *machinev1.ControlPlaneMachineSet) metav1.Condition {
	available := metav1.Condition{
		Type:               conditionAvailable,
		Status:             metav1.ConditionTrue,
		Reason:             reasonAllReplicasAvailable,
		ObservedGeneration: cpms.Generation,
	}

	if missing := cpms.Status.UnavailableReplicas; missing != 0 {
		available.Status = metav1.ConditionFalse
		available.Reason = reasonUnavailableReplicas
		available.Message = fmt.Sprintf("Missing %d available replica(s)", missing)
	}

	return available
}
// getDegradedCondition computes the Degraded condition from the current ControlPlaneMachineSet status.
// The condition is True only when Status.ReadyReplicas is zero; in every other case it is False.
func getDegradedCondition(cpms *machinev1.ControlPlaneMachineSet) metav1.Condition {
	degraded := metav1.Condition{
		Type:               conditionDegraded,
		Status:             metav1.ConditionFalse,
		Reason:             reasonAsExpected,
		ObservedGeneration: cpms.Generation,
	}

	if cpms.Status.ReadyReplicas == 0 {
		degraded.Status = metav1.ConditionTrue
		degraded.Reason = reasonNoReadyMachines
	}

	return degraded
}
// getErrorCondition builds the Error condition from the tracked last error.
// The condition is True once the tracker's count has reached maxContinuousErrors, in which case the message
// carries the time and the text of the tracked error. With a nil tracker, or a count below the limit, the
// condition is False.
func getErrorCondition(cpms *machinev1.ControlPlaneMachineSet, lastError *lastErrorTracker) metav1.Condition {
	limitReached := lastError != nil && lastError.count >= maxContinuousErrors

	if limitReached {
		return metav1.Condition{
			Type:               conditionError,
			Status:             metav1.ConditionTrue,
			Reason:             reasonContinuousErrors,
			Message:            fmt.Sprintf("The control plane machine set has experienced the following error more than %d consecutive times since %s: %v", maxContinuousErrors, lastError.lastErrorTime, lastError.lastError),
			ObservedGeneration: cpms.Generation,
		}
	}

	return metav1.Condition{
		Type:               conditionError,
		Status:             metav1.ConditionFalse,
		Reason:             reasonAsExpected,
		ObservedGeneration: cpms.Generation,
	}
}
// getProgressingCondition computes the Progressing condition from the current ControlPlaneMachineSet spec
// and status.
//
// The cases are checked in this order:
//   - Spec.Replicas is nil: an empty condition and errReplicasRequired are returned.
//   - The desired replicas exceed Status.UpdatedReplicas: Progressing is True because replicas need an update.
//   - The desired replicas are fewer than Status.ReadyReplicas: Progressing is True while excess replicas
//     are waiting to be removed.
//   - Otherwise: Progressing is False.
func getProgressingCondition(cpms *machinev1.ControlPlaneMachineSet) (metav1.Condition, error) {
	if cpms.Spec.Replicas == nil {
		return metav1.Condition{}, errReplicasRequired
	}

	desired := *cpms.Spec.Replicas
	ready := cpms.Status.ReadyReplicas
	updated := cpms.Status.UpdatedReplicas

	progressing := metav1.Condition{
		Type:               conditionProgressing,
		ObservedGeneration: cpms.Generation,
	}

	switch {
	case desired > updated:
		progressing.Status = metav1.ConditionTrue
		progressing.Reason = reasonNeedsUpdateReplicas
		progressing.Message = fmt.Sprintf("Observed %d replica(s) in need of update", desired-updated)
	case desired < ready:
		progressing.Status = metav1.ConditionTrue
		progressing.Reason = reasonExcessReplicas
		progressing.Message = fmt.Sprintf("Waiting for %d old replica(s) to be removed", ready-updated)
	default:
		progressing.Status = metav1.ConditionFalse
		progressing.Reason = reasonAllReplicasUpdated
	}

	return progressing, nil
}