-
Notifications
You must be signed in to change notification settings - Fork 201
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OCPCLOUD-907] Add remediationsAllowed
field to MHC status
#652
Changes from all commits
1bcdf67
25edfeb
9da1a78
bdb06c3
5431a50
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
/* | ||
Copyright 2020 The Kubernetes Authors. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package v1beta1 | ||
|
||
// Conditions and condition Reasons for the MachineHealthCheck object | ||
|
||
const ( | ||
// RemediationAllowedCondition is set on MachineHealthChecks to show the status of whether the MachineHealthCheck is | ||
// allowed to remediate any Machines or whether it is blocked from remediating any further. | ||
RemediationAllowedCondition ConditionType = "RemediationAllowed" | ||
|
||
// TooManyUnhealthy is the reason used when too many Machines are unhealthy and the MachineHealthCheck is blocked | ||
// from making any further remediations. | ||
TooManyUnhealthyReason = "TooManyUnhealthy" | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
/* | ||
Copyright 2020 The Kubernetes Authors. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package v1beta1 | ||
|
||
import ( | ||
corev1 "k8s.io/api/core/v1" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
) | ||
|
||
// ConditionSeverity expresses the severity of a Condition Type failing. | ||
type ConditionSeverity string | ||
|
||
const ( | ||
// ConditionSeverityError specifies that a condition with `Status=False` is an error. | ||
ConditionSeverityError ConditionSeverity = "Error" | ||
|
||
// ConditionSeverityWarning specifies that a condition with `Status=False` is a warning. | ||
ConditionSeverityWarning ConditionSeverity = "Warning" | ||
|
||
// ConditionSeverityInfo specifies that a condition with `Status=False` is informative. | ||
ConditionSeverityInfo ConditionSeverity = "Info" | ||
|
||
// ConditionSeverityNone should apply only to conditions with `Status=True`. | ||
ConditionSeverityNone ConditionSeverity = "" | ||
) | ||
|
||
// ConditionType is a valid value for Condition.Type. | ||
type ConditionType string | ||
|
||
// Condition defines an observation of a Machine API resource operational state. | ||
type Condition struct { | ||
// Type of condition in CamelCase or in foo.example.com/CamelCase. | ||
// Many .condition.type values are consistent across resources like Available, but because arbitrary conditions | ||
// can be useful (see .node.status.conditions), the ability to deconflict is important. | ||
// +required | ||
Type ConditionType `json:"type"` | ||
|
||
// Status of the condition, one of True, False, Unknown. | ||
// +required | ||
Status corev1.ConditionStatus `json:"status"` | ||
|
||
// Severity provides an explicit classification of Reason code, so the users or machines can immediately | ||
// understand the current situation and act accordingly. | ||
// The Severity field MUST be set only when Status=False. | ||
// +optional | ||
Severity ConditionSeverity `json:"severity,omitempty"` | ||
|
||
// Last time the condition transitioned from one status to another. | ||
// This should be when the underlying condition changed. If that is not known, then using the time when | ||
// the API field changed is acceptable. | ||
// +required | ||
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` | ||
|
||
// The reason for the condition's last transition in CamelCase. | ||
// The specific API may choose whether or not this field is considered a guaranteed API. | ||
// This field may not be empty. | ||
// +optional | ||
Reason string `json:"reason,omitempty"` | ||
|
||
// A human readable message indicating details about the transition. | ||
// This field may be empty. | ||
// +optional | ||
Message string `json:"message,omitempty"` | ||
} | ||
|
||
// Conditions provide observations of the operational state of a Machine API resource. | ||
type Conditions []Condition |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -161,6 +161,9 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco | |
return reconcile.Result{}, err | ||
} | ||
|
||
// Create a base from which the MHC status patch will be calculated | ||
mergeBase := client.MergeFrom(mhc.DeepCopy()) | ||
|
||
// fetch all targets | ||
klog.V(3).Infof("Reconciling %s: finding targets", request.String()) | ||
targets, err := r.getTargetsFromMHC(*mhc) | ||
|
@@ -171,10 +174,9 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco | |
|
||
// health check all targets and reconcile mhc status | ||
currentHealthy, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(targets, mhc.Spec.NodeStartupTimeout.Duration) | ||
if err := r.reconcileStatus(mhc, totalTargets, currentHealthy); err != nil { | ||
klog.Errorf("Reconciling %s: error patching status: %v", request.String(), err) | ||
return reconcile.Result{}, err | ||
} | ||
mhc.Status.CurrentHealthy = ¤tHealthy | ||
mhc.Status.ExpectedMachines = &totalTargets | ||
unhealthyCount := totalTargets - currentHealthy | ||
|
||
// check MHC current health against MaxUnhealthy | ||
if !isAllowedRemediation(mhc) { | ||
|
@@ -184,23 +186,50 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco | |
mhc.Spec.MaxUnhealthy, | ||
totalTargets-currentHealthy, | ||
) | ||
|
||
message := fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)", | ||
totalTargets, | ||
unhealthyCount, | ||
mhc.Spec.MaxUnhealthy, | ||
) | ||
|
||
// Remediation not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy | ||
mhc.Status.RemediationsAllowed = 0 | ||
conditions.Set(mhc, &mapiv1.Condition{ | ||
Type: mapiv1.RemediationAllowedCondition, | ||
Status: corev1.ConditionFalse, | ||
Severity: mapiv1.ConditionSeverityWarning, | ||
Reason: mapiv1.TooManyUnhealthyReason, | ||
Message: message, | ||
}) | ||
|
||
if err := r.reconcileStatus(mergeBase, mhc); err != nil { | ||
klog.Errorf("Reconciling %s: error patching status: %v", request.String(), err) | ||
return reconcile.Result{}, err | ||
} | ||
|
||
r.recorder.Eventf( | ||
mhc, | ||
corev1.EventTypeWarning, | ||
EventRemediationRestricted, | ||
"Remediation restricted due to exceeded number of unhealthy machines (total: %v, unhealthy: %v, maxUnhealthy: %v)", | ||
totalTargets, | ||
totalTargets-currentHealthy, | ||
unhealthyCount, | ||
mhc.Spec.MaxUnhealthy, | ||
) | ||
return reconcile.Result{Requeue: true}, nil | ||
} | ||
klog.V(3).Infof("Reconciling %s: monitoring MHC: total targets: %v, maxUnhealthy: %v, unhealthy: %v. Remediations are allowed", | ||
klog.V(3).Infof("Remediations are allowed for %s: total targets: %v, max unhealthy: %v, unhealthy targets: %v", | ||
request.String(), | ||
totalTargets, | ||
mhc.Spec.MaxUnhealthy, | ||
totalTargets-currentHealthy, | ||
unhealthyCount, | ||
) | ||
conditions.MarkTrue(mhc, mapiv1.RemediationAllowedCondition) | ||
if err := r.reconcileStatus(mergeBase, mhc); err != nil { | ||
klog.Errorf("Reconciling %s: error patching status: %v", request.String(), err) | ||
return reconcile.Result{}, err | ||
} | ||
|
||
// remediate | ||
for _, t := range needRemediationTargets { | ||
|
@@ -228,22 +257,37 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco | |
} | ||
|
||
func isAllowedRemediation(mhc *mapiv1.MachineHealthCheck) bool { | ||
maxUnhealthy, err := getMaxUnhealthy(mhc) | ||
if err != nil { | ||
return false | ||
} | ||
|
||
// If unhealthy is above maxUnhealthy, short circuit any further remediation | ||
return unhealthyMachineCount(mhc) <= maxUnhealthy | ||
} | ||
|
||
func getMaxUnhealthy(mhc *mapiv1.MachineHealthCheck) (int, error) { | ||
if mhc.Spec.MaxUnhealthy == nil { | ||
return true | ||
// This value should be defaulted, but if not, 100% is the default | ||
return derefInt(mhc.Status.ExpectedMachines), nil | ||
} | ||
maxUnhealthy, err := getValueFromIntOrPercent(mhc.Spec.MaxUnhealthy, derefInt(mhc.Status.ExpectedMachines), false) | ||
if err != nil { | ||
klog.Errorf("%s: error decoding maxUnhealthy, remediation won't be allowed: %v", namespacedName(mhc), err) | ||
return false | ||
return 0, err | ||
} | ||
|
||
if maxUnhealthy < 0 { | ||
maxUnhealthy = 0 | ||
} | ||
|
||
// if noHealthy are above maxUnhealthy we short circuit any farther remediation | ||
noHealthy := derefInt(mhc.Status.ExpectedMachines) - derefInt(mhc.Status.CurrentHealthy) | ||
return (maxUnhealthy - noHealthy) >= 0 | ||
return maxUnhealthy, nil | ||
} | ||
|
||
// unhealthyMachineCount calculates the number of presently unhealthy or missing machines | ||
// ie the delta between the expected number of machines and the current number deemed healthy | ||
func unhealthyMachineCount(mhc *mapiv1.MachineHealthCheck) int { | ||
return derefInt(mhc.Status.ExpectedMachines) - derefInt(mhc.Status.CurrentHealthy) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it possible that either of these There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it will default to 0 if the deref fails (I think that's the point of the function), but no matter the behaviour, this hasn't actually changed from before this PR, just moved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok perfect. thanks! |
||
} | ||
|
||
func derefInt(i *int) int { | ||
|
@@ -253,10 +297,15 @@ func derefInt(i *int) int { | |
return 0 | ||
} | ||
|
||
func (r *ReconcileMachineHealthCheck) reconcileStatus(mhc *mapiv1.MachineHealthCheck, targets, currentHealthy int) error { | ||
baseToPatch := client.MergeFrom(mhc.DeepCopy()) | ||
mhc.Status.ExpectedMachines = &targets | ||
mhc.Status.CurrentHealthy = ¤tHealthy | ||
func (r *ReconcileMachineHealthCheck) reconcileStatus(baseToPatch client.Patch, mhc *mapiv1.MachineHealthCheck) error { | ||
maxUnhealthy, err := getMaxUnhealthy(mhc) | ||
if err != nil { | ||
return fmt.Errorf("failed to get value for maxUnhealthy: %v", err) | ||
} | ||
mhc.Status.RemediationsAllowed = int32(maxUnhealthy - unhealthyMachineCount(mhc)) | ||
if mhc.Status.RemediationsAllowed < 0 { | ||
mhc.Status.RemediationsAllowed = 0 | ||
} | ||
|
||
if err := r.client.Status().Patch(context.Background(), mhc, baseToPatch); err != nil { | ||
return err | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
does this get updated on the object in etcd at some point?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch, I've moved the reconcile status so it happens for all code paths, ideally I would spend time to do a bigger refactor of this logic so it's more obvious what's going on here
I will make sure to update the tests before we merge this