Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ spec:
- type: integer
description: Any farther remediation is only allowed if at most "MaxUnhealthy"
machines selected by "selector" are not healthy.
nodeStartupTimeout:
description: Machines older than this duration without a node will be
considered to have failed and will be remediated.
type: string
selector:
description: Label selector to match machines whose health will be exercised
properties:
Expand Down
12 changes: 12 additions & 0 deletions pkg/apis/machine/v1beta1/machinehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,18 @@ type MachineHealthCheckSpec struct {
// "selector" are not healthy.
// +optional
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`

// It would be preferable for nodeStartupTimeout to be a metav1.Duration, but
// there's no good way to validate the format here. Invalid input would cause
// problems with marshaling, so it's better to just make it a string and
// handle the conversion in the controller.
//
// Intentional blank line to keep this out of the OpenAPI description...

// Machines older than this duration without a node will be considered to have
// failed and will be remediated.
// +optional
NodeStartupTimeout string `json:"nodeStartupTimeout,omitempty"`
}

// UnhealthyCondition represents a Node condition type and value with a timeout
Expand Down
28 changes: 23 additions & 5 deletions pkg/controller/machinehealthcheck/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ const (
machinePhaseFailed = "Failed"
remediationStrategyAnnotation = "machine.openshift.io/remediation-strategy"
remediationStrategyExternal = mapiv1.RemediationStrategyType("external-baremetal")
timeoutForMachineToHaveNode = 10 * time.Minute
defaultNodeStartupTimeout = 10 * time.Minute
machineNodeNameIndex = "machineNodeNameIndex"
controllerName = "machinehealthcheck-controller"

Expand Down Expand Up @@ -166,8 +166,14 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco
}
totalTargets := len(targets)

nodeStartupTimeout, err := getNodeStartupTimeout(mhc)
if err != nil {
glog.Errorf("Reconciling %s: error getting NodeStartupTimeout: %v", request.String(), err)
return reconcile.Result{}, err
}

// health check all targets and reconcile mhc status
currentHealthy, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(targets)
currentHealthy, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(targets, nodeStartupTimeout)
if err := r.reconcileStatus(mhc, totalTargets, currentHealthy); err != nil {
glog.Errorf("Reconciling %s: error patching status: %v", request.String(), err)
return reconcile.Result{}, err
Expand Down Expand Up @@ -259,14 +265,14 @@ func (r *ReconcileMachineHealthCheck) reconcileStatus(mhc *mapiv1.MachineHealthC

// healthCheckTargets health checks a slice of targets
// and gives a data to measure the average health
func (r *ReconcileMachineHealthCheck) healthCheckTargets(targets []target) (int, []target, []time.Duration, []error) {
func (r *ReconcileMachineHealthCheck) healthCheckTargets(targets []target, timeoutForMachineToHaveNode time.Duration) (int, []target, []time.Duration, []error) {
var nextCheckTimes []time.Duration
var errList []error
var needRemediationTargets []target
var currentHealthy int
for _, t := range targets {
glog.V(3).Infof("Reconciling %s: health checking", t.string())
needsRemediation, nextCheck, err := t.needsRemediation()
needsRemediation, nextCheck, err := t.needsRemediation(timeoutForMachineToHaveNode)
if err != nil {
glog.Errorf("Reconciling %s: error health checking: %v", t.string(), err)
errList = append(errList, err)
Expand Down Expand Up @@ -549,7 +555,7 @@ func (t *target) isMaster() bool {
return false
}

func (t *target) needsRemediation() (bool, time.Duration, error) {
func (t *target) needsRemediation(timeoutForMachineToHaveNode time.Duration) (bool, time.Duration, error) {
var nextCheckTimes []time.Duration
now := time.Now()

Expand Down Expand Up @@ -673,3 +679,15 @@ func hasMatchingLabels(machineHealthCheck *mapiv1.MachineHealthCheck, machine *m
}
return true
}

func getNodeStartupTimeout(mhc *mapiv1.MachineHealthCheck) (time.Duration, error) {
if mhc.Spec.NodeStartupTimeout == "" {
return defaultNodeStartupTimeout, nil
}

timeout, err := time.ParseDuration(mhc.Spec.NodeStartupTimeout)
if err != nil {
return time.Duration(0), fmt.Errorf("error parsing NodeStartupTimeout: %v", err)
}
return timeout, nil
}
147 changes: 103 additions & 44 deletions pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ func TestReconcile(t *testing.T) {
machineWithoutNodeRef.Status.NodeRef = nil

machineHealthCheck := maotesting.NewMachineHealthCheck("machineHealthCheck")
nodeStartupTimeout := 15 * time.Minute
machineHealthCheck.Spec.NodeStartupTimeout = nodeStartupTimeout.String()

// remediationExternal
nodeUnhealthyForTooLong := maotesting.NewNode("nodeUnhealthyForTooLong", false)
Expand Down Expand Up @@ -280,7 +282,7 @@ func TestReconcile(t *testing.T) {
node: nodeAnnotatedWithMachineWithoutNodeReference,
expected: expectedReconcile{
result: reconcile.Result{
RequeueAfter: timeoutForMachineToHaveNode,
RequeueAfter: nodeStartupTimeout,
},
error: false,
},
Expand Down Expand Up @@ -1244,11 +1246,12 @@ func TestNeedsRemediation(t *testing.T) {
knownDate := metav1.Time{Time: time.Date(1985, 06, 03, 0, 0, 0, 0, time.Local)}
machineFailed := machinePhaseFailed
testCases := []struct {
testCase string
target *target
expectedNeedsRemediation bool
expectedNextCheck time.Duration
expectedError bool
testCase string
target *target
timeoutForMachineToHaveNode time.Duration
expectedNeedsRemediation bool
expectedNextCheck time.Duration
expectedError bool
}{
{
testCase: "healthy: does not met conditions criteria",
Expand Down Expand Up @@ -1307,9 +1310,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "unhealthy: node does not exist",
Expand Down Expand Up @@ -1346,9 +1350,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "unhealthy: nodeRef nil longer than timeout",
Expand All @@ -1364,7 +1369,7 @@ func TestNeedsRemediation(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-timeoutForMachineToHaveNode) - 1*time.Second)},
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-defaultNodeStartupTimeout) - 1*time.Second)},
},
},
Node: nil,
Expand Down Expand Up @@ -1398,9 +1403,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "unhealthy: meet conditions criteria",
Expand All @@ -1416,7 +1422,7 @@ func TestNeedsRemediation(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-timeoutForMachineToHaveNode) - 1*time.Second)},
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-defaultNodeStartupTimeout) - 1*time.Second)},
},
},
Node: &corev1.Node{
Expand Down Expand Up @@ -1472,9 +1478,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "unhealthy: machine phase failed",
Expand Down Expand Up @@ -1524,9 +1531,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "healthy: meet conditions criteria but timeout",
Expand All @@ -1542,7 +1550,7 @@ func TestNeedsRemediation(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-timeoutForMachineToHaveNode) - 1*time.Second)},
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-defaultNodeStartupTimeout) - 1*time.Second)},
},
},
Node: &corev1.Node{
Expand Down Expand Up @@ -1598,9 +1606,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(1 * time.Minute), // 300-200 rounded
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(1 * time.Minute), // 300-200 rounded
expectedError: false,
},
{
testCase: "error bad conditions timeout",
Expand All @@ -1616,7 +1625,7 @@ func TestNeedsRemediation(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-timeoutForMachineToHaveNode) - 1*time.Second)},
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-defaultNodeStartupTimeout) - 1*time.Second)},
},
},
Node: &corev1.Node{
Expand Down Expand Up @@ -1672,15 +1681,16 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(0),
expectedError: true,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(0),
expectedError: true,
},
}

for _, tc := range testCases {
t.Run(tc.testCase, func(t *testing.T) {
needsRemediation, nextCheck, err := tc.target.needsRemediation()
needsRemediation, nextCheck, err := tc.target.needsRemediation(tc.timeoutForMachineToHaveNode)
if needsRemediation != tc.expectedNeedsRemediation {
t.Errorf("Case: %v. Got: %v, expected: %v", tc.testCase, needsRemediation, tc.expectedNeedsRemediation)
}
Expand Down Expand Up @@ -2050,12 +2060,13 @@ func TestReconcileStatus(t *testing.T) {
func TestHealthCheckTargets(t *testing.T) {
now := time.Now()
testCases := []struct {
testCase string
targets []target
currentHealthy int
needRemediationTargets []target
nextCheckTimesLen int
errList []error
testCase string
targets []target
timeoutForMachineToHaveNode time.Duration
currentHealthy int
needRemediationTargets []target
nextCheckTimesLen int
errList []error
}{
{
testCase: "one healthy, one unhealthy",
Expand Down Expand Up @@ -2193,7 +2204,8 @@ func TestHealthCheckTargets(t *testing.T) {
},
},
},
currentHealthy: 1,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
currentHealthy: 1,
needRemediationTargets: []target{
{
Machine: mapiv1beta1.Machine{
Expand Down Expand Up @@ -2280,7 +2292,7 @@ func TestHealthCheckTargets(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: now.Add(time.Duration(-timeoutForMachineToHaveNode) + 1*time.Minute)},
LastUpdated: &metav1.Time{Time: now.Add(time.Duration(-defaultNodeStartupTimeout) + 1*time.Minute)},
},
},
Node: nil,
Expand Down Expand Up @@ -2381,16 +2393,17 @@ func TestHealthCheckTargets(t *testing.T) {
},
},
},
currentHealthy: 0,
nextCheckTimesLen: 2,
errList: []error{},
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
currentHealthy: 0,
nextCheckTimesLen: 2,
errList: []error{},
},
}
for _, tc := range testCases {
recorder := record.NewFakeRecorder(2)
r := newFakeReconcilerWithCustomRecorder(recorder)
t.Run(tc.testCase, func(t *testing.T) {
currentHealhty, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(tc.targets)
currentHealhty, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(tc.targets, tc.timeoutForMachineToHaveNode)
if currentHealhty != tc.currentHealthy {
t.Errorf("Case: %v. Got: %v, expected: %v", tc.testCase, currentHealhty, tc.currentHealthy)
}
Expand Down Expand Up @@ -2533,6 +2546,52 @@ func TestIsAllowedRemediation(t *testing.T) {
}
}

func TestGetNodeStartupTimeout(t *testing.T) {
testCases := []struct {
name string
timeout string
expectError bool
expectedTimeout time.Duration
}{
{
name: "when the timeout is not set",
timeout: "",
expectError: false,
expectedTimeout: defaultNodeStartupTimeout,
},
{
name: "when the timeout is not parsable",
timeout: "a",
expectError: true,
expectedTimeout: time.Duration(0),
},
{
name: "when the timeout is parsable",
timeout: "10s",
expectError: false,
expectedTimeout: 10 * time.Second,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
mhc := &mapiv1beta1.MachineHealthCheck{
Spec: mapiv1beta1.MachineHealthCheckSpec{
NodeStartupTimeout: tc.timeout,
},
}

timeout, err := getNodeStartupTimeout(mhc)
if tc.expectError != (err != nil) {
t.Errorf("Case: %s. Got: %v, expected: %v", tc.name, err, tc.expectError)
}
if timeout != tc.expectedTimeout {
t.Errorf("Case: %s. Got: %v, expected: %v", tc.name, timeout.String(), tc.expectedTimeout.String())
}
})
}
}

func IntPtr(i int) *int {
return &i
}