Skip to content

Commit

Permalink
Use NodeStartupTimeout from MHC rather than const
Browse files Browse the repository at this point in the history
  • Loading branch information
JoelSpeed committed Feb 27, 2020
1 parent 08699b2 commit 9d826c6
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 49 deletions.
28 changes: 23 additions & 5 deletions pkg/controller/machinehealthcheck/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ const (
machinePhaseFailed = "Failed"
remediationStrategyAnnotation = "machine.openshift.io/remediation-strategy"
remediationStrategyExternal = mapiv1.RemediationStrategyType("external-baremetal")
timeoutForMachineToHaveNode = 10 * time.Minute
defaultNodeStartupTimeout = 10 * time.Minute
machineNodeNameIndex = "machineNodeNameIndex"
controllerName = "machinehealthcheck-controller"

Expand Down Expand Up @@ -166,8 +166,14 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco
}
totalTargets := len(targets)

nodeStartupTimeout, err := getNodeStartupTimeout(mhc)
if err != nil {
glog.Errorf("Reconciling %s: error getting NodeStartupTimeout: %v", request.String(), err)
return reconcile.Result{}, err
}

// health check all targets and reconcile mhc status
currentHealthy, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(targets)
currentHealthy, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(targets, nodeStartupTimeout)
if err := r.reconcileStatus(mhc, totalTargets, currentHealthy); err != nil {
glog.Errorf("Reconciling %s: error patching status: %v", request.String(), err)
return reconcile.Result{}, err
Expand Down Expand Up @@ -259,14 +265,14 @@ func (r *ReconcileMachineHealthCheck) reconcileStatus(mhc *mapiv1.MachineHealthC

// healthCheckTargets health checks a slice of targets
// and gives a data to measure the average health
func (r *ReconcileMachineHealthCheck) healthCheckTargets(targets []target) (int, []target, []time.Duration, []error) {
func (r *ReconcileMachineHealthCheck) healthCheckTargets(targets []target, timeoutForMachineToHaveNode time.Duration) (int, []target, []time.Duration, []error) {
var nextCheckTimes []time.Duration
var errList []error
var needRemediationTargets []target
var currentHealthy int
for _, t := range targets {
glog.V(3).Infof("Reconciling %s: health checking", t.string())
needsRemediation, nextCheck, err := t.needsRemediation()
needsRemediation, nextCheck, err := t.needsRemediation(timeoutForMachineToHaveNode)
if err != nil {
glog.Errorf("Reconciling %s: error health checking: %v", t.string(), err)
errList = append(errList, err)
Expand Down Expand Up @@ -549,7 +555,7 @@ func (t *target) isMaster() bool {
return false
}

func (t *target) needsRemediation() (bool, time.Duration, error) {
func (t *target) needsRemediation(timeoutForMachineToHaveNode time.Duration) (bool, time.Duration, error) {
var nextCheckTimes []time.Duration
now := time.Now()

Expand Down Expand Up @@ -673,3 +679,15 @@ func hasMatchingLabels(machineHealthCheck *mapiv1.MachineHealthCheck, machine *m
}
return true
}

func getNodeStartupTimeout(mhc *mapiv1.MachineHealthCheck) (time.Duration, error) {
if mhc.Spec.NodeStartupTimeout == "" {
return defaultNodeStartupTimeout, nil
}

timeout, err := time.ParseDuration(mhc.Spec.NodeStartupTimeout)
if err != nil {
return time.Duration(0), fmt.Errorf("error parsing NodeStartupTimeout: %v", err)
}
return timeout, nil
}
147 changes: 103 additions & 44 deletions pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ func TestReconcile(t *testing.T) {
machineWithoutNodeRef.Status.NodeRef = nil

machineHealthCheck := maotesting.NewMachineHealthCheck("machineHealthCheck")
nodeStartupTimeout := 15 * time.Minute
machineHealthCheck.Spec.NodeStartupTimeout = nodeStartupTimeout.String()

// remediationExternal
nodeUnhealthyForTooLong := maotesting.NewNode("nodeUnhealthyForTooLong", false)
Expand Down Expand Up @@ -280,7 +282,7 @@ func TestReconcile(t *testing.T) {
node: nodeAnnotatedWithMachineWithoutNodeReference,
expected: expectedReconcile{
result: reconcile.Result{
RequeueAfter: timeoutForMachineToHaveNode,
RequeueAfter: nodeStartupTimeout,
},
error: false,
},
Expand Down Expand Up @@ -1244,11 +1246,12 @@ func TestNeedsRemediation(t *testing.T) {
knownDate := metav1.Time{Time: time.Date(1985, 06, 03, 0, 0, 0, 0, time.Local)}
machineFailed := machinePhaseFailed
testCases := []struct {
testCase string
target *target
expectedNeedsRemediation bool
expectedNextCheck time.Duration
expectedError bool
testCase string
target *target
timeoutForMachineToHaveNode time.Duration
expectedNeedsRemediation bool
expectedNextCheck time.Duration
expectedError bool
}{
{
testCase: "healthy: does not met conditions criteria",
Expand Down Expand Up @@ -1307,9 +1310,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "unhealthy: node does not exist",
Expand Down Expand Up @@ -1346,9 +1350,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "unhealthy: nodeRef nil longer than timeout",
Expand All @@ -1364,7 +1369,7 @@ func TestNeedsRemediation(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-timeoutForMachineToHaveNode) - 1*time.Second)},
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-defaultNodeStartupTimeout) - 1*time.Second)},
},
},
Node: nil,
Expand Down Expand Up @@ -1398,9 +1403,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "unhealthy: meet conditions criteria",
Expand All @@ -1416,7 +1422,7 @@ func TestNeedsRemediation(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-timeoutForMachineToHaveNode) - 1*time.Second)},
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-defaultNodeStartupTimeout) - 1*time.Second)},
},
},
Node: &corev1.Node{
Expand Down Expand Up @@ -1472,9 +1478,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "unhealthy: machine phase failed",
Expand Down Expand Up @@ -1524,9 +1531,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
},
{
testCase: "healthy: meet conditions criteria but timeout",
Expand All @@ -1542,7 +1550,7 @@ func TestNeedsRemediation(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-timeoutForMachineToHaveNode) - 1*time.Second)},
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-defaultNodeStartupTimeout) - 1*time.Second)},
},
},
Node: &corev1.Node{
Expand Down Expand Up @@ -1598,9 +1606,10 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(1 * time.Minute), // 300-200 rounded
expectedError: false,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(1 * time.Minute), // 300-200 rounded
expectedError: false,
},
{
testCase: "error bad conditions timeout",
Expand All @@ -1616,7 +1625,7 @@ func TestNeedsRemediation(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-timeoutForMachineToHaveNode) - 1*time.Second)},
LastUpdated: &metav1.Time{Time: time.Now().Add(time.Duration(-defaultNodeStartupTimeout) - 1*time.Second)},
},
},
Node: &corev1.Node{
Expand Down Expand Up @@ -1672,15 +1681,16 @@ func TestNeedsRemediation(t *testing.T) {
Status: mapiv1beta1.MachineHealthCheckStatus{},
},
},
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(0),
expectedError: true,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(0),
expectedError: true,
},
}

for _, tc := range testCases {
t.Run(tc.testCase, func(t *testing.T) {
needsRemediation, nextCheck, err := tc.target.needsRemediation()
needsRemediation, nextCheck, err := tc.target.needsRemediation(tc.timeoutForMachineToHaveNode)
if needsRemediation != tc.expectedNeedsRemediation {
t.Errorf("Case: %v. Got: %v, expected: %v", tc.testCase, needsRemediation, tc.expectedNeedsRemediation)
}
Expand Down Expand Up @@ -2050,12 +2060,13 @@ func TestReconcileStatus(t *testing.T) {
func TestHealthCheckTargets(t *testing.T) {
now := time.Now()
testCases := []struct {
testCase string
targets []target
currentHealthy int
needRemediationTargets []target
nextCheckTimesLen int
errList []error
testCase string
targets []target
timeoutForMachineToHaveNode time.Duration
currentHealthy int
needRemediationTargets []target
nextCheckTimesLen int
errList []error
}{
{
testCase: "one healthy, one unhealthy",
Expand Down Expand Up @@ -2193,7 +2204,8 @@ func TestHealthCheckTargets(t *testing.T) {
},
},
},
currentHealthy: 1,
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
currentHealthy: 1,
needRemediationTargets: []target{
{
Machine: mapiv1beta1.Machine{
Expand Down Expand Up @@ -2280,7 +2292,7 @@ func TestHealthCheckTargets(t *testing.T) {
},
Spec: mapiv1beta1.MachineSpec{},
Status: mapiv1beta1.MachineStatus{
LastUpdated: &metav1.Time{Time: now.Add(time.Duration(-timeoutForMachineToHaveNode) + 1*time.Minute)},
LastUpdated: &metav1.Time{Time: now.Add(time.Duration(-defaultNodeStartupTimeout) + 1*time.Minute)},
},
},
Node: nil,
Expand Down Expand Up @@ -2381,16 +2393,17 @@ func TestHealthCheckTargets(t *testing.T) {
},
},
},
currentHealthy: 0,
nextCheckTimesLen: 2,
errList: []error{},
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
currentHealthy: 0,
nextCheckTimesLen: 2,
errList: []error{},
},
}
for _, tc := range testCases {
recorder := record.NewFakeRecorder(2)
r := newFakeReconcilerWithCustomRecorder(recorder)
t.Run(tc.testCase, func(t *testing.T) {
currentHealhty, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(tc.targets)
currentHealhty, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(tc.targets, tc.timeoutForMachineToHaveNode)
if currentHealhty != tc.currentHealthy {
t.Errorf("Case: %v. Got: %v, expected: %v", tc.testCase, currentHealhty, tc.currentHealthy)
}
Expand Down Expand Up @@ -2533,6 +2546,52 @@ func TestIsAllowedRemediation(t *testing.T) {
}
}

func TestGetNodeStartupTimeout(t *testing.T) {
testCases := []struct {
name string
timeout string
expectError bool
expectedTimeout time.Duration
}{
{
name: "when the timeout is not set",
timeout: "",
expectError: false,
expectedTimeout: defaultNodeStartupTimeout,
},
{
name: "when the timeout is not parsable",
timeout: "a",
expectError: true,
expectedTimeout: time.Duration(0),
},
{
name: "when the timeout is parsable",
timeout: "10s",
expectError: false,
expectedTimeout: 10 * time.Second,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
mhc := &mapiv1beta1.MachineHealthCheck{
Spec: mapiv1beta1.MachineHealthCheckSpec{
NodeStartupTimeout: tc.timeout,
},
}

timeout, err := getNodeStartupTimeout(mhc)
if tc.expectError != (err != nil) {
t.Errorf("Case: %s. Got: %v, expected: %v", tc.name, err, tc.expectError)
}
if timeout != tc.expectedTimeout {
t.Errorf("Case: %s. Got: %v, expected: %v", tc.name, timeout.String(), tc.expectedTimeout.String())
}
})
}
}

func IntPtr(i int) *int {
return &i
}

0 comments on commit 9d826c6

Please sign in to comment.