From 6cfe843b0c4b8c26496582ead88bc1ae5e93994f Mon Sep 17 00:00:00 2001 From: Marc Sluiter Date: Wed, 8 Feb 2023 11:05:27 +0100 Subject: [PATCH] Delete machine when reboot remediation fails Metal3 sets a specific condition in this case, which is evaluated by CAPI. In OCP we don't have this, so delete the machine directly (similar to the annotation based remediation in the machine actuator) Signed-off-by: Marc Sluiter --- pkg/baremetal/metal3remediation_manager.go | 57 ++++++++++++------- .../zz_generated.metal3remediation_manager.go | 43 +++++++++----- .../metal3remediation_controller.go | 19 ++++--- .../metal3remediation_controller_test.go | 2 +- 4 files changed, 78 insertions(+), 43 deletions(-) diff --git a/pkg/baremetal/metal3remediation_manager.go b/pkg/baremetal/metal3remediation_manager.go index 9a86c7a67..942d57ae7 100644 --- a/pkg/baremetal/metal3remediation_manager.go +++ b/pkg/baremetal/metal3remediation_manager.go @@ -44,7 +44,9 @@ const ( nodeLabelsBackupAnnotation = "remediation.metal3.io/node-labels-backup" // HostAnnotation is the key for an annotation that should go on a Metal3Machine to // reference what BareMetalHost it corresponds to. - HostAnnotation = "metal3.io/BareMetalHost" + HostAnnotation = "metal3.io/BareMetalHost" + machineRoleLabel = "machine.openshift.io/cluster-api-machine-role" + machineRoleMaster = "master" ) // RemediationManagerInterface is an interface for a RemediationManager. @@ -69,13 +71,17 @@ type RemediationManagerInterface interface { SetLastRemediationTime(remediationTime *metav1.Time) GetTimeout() *metav1.Duration IncreaseRetryCount() - SetOwnerRemediatedConditionNew(ctx context.Context) error GetNode(ctx context.Context) (*corev1.Node, error) UpdateNode(ctx context.Context, node *corev1.Node) error DeleteNode(ctx context.Context, node *corev1.Node) error SetNodeBackupAnnotations(annotations string, labels string) bool GetNodeBackupAnnotations() (annotations, labels string) RemoveNodeBackupAnnotations() + + // OCP specific methods, differs from upstream metal3 + + CanReprovision(context.Context) (bool, error) + DeleteMachine(ctx context.Context) error } // RemediationManager is responsible for performing remediation reconciliation. @@ -329,24 +335,6 @@ func (r *RemediationManager) IncreaseRetryCount() { r.Metal3Remediation.Status.RetryCount++ } -// SetOwnerRemediatedConditionNew sets MachineOwnerRemediatedCondition on CAPI machine object -// that have failed a healthcheck. -func (r *RemediationManager) SetOwnerRemediatedConditionNew(ctx context.Context) error { - machineHelper, err := patch.NewHelper(r.OCPMachine, r.Client) - if err != nil { - r.Log.Info("Unable to create patch helper for Machine") - return err - } - // TODO - //conditions.MarkFalse(r.OCPMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") - err = machineHelper.Patch(ctx, r.OCPMachine) - if err != nil { - r.Log.Info("Unable to patch Machine", "machine", r.OCPMachine) - return err - } - return nil -} - // GetNode returns the Node associated with the machine in the current context. func (r *RemediationManager) GetNode(ctx context.Context) (*corev1.Node, error) { @@ -432,3 +420,32 @@ func (r *RemediationManager) RemoveNodeBackupAnnotations() { func (r *RemediationManager) getPowerOffAnnotationKey() string { return fmt.Sprintf(powerOffAnnotation, r.Metal3Remediation.UID) } + +func (r *RemediationManager) CanReprovision(ctx context.Context) (bool, error) { + baremetalhost, _, err := r.GetUnhealthyHost(ctx) + if err != nil { + r.Log.Error(err, "Failed to get BMH for machine", "machine", r.OCPMachine.Name) + return false, err + } + if baremetalhost.Spec.ExternallyProvisioned { + r.Log.Info("Reprovisioning of machine not allowed: BMH is externally provisioned", "machine", r.OCPMachine.Name, "bmh", baremetalhost.Name) + return false, nil + } + if metav1.GetControllerOf(r.OCPMachine) == nil { + r.Log.Info("Reprovisioning of machine not allowed: no owning controller", "machine", r.OCPMachine.Name) + return false, nil + } + if r.OCPMachine.Labels[machineRoleLabel] == machineRoleMaster { + r.Log.Info("Reprovisioning of machine not allowed: has master role", "machine", r.OCPMachine.Name) + return false, nil + } + return true, nil +} + +func (r *RemediationManager) DeleteMachine(ctx context.Context) error { + err := r.Client.Delete(ctx, r.OCPMachine) + if err != nil { + r.Log.Error(err, "Failed to delete machine", "machine", r.OCPMachine.Name) + } + return err +} diff --git a/pkg/baremetal/mocks/zz_generated.metal3remediation_manager.go b/pkg/baremetal/mocks/zz_generated.metal3remediation_manager.go index e2938e51b..7766dc4a6 100644 --- a/pkg/baremetal/mocks/zz_generated.metal3remediation_manager.go +++ b/pkg/baremetal/mocks/zz_generated.metal3remediation_manager.go @@ -56,6 +56,35 @@ func (m *MockRemediationManagerInterface) EXPECT() *MockRemediationManagerInterf return m.recorder } +// CanReprovision mocks base method. +func (m *MockRemediationManagerInterface) CanReprovision(arg0 context.Context) (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CanReprovision", arg0) + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CanReprovision indicates an expected call of CanReprovision. +func (mr *MockRemediationManagerInterfaceMockRecorder) CanReprovision(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CanReprovision", reflect.TypeOf((*MockRemediationManagerInterface)(nil).CanReprovision), arg0) +} + +// DeleteMachine mocks base method. +func (m *MockRemediationManagerInterface) DeleteMachine(ctx context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DeleteMachine", ctx) + ret0, _ := ret[0].(error) + return ret0 +} + +// DeleteMachine indicates an expected call of DeleteMachine. +func (mr *MockRemediationManagerInterfaceMockRecorder) DeleteMachine(ctx interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteMachine", reflect.TypeOf((*MockRemediationManagerInterface)(nil).DeleteMachine), ctx) +} + // DeleteNode mocks base method. func (m *MockRemediationManagerInterface) DeleteNode(ctx context.Context, node *v1.Node) error { m.ctrl.T.Helper() @@ -334,20 +363,6 @@ func (mr *MockRemediationManagerInterfaceMockRecorder) SetNodeBackupAnnotations( return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetNodeBackupAnnotations", reflect.TypeOf((*MockRemediationManagerInterface)(nil).SetNodeBackupAnnotations), annotations, labels) } -// SetOwnerRemediatedConditionNew mocks base method. -func (m *MockRemediationManagerInterface) SetOwnerRemediatedConditionNew(ctx context.Context) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SetOwnerRemediatedConditionNew", ctx) - ret0, _ := ret[0].(error) - return ret0 -} - -// SetOwnerRemediatedConditionNew indicates an expected call of SetOwnerRemediatedConditionNew. -func (mr *MockRemediationManagerInterfaceMockRecorder) SetOwnerRemediatedConditionNew(ctx interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetOwnerRemediatedConditionNew", reflect.TypeOf((*MockRemediationManagerInterface)(nil).SetOwnerRemediatedConditionNew), ctx) -} - // SetPowerOffAnnotation mocks base method. func (m *MockRemediationManagerInterface) SetPowerOffAnnotation(ctx context.Context) error { m.ctrl.T.Helper() diff --git a/pkg/controller/metal3remediation/metal3remediation_controller.go b/pkg/controller/metal3remediation/metal3remediation_controller.go index 886d8915e..798fd8d20 100644 --- a/pkg/controller/metal3remediation/metal3remediation_controller.go +++ b/pkg/controller/metal3remediation/metal3remediation_controller.go @@ -253,14 +253,17 @@ func (r *Metal3RemediationReconciler) reconcileNormal(ctx context.Context, r.Log.Info("Remediation timed out and retry limit reached") - // TODO what to do here in OCP - // When machine is still unhealthy after remediation, setting of OwnerRemediatedCondition - // moves control to CAPI machine controller. The owning controller will do - // preflight checks and handles the Machine deletion - err = remediationMgr.SetOwnerRemediatedConditionNew(ctx) - if err != nil { - r.Log.Error(err, "error setting cluster api conditions") - return ctrl.Result{}, errors.Wrapf(err, "error setting cluster api conditions") + // When machine is still unhealthy after remediation, and it can be re-provisioned, delete the machine. + // Note: this differs to the upstream metal3 remediation, which sets a condition which is handled by CAPI + if ok, err := remediationMgr.CanReprovision(ctx); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "Failed to check if machine can be reprovisoned") + } else if ok { + r.Log.Info("Deleting machine") + if err := remediationMgr.DeleteMachine(ctx); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "Failed to delete machine") + } + } else { + r.Log.Info("Machine can't be re-provisioned, will not delete it") } // Remediation failed, so set unhealthy annotation on BMH diff --git a/pkg/controller/metal3remediation/metal3remediation_controller_test.go b/pkg/controller/metal3remediation/metal3remediation_controller_test.go index 92c9d9f6f..76e68c60c 100644 --- a/pkg/controller/metal3remediation/metal3remediation_controller_test.go +++ b/pkg/controller/metal3remediation/metal3remediation_controller_test.go @@ -175,7 +175,7 @@ func setReconcileNormalRemediationExpectations(ctrl *gomock.Controller, m.EXPECT().IncreaseRetryCount() return m } - m.EXPECT().SetOwnerRemediatedConditionNew(context.TODO()) + m.EXPECT().CanReprovision(context.TODO()) m.EXPECT().SetUnhealthyAnnotation(context.TODO()) m.EXPECT().SetRemediationPhase(infrav1.PhaseDeleting) }