diff --git a/pkg/baremetal/metal3remediation_manager.go b/pkg/baremetal/metal3remediation_manager.go index 9a86c7a67..942d57ae7 100644 --- a/pkg/baremetal/metal3remediation_manager.go +++ b/pkg/baremetal/metal3remediation_manager.go @@ -44,7 +44,9 @@ const ( nodeLabelsBackupAnnotation = "remediation.metal3.io/node-labels-backup" // HostAnnotation is the key for an annotation that should go on a Metal3Machine to // reference what BareMetalHost it corresponds to. - HostAnnotation = "metal3.io/BareMetalHost" + HostAnnotation = "metal3.io/BareMetalHost" + machineRoleLabel = "machine.openshift.io/cluster-api-machine-role" + machineRoleMaster = "master" ) // RemediationManagerInterface is an interface for a RemediationManager. @@ -69,13 +71,17 @@ type RemediationManagerInterface interface { SetLastRemediationTime(remediationTime *metav1.Time) GetTimeout() *metav1.Duration IncreaseRetryCount() - SetOwnerRemediatedConditionNew(ctx context.Context) error GetNode(ctx context.Context) (*corev1.Node, error) UpdateNode(ctx context.Context, node *corev1.Node) error DeleteNode(ctx context.Context, node *corev1.Node) error SetNodeBackupAnnotations(annotations string, labels string) bool GetNodeBackupAnnotations() (annotations, labels string) RemoveNodeBackupAnnotations() + + // OCP specific methods, differs from upstream metal3 + + CanReprovision(context.Context) (bool, error) + DeleteMachine(ctx context.Context) error } // RemediationManager is responsible for performing remediation reconciliation. @@ -329,24 +335,6 @@ func (r *RemediationManager) IncreaseRetryCount() { r.Metal3Remediation.Status.RetryCount++ } -// SetOwnerRemediatedConditionNew sets MachineOwnerRemediatedCondition on CAPI machine object -// that have failed a healthcheck. -func (r *RemediationManager) SetOwnerRemediatedConditionNew(ctx context.Context) error { - machineHelper, err := patch.NewHelper(r.OCPMachine, r.Client) - if err != nil { - r.Log.Info("Unable to create patch helper for Machine") - return err - } - // TODO - //conditions.MarkFalse(r.OCPMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") - err = machineHelper.Patch(ctx, r.OCPMachine) - if err != nil { - r.Log.Info("Unable to patch Machine", "machine", r.OCPMachine) - return err - } - return nil -} - // GetNode returns the Node associated with the machine in the current context. func (r *RemediationManager) GetNode(ctx context.Context) (*corev1.Node, error) { @@ -432,3 +420,32 @@ func (r *RemediationManager) RemoveNodeBackupAnnotations() { func (r *RemediationManager) getPowerOffAnnotationKey() string { return fmt.Sprintf(powerOffAnnotation, r.Metal3Remediation.UID) } + +func (r *RemediationManager) CanReprovision(ctx context.Context) (bool, error) { + baremetalhost, _, err := r.GetUnhealthyHost(ctx) + if err != nil { + r.Log.Error(err, "Failed to get BMH for machine", "machine", r.OCPMachine.Name) + return false, err + } + if baremetalhost.Spec.ExternallyProvisioned { + r.Log.Info("Reprovisioning of machine not allowed: BMH is externally provisioned", "machine", r.OCPMachine.Name, "bmh", baremetalhost.Name) + return false, nil + } + if metav1.GetControllerOf(r.OCPMachine) == nil { + r.Log.Info("Reprovisioning of machine not allowed: no owning controller", "machine", r.OCPMachine.Name) + return false, nil + } + if r.OCPMachine.Labels[machineRoleLabel] == machineRoleMaster { + r.Log.Info("Reprovisioning of machine not allowed: has master role", "machine", r.OCPMachine.Name) + return false, nil + } + return true, nil +} + +func (r *RemediationManager) DeleteMachine(ctx context.Context) error { + err := r.Client.Delete(ctx, r.OCPMachine) + if err != nil { + r.Log.Error(err, "Failed to delete machine", "machine", r.OCPMachine.Name) + } + return err +} diff --git a/pkg/baremetal/mocks/zz_generated.metal3remediation_manager.go b/pkg/baremetal/mocks/zz_generated.metal3remediation_manager.go index e2938e51b..7766dc4a6 100644 --- a/pkg/baremetal/mocks/zz_generated.metal3remediation_manager.go +++ b/pkg/baremetal/mocks/zz_generated.metal3remediation_manager.go @@ -56,6 +56,35 @@ func (m *MockRemediationManagerInterface) EXPECT() *MockRemediationManagerInterf return m.recorder } +// CanReprovision mocks base method. +func (m *MockRemediationManagerInterface) CanReprovision(arg0 context.Context) (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CanReprovision", arg0) + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CanReprovision indicates an expected call of CanReprovision. +func (mr *MockRemediationManagerInterfaceMockRecorder) CanReprovision(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CanReprovision", reflect.TypeOf((*MockRemediationManagerInterface)(nil).CanReprovision), arg0) +} + +// DeleteMachine mocks base method. +func (m *MockRemediationManagerInterface) DeleteMachine(ctx context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DeleteMachine", ctx) + ret0, _ := ret[0].(error) + return ret0 +} + +// DeleteMachine indicates an expected call of DeleteMachine. +func (mr *MockRemediationManagerInterfaceMockRecorder) DeleteMachine(ctx interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteMachine", reflect.TypeOf((*MockRemediationManagerInterface)(nil).DeleteMachine), ctx) +} + // DeleteNode mocks base method. func (m *MockRemediationManagerInterface) DeleteNode(ctx context.Context, node *v1.Node) error { m.ctrl.T.Helper() @@ -334,20 +363,6 @@ func (mr *MockRemediationManagerInterfaceMockRecorder) SetNodeBackupAnnotations( return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetNodeBackupAnnotations", reflect.TypeOf((*MockRemediationManagerInterface)(nil).SetNodeBackupAnnotations), annotations, labels) } -// SetOwnerRemediatedConditionNew mocks base method. -func (m *MockRemediationManagerInterface) SetOwnerRemediatedConditionNew(ctx context.Context) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SetOwnerRemediatedConditionNew", ctx) - ret0, _ := ret[0].(error) - return ret0 -} - -// SetOwnerRemediatedConditionNew indicates an expected call of SetOwnerRemediatedConditionNew. -func (mr *MockRemediationManagerInterfaceMockRecorder) SetOwnerRemediatedConditionNew(ctx interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetOwnerRemediatedConditionNew", reflect.TypeOf((*MockRemediationManagerInterface)(nil).SetOwnerRemediatedConditionNew), ctx) -} - // SetPowerOffAnnotation mocks base method. func (m *MockRemediationManagerInterface) SetPowerOffAnnotation(ctx context.Context) error { m.ctrl.T.Helper() diff --git a/pkg/controller/metal3remediation/metal3remediation_controller.go b/pkg/controller/metal3remediation/metal3remediation_controller.go index 886d8915e..798fd8d20 100644 --- a/pkg/controller/metal3remediation/metal3remediation_controller.go +++ b/pkg/controller/metal3remediation/metal3remediation_controller.go @@ -253,14 +253,17 @@ func (r *Metal3RemediationReconciler) reconcileNormal(ctx context.Context, r.Log.Info("Remediation timed out and retry limit reached") - // TODO what to do here in OCP - // When machine is still unhealthy after remediation, setting of OwnerRemediatedCondition - // moves control to CAPI machine controller. The owning controller will do - // preflight checks and handles the Machine deletion - err = remediationMgr.SetOwnerRemediatedConditionNew(ctx) - if err != nil { - r.Log.Error(err, "error setting cluster api conditions") - return ctrl.Result{}, errors.Wrapf(err, "error setting cluster api conditions") + // When machine is still unhealthy after remediation, and it can be re-provisioned, delete the machine. + // Note: this differs to the upstream metal3 remediation, which sets a condition which is handled by CAPI + if ok, err := remediationMgr.CanReprovision(ctx); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "Failed to check if machine can be reprovisoned") + } else if ok { + r.Log.Info("Deleting machine") + if err := remediationMgr.DeleteMachine(ctx); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "Failed to delete machine") + } + } else { + r.Log.Info("Machine can't be re-provisioned, will not delete it") } // Remediation failed, so set unhealthy annotation on BMH diff --git a/pkg/controller/metal3remediation/metal3remediation_controller_test.go b/pkg/controller/metal3remediation/metal3remediation_controller_test.go index 92c9d9f6f..76e68c60c 100644 --- a/pkg/controller/metal3remediation/metal3remediation_controller_test.go +++ b/pkg/controller/metal3remediation/metal3remediation_controller_test.go @@ -175,7 +175,7 @@ func setReconcileNormalRemediationExpectations(ctrl *gomock.Controller, m.EXPECT().IncreaseRetryCount() return m } - m.EXPECT().SetOwnerRemediatedConditionNew(context.TODO()) + m.EXPECT().CanReprovision(context.TODO()) m.EXPECT().SetUnhealthyAnnotation(context.TODO()) m.EXPECT().SetRemediationPhase(infrav1.PhaseDeleting) }