Skip to content

Commit

Permalink
Delete machine when reboot remediation fails
Browse files Browse the repository at this point in the history
Metal3 sets a specific condition in this case,
which is evaluated by CAPI. In OCP we don't have
this, so delete the machine directly (similar to
the annotation based remediation in the machine
actuator)

Signed-off-by: Marc Sluiter <msluiter@redhat.com>
  • Loading branch information
slintes committed Feb 8, 2023
1 parent dd83434 commit 6cfe843
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 43 deletions.
57 changes: 37 additions & 20 deletions pkg/baremetal/metal3remediation_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ const (
nodeLabelsBackupAnnotation = "remediation.metal3.io/node-labels-backup"
// HostAnnotation is the key for an annotation that should go on a Metal3Machine to
// reference what BareMetalHost it corresponds to.
HostAnnotation = "metal3.io/BareMetalHost"
HostAnnotation = "metal3.io/BareMetalHost"
machineRoleLabel = "machine.openshift.io/cluster-api-machine-role"
machineRoleMaster = "master"
)

// RemediationManagerInterface is an interface for a RemediationManager.
Expand All @@ -69,13 +71,17 @@ type RemediationManagerInterface interface {
SetLastRemediationTime(remediationTime *metav1.Time)
GetTimeout() *metav1.Duration
IncreaseRetryCount()
SetOwnerRemediatedConditionNew(ctx context.Context) error
GetNode(ctx context.Context) (*corev1.Node, error)
UpdateNode(ctx context.Context, node *corev1.Node) error
DeleteNode(ctx context.Context, node *corev1.Node) error
SetNodeBackupAnnotations(annotations string, labels string) bool
GetNodeBackupAnnotations() (annotations, labels string)
RemoveNodeBackupAnnotations()

// OCP specific methods, differs from upstream metal3

CanReprovision(context.Context) (bool, error)
DeleteMachine(ctx context.Context) error
}

// RemediationManager is responsible for performing remediation reconciliation.
Expand Down Expand Up @@ -329,24 +335,6 @@ func (r *RemediationManager) IncreaseRetryCount() {
r.Metal3Remediation.Status.RetryCount++
}

// SetOwnerRemediatedConditionNew sets MachineOwnerRemediatedCondition on CAPI machine object
// that have failed a healthcheck.
func (r *RemediationManager) SetOwnerRemediatedConditionNew(ctx context.Context) error {
machineHelper, err := patch.NewHelper(r.OCPMachine, r.Client)
if err != nil {
r.Log.Info("Unable to create patch helper for Machine")
return err
}
// TODO
//conditions.MarkFalse(r.OCPMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
err = machineHelper.Patch(ctx, r.OCPMachine)
if err != nil {
r.Log.Info("Unable to patch Machine", "machine", r.OCPMachine)
return err
}
return nil
}

// GetNode returns the Node associated with the machine in the current context.
func (r *RemediationManager) GetNode(ctx context.Context) (*corev1.Node, error) {

Expand Down Expand Up @@ -432,3 +420,32 @@ func (r *RemediationManager) RemoveNodeBackupAnnotations() {
func (r *RemediationManager) getPowerOffAnnotationKey() string {
return fmt.Sprintf(powerOffAnnotation, r.Metal3Remediation.UID)
}

func (r *RemediationManager) CanReprovision(ctx context.Context) (bool, error) {
baremetalhost, _, err := r.GetUnhealthyHost(ctx)
if err != nil {
r.Log.Error(err, "Failed to get BMH for machine", "machine", r.OCPMachine.Name)
return false, err
}
if baremetalhost.Spec.ExternallyProvisioned {
r.Log.Info("Reprovisioning of machine not allowed: BMH is externally provisioned", "machine", r.OCPMachine.Name, "bmh", baremetalhost.Name)
return false, nil
}
if metav1.GetControllerOf(r.OCPMachine) == nil {
r.Log.Info("Reprovisioning of machine not allowed: no owning controller", "machine", r.OCPMachine.Name)
return false, nil
}
if r.OCPMachine.Labels[machineRoleLabel] == machineRoleMaster {
r.Log.Info("Reprovisioning of machine not allowed: has master role", "machine", r.OCPMachine.Name)
return false, nil
}
return true, nil
}

func (r *RemediationManager) DeleteMachine(ctx context.Context) error {
err := r.Client.Delete(ctx, r.OCPMachine)
if err != nil {
r.Log.Error(err, "Failed to delete machine", "machine", r.OCPMachine.Name)
}
return err
}
43 changes: 29 additions & 14 deletions pkg/baremetal/mocks/zz_generated.metal3remediation_manager.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 11 additions & 8 deletions pkg/controller/metal3remediation/metal3remediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,14 +253,17 @@ func (r *Metal3RemediationReconciler) reconcileNormal(ctx context.Context,

r.Log.Info("Remediation timed out and retry limit reached")

// TODO what to do here in OCP
// When machine is still unhealthy after remediation, setting of OwnerRemediatedCondition
// moves control to CAPI machine controller. The owning controller will do
// preflight checks and handles the Machine deletion
err = remediationMgr.SetOwnerRemediatedConditionNew(ctx)
if err != nil {
r.Log.Error(err, "error setting cluster api conditions")
return ctrl.Result{}, errors.Wrapf(err, "error setting cluster api conditions")
// When machine is still unhealthy after remediation, and it can be re-provisioned, delete the machine.
// Note: this differs to the upstream metal3 remediation, which sets a condition which is handled by CAPI
if ok, err := remediationMgr.CanReprovision(ctx); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "Failed to check if machine can be reprovisoned")
} else if ok {
r.Log.Info("Deleting machine")
if err := remediationMgr.DeleteMachine(ctx); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "Failed to delete machine")
}
} else {
r.Log.Info("Machine can't be re-provisioned, will not delete it")
}

// Remediation failed, so set unhealthy annotation on BMH
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ func setReconcileNormalRemediationExpectations(ctrl *gomock.Controller,
m.EXPECT().IncreaseRetryCount()
return m
}
m.EXPECT().SetOwnerRemediatedConditionNew(context.TODO())
m.EXPECT().CanReprovision(context.TODO())
m.EXPECT().SetUnhealthyAnnotation(context.TODO())
m.EXPECT().SetRemediationPhase(infrav1.PhaseDeleting)
}
Expand Down

0 comments on commit 6cfe843

Please sign in to comment.