From 2f4593e81f3090a3637bc1d85b524b3b7b5b8405 Mon Sep 17 00:00:00 2001 From: Honza Pokorny Date: Thu, 20 Oct 2022 09:45:06 -0300 Subject: [PATCH] Power off nodes upon deletion Co-authored-by: Sandhya Dasu --- .../metal3.io/v1alpha1/baremetalhost_types.go | 3 ++ .../metal3.io/baremetalhost_controller.go | 29 ++++++++++++++++++- .../metal3.io/host_state_machine_test.go | 2 +- pkg/provisioner/demo/demo.go | 2 +- pkg/provisioner/fixture/fixture.go | 4 +-- pkg/provisioner/ironic/delete_test.go | 2 +- pkg/provisioner/ironic/ironic.go | 4 +-- pkg/provisioner/provisioner.go | 2 +- 8 files changed, 39 insertions(+), 9 deletions(-) diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index d53586ec10..9f1563e44c 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -157,6 +157,9 @@ const ( // DetachError is an error condition occurring when the // controller is unable to detatch the host from the provisioner DetachError ErrorType = "detach error" + // DeleteError is an error condition occurring when the controller + // is unable to delete the host. + DeleteError ErrorType = "delete error" ) // ProvisioningState defines the states the provisioner will report diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 7d086d0db3..5d9b3f526a 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -62,6 +62,7 @@ const ( hardwareDetailsAnnotation = inspectAnnotationPrefix + "/hardwaredetails" clarifySoftPoweroffFailure = "Continuing with hard poweroff after soft poweroff fails. More details: " hardwareDataFinalizer = metal3v1alpha1.BareMetalHostFinalizer + "/hardwareData" + maxPowerOffRetryCount = 3 ) // BareMetalHostReconciler reconciles a BareMetalHost object @@ -321,6 +322,7 @@ func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType metal3v1alpha1.InspectionError: "InspectionError", metal3v1alpha1.ProvisioningError: "ProvisioningError", metal3v1alpha1.PowerManagementError: "PowerManagementError", + metal3v1alpha1.DeleteError: "DeleteError", }[errorType] counter := actionFailureCounters.WithLabelValues(eventType) @@ -494,10 +496,35 @@ func (r *BareMetalHostReconciler) actionDeleting(prov provisioner.Provisioner, i return deleteComplete{} } - provResult, err := prov.Delete() + // Only try to power off `maxPowerOffRetryCount` times before giving up, and deleting the host anyway + if info.host.Status.ErrorType != metal3v1alpha1.PowerManagementError || (info.host.Status.ErrorType == metal3v1alpha1.PowerManagementError && info.host.Status.ErrorCount < maxPowerOffRetryCount) { + info.log.Info("host ready to be powered off") + provResult, err := prov.PowerOff( + metal3v1alpha1.RebootModeHard, + info.host.Status.ErrorType == metal3v1alpha1.PowerManagementError) + + if err != nil { + return actionError{errors.Wrap(err, "failed to power off")} + } + + if provResult.ErrorMessage != "" { + return recordActionFailure(info, metal3v1alpha1.PowerManagementError, provResult.ErrorMessage) + } + + if provResult.Dirty { + return actionContinue{provResult.RequeueAfter} + } + } + + provResult, err := prov.Delete(info.host.Status.ErrorType == metal3v1alpha1.DeleteError) if err != nil { return actionError{errors.Wrap(err, "failed to delete")} } + + if provResult.ErrorMessage != "" { + return recordActionFailure(info, metal3v1alpha1.DeleteError, provResult.ErrorMessage) + } + if provResult.Dirty { return actionContinue{provResult.RequeueAfter} } diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index f7901c8c97..12e7c5b448 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -1287,7 +1287,7 @@ func (m *mockProvisioner) Deprovision(force bool) (result provisioner.Result, er return m.getNextResultByMethod("Deprovision"), err } -func (m *mockProvisioner) Delete() (result provisioner.Result, err error) { +func (m *mockProvisioner) Delete(force bool) (result provisioner.Result, err error) { return m.getNextResultByMethod("Delete"), err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index b38da17684..579bdad390 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -292,7 +292,7 @@ func (p *demoProvisioner) Deprovision(force bool) (result provisioner.Result, er // Delete removes the host from the provisioning system. It may be // called multiple times, and should return true for its dirty flag // until the deprovisioning operation is completed. -func (p *demoProvisioner) Delete() (result provisioner.Result, err error) { +func (p *demoProvisioner) Delete(force bool) (result provisioner.Result, err error) { p.log.Info("deleting host") return result, nil } diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index 2628e51103..b024f1616e 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -271,7 +271,7 @@ func (p *fixtureProvisioner) Deprovision(force bool) (result provisioner.Result, // Delete removes the host from the provisioning system. It may be // called multiple times, and should return true for its dirty flag // until the deprovisioning operation is completed. -func (p *fixtureProvisioner) Delete() (result provisioner.Result, err error) { +func (p *fixtureProvisioner) Delete(force bool) (result provisioner.Result, err error) { p.log.Info("deleting host") if !p.state.Deleted { @@ -290,7 +290,7 @@ func (p *fixtureProvisioner) Delete() (result provisioner.Result, err error) { // and should return true for its dirty flag until the // deletion operation is completed. func (p *fixtureProvisioner) Detach() (result provisioner.Result, err error) { - return p.Delete() + return p.Delete(true) } // PowerOn ensures the server is powered on independently of any image diff --git a/pkg/provisioner/ironic/delete_test.go b/pkg/provisioner/ironic/delete_test.go index 3c7eba1093..3588eebd98 100644 --- a/pkg/provisioner/ironic/delete_test.go +++ b/pkg/provisioner/ironic/delete_test.go @@ -206,7 +206,7 @@ func deleteTest(t *testing.T, detach bool) { if detach { result, err = prov.Detach() } else { - result, err = prov.Delete() + result, err = prov.Delete(true) } assert.Equal(t, tc.expectedDirty, result.Dirty) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index a4a73f9c9e..2be60297de 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -1707,7 +1707,7 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result, // Delete removes the host from the provisioning system. It may be // called multiple times, and should return true for its dirty flag // until the deprovisioning operation is completed. -func (p *ironicProvisioner) Delete() (result provisioner.Result, err error) { +func (p *ironicProvisioner) Delete(force bool) (result provisioner.Result, err error) { ironicNode, err := p.getNode() if err != nil { if errors.Is(err, provisioner.ErrNeedsRegistration) { @@ -1777,7 +1777,7 @@ func (p *ironicProvisioner) Delete() (result provisioner.Result, err error) { // deletion operation is completed. func (p *ironicProvisioner) Detach() (result provisioner.Result, err error) { // Currently the same behavior as Delete() - return p.Delete() + return p.Delete(true) } // softPowerOffUnsupportedError is returned when the BMC does not diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 74b671bdba..a16027751f 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -164,7 +164,7 @@ type Provisioner interface { // Delete removes the host from the provisioning system. It may be // called multiple times, and should return true for its dirty // flag until the deletion operation is completed. - Delete() (result Result, err error) + Delete(force bool) (result Result, err error) // Detach removes the host from the provisioning system. // Similar to Delete, but ensures non-interruptive behavior