diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go
index 7f71a1badb..40891f190c 100644
--- a/controllers/metal3.io/host_state_machine.go
+++ b/controllers/metal3.io/host_state_machine.go
@@ -517,6 +517,8 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes
 			// If the provisioner gives up deprovisioning and
 			// deletion has been requested, continue to delete.
 			if hsm.Host.Status.ErrorCount > 3 {
+				info.log.Info("Giving up on host cleanup after 3 attempts. The host may still be operational " +
+					"and cause issues in your clusters. You should clean it up manually now.")
 				return skipToDelete()
 			}
 		case actionError:
diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go
index cd894d853f..aa6760f5d7 100644
--- a/pkg/provisioner/ironic/ironic.go
+++ b/pkg/provisioner/ironic/ironic.go
@@ -1660,27 +1660,30 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result,
 		)
 
 	case nodes.CleanFail:
-		p.log.Info("cleaning failed")
+		if !force {
+			p.log.Info("cleaning failed", "lastError", ironicNode.LastError)
+			return operationFailed(fmt.Sprintf("Cleaning failed: %s", ironicNode.LastError))
+		}
+		p.log.Info("retrying cleaning")
 		if ironicNode.Maintenance {
-			p.log.Info("clearing maintenance flag")
+			p.log.Info("clearing maintenance flag", "maintenanceReason", ironicNode.MaintenanceReason)
 			return p.setMaintenanceFlag(ironicNode, false, "")
 		}
-		// This will return us to the manageable state without completing
-		// cleaning. Because cleaning happens in the process of moving from
-		// manageable to available, the node will still get cleaned before
-		// we provision it again.
+		// Move to manageable for retrying.
 		return p.changeNodeProvisionState(
 			ironicNode,
 			nodes.ProvisionStateOpts{Target: nodes.TargetManage},
 		)
 
 	case nodes.Manageable:
-		// We end up here after CleanFail. Because cleaning happens in the
-		// process of moving from manageable to available, the node will still
-		// get cleaned before we provision it again. Therefore, just declare
-		// deprovisioning complete.
-		p.log.Info("deprovisioning node is in manageable state")
-		return operationComplete()
+		// We end up here after CleanFail; retry cleaning. If a user
+		// wants to delete a host without cleaning, they can always set
+		// automatedCleaningMode: disabled.
+		p.log.Info("deprovisioning node is in manageable state", "automatedClean", ironicNode.AutomatedClean)
+		return p.changeNodeProvisionState(
+			ironicNode,
+			nodes.ProvisionStateOpts{Target: nodes.TargetProvide},
+		)
 
 	case nodes.Available:
 		p.publisher("DeprovisioningComplete", "Image deprovisioning completed")
@@ -1706,7 +1709,7 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result,
 		return operationContinuing(deprovisionRequeueDelay)
 
 	case nodes.Active, nodes.DeployFail, nodes.DeployWait:
-		p.log.Info("starting deprovisioning")
+		p.log.Info("starting deprovisioning", "automatedClean", ironicNode.AutomatedClean)
 		p.publisher("DeprovisioningStarted", "Image deprovisioning started")
 		return p.changeNodeProvisionState(
 			ironicNode,
diff --git a/pkg/provisioner/ironic/provision_test.go b/pkg/provisioner/ironic/provision_test.go
index de8366b877..bdd9683afe 100644
--- a/pkg/provisioner/ironic/provision_test.go
+++ b/pkg/provisioner/ironic/provision_test.go
@@ -237,16 +237,16 @@ func TestDeprovision(t *testing.T) {
 				ProvisionState: string(nodes.CleanFail),
 				UUID:           nodeUUID,
 			}),
-			expectedRequestAfter: 10,
-			expectedDirty:        true,
+			expectedErrorMessage: true,
 		},
 		{
 			name: "manageable state",
-			ironic: testserver.NewIronic(t).Ready().Node(nodes.Node{
+			ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
 				ProvisionState: string(nodes.Manageable),
 				UUID:           nodeUUID,
 			}),
-			expectedDirty: false,
+			expectedRequestAfter: 10,
+			expectedDirty:        true,
 		},
 	}