Merge pull request #1184 from dtantsur/always-clean
🐛 Always retry failed cleaning on deprovisioning (fixes #1182)
metal3-io-bot committed Feb 28, 2023
2 parents c71c7f0 + d78fb4d commit cf8c50a
Showing 3 changed files with 22 additions and 17 deletions.
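
In short: before this change, a node that hit clean failed during deprovisioning was parked in the manageable state and deprovisioning was declared complete, leaving a possibly un-wiped host behind. Now a forced deprovision retries cleaning (moving the node from manageable back to available re-runs automated cleaning), and the host state machine only gives up after more than three failed attempts. A minimal sketch of the resulting decision flow, using illustrative names rather than the repo's actual API:

package main

import "fmt"

// nextStep condenses the retry policy from this commit: clean failures are
// retried through the manageable -> available cycle (which re-runs automated
// cleaning) until the error count exceeds the cap in handleDeprovisioning.
func nextStep(provisionState string, force bool, errorCount int) string {
	if errorCount > 3 {
		return "give up and delete (host may need manual cleanup)"
	}
	switch provisionState {
	case "clean failed":
		if !force {
			return "report cleaning failure and back off"
		}
		return "target manageable" // clears the failed state
	case "manageable":
		return "target available" // re-triggers automated cleaning
	case "available":
		return "deprovisioning complete"
	default:
		return "requeue and wait"
	}
}

func main() {
	fmt.Println(nextStep("clean failed", true, 1)) // target manageable
	fmt.Println(nextStep("manageable", true, 1))   // target available
	fmt.Println(nextStep("clean failed", true, 4)) // give up and delete
}
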
2 changes: 2 additions & 0 deletions controllers/metal3.io/host_state_machine.go
@@ -517,6 +517,8 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes
 		// If the provisioner gives up deprovisioning and
 		// deletion has been requested, continue to delete.
 		if hsm.Host.Status.ErrorCount > 3 {
+			info.log.Info("Giving up on host clean up after 3 attempts. The host may still be operational " +
+				"and cause issues in your clusters. You should clean it up manually now.")
 			return skipToDelete()
 		}
 	case actionError:
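
The guard above reads Status.ErrorCount, which the controller increments on each failed action, so the new warning fires on the fourth failure. The threshold is inlined upstream; as a standalone check (the constant name here is hypothetical):

package main

import "fmt"

// maxCleaningAttempts mirrors the inlined "> 3" threshold in the guard above.
const maxCleaningAttempts = 3

func shouldSkipToDelete(errorCount int) bool {
	return errorCount > maxCleaningAttempts
}

func main() {
	fmt.Println(shouldSkipToDelete(3)) // false: keep retrying cleaning
	fmt.Println(shouldSkipToDelete(4)) // true: warn and skip to delete
}
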
29 changes: 16 additions & 13 deletions pkg/provisioner/ironic/ironic.go
@@ -1660,27 +1660,30 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result,
 		)
 
 	case nodes.CleanFail:
-		p.log.Info("cleaning failed")
+		if !force {
+			p.log.Info("cleaning failed", "lastError", ironicNode.LastError)
+			return operationFailed(fmt.Sprintf("Cleaning failed: %s", ironicNode.LastError))
+		}
+		p.log.Info("retrying cleaning")
 		if ironicNode.Maintenance {
-			p.log.Info("clearing maintenance flag")
+			p.log.Info("clearing maintenance flag", "maintenanceReason", ironicNode.MaintenanceReason)
 			return p.setMaintenanceFlag(ironicNode, false, "")
 		}
-		// This will return us to the manageable state without completing
-		// cleaning. Because cleaning happens in the process of moving from
-		// manageable to available, the node will still get cleaned before
-		// we provision it again.
+		// Move to manageable for retrying.
 		return p.changeNodeProvisionState(
 			ironicNode,
 			nodes.ProvisionStateOpts{Target: nodes.TargetManage},
 		)
 
 	case nodes.Manageable:
-		// We end up here after CleanFail. Because cleaning happens in the
-		// process of moving from manageable to available, the node will still
-		// get cleaned before we provision it again. Therefore, just declare
-		// deprovisioning complete.
-		p.log.Info("deprovisioning node is in manageable state")
-		return operationComplete()
+		// We end up here after CleanFail, retry cleaning. If a user
+		// wants to delete a host without cleaning, they can always set
+		// automatedCleaningMode: disabled.
+		p.log.Info("deprovisioning node is in manageable state", "automatedClean", ironicNode.AutomatedClean)
+		return p.changeNodeProvisionState(
+			ironicNode,
+			nodes.ProvisionStateOpts{Target: nodes.TargetProvide},
+		)
 
 	case nodes.Available:
 		p.publisher("DeprovisioningComplete", "Image deprovisioning completed")
@@ -1706,7 +1709,7 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result,
 		return operationContinuing(deprovisionRequeueDelay)
 
 	case nodes.Active, nodes.DeployFail, nodes.DeployWait:
-		p.log.Info("starting deprovisioning")
+		p.log.Info("starting deprovisioning", "automatedClean", ironicNode.AutomatedClean)
 		p.publisher("DeprovisioningStarted", "Image deprovisioning started")
 		return p.changeNodeProvisionState(
 			ironicNode,
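
The new Manageable branch re-triggers cleaning by targeting the available state (TargetProvide). As the in-code comment notes, users who want to delete a host without wiping it can opt out per host. A hedged sketch of that opt-out through the Go API, assuming the metal3.io/v1alpha1 types and import path; in YAML terms this is spec.automatedCleaningMode: disabled on the BareMetalHost:

package main

import (
	"fmt"

	metal3v1 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1"
)

func main() {
	host := &metal3v1.BareMetalHost{}
	// Skip Ironic's automated cleaning entirely for this host, so
	// deprovisioning never enters the clean/retry loop shown above.
	host.Spec.AutomatedCleaningMode = metal3v1.CleaningModeDisabled
	fmt.Println(host.Spec.AutomatedCleaningMode) // disabled
}
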
8 changes: 4 additions & 4 deletions pkg/provisioner/ironic/provision_test.go
Expand Up @@ -237,16 +237,16 @@ func TestDeprovision(t *testing.T) {
ProvisionState: string(nodes.CleanFail),
UUID: nodeUUID,
}),
expectedRequestAfter: 10,
expectedDirty: true,
expectedErrorMessage: true,
},
{
name: "manageable state",
ironic: testserver.NewIronic(t).Ready().Node(nodes.Node{
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
ProvisionState: string(nodes.Manageable),
UUID: nodeUUID,
}),
expectedDirty: false,
expectedRequestAfter: 10,
expectedDirty: true,
},
}

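The updated expectations track the new behavior: a non-forced deprovision from clean failed now reports an error message instead of requeueing, and the manageable state now produces another provision-state change (dirty, requeue after 10 seconds) instead of completing. A sketch of how those fields map onto a provisioner result; the struct below only assumes the real provisioner.Result carries Dirty, RequeueAfter, and ErrorMessage:

package main

import (
	"fmt"
	"time"
)

// result stands in for provisioner.Result (assumed field shapes).
type result struct {
	Dirty        bool
	RequeueAfter time.Duration
	ErrorMessage string
}

// check applies the expectations the table entries encode: expectedDirty,
// expectedRequestAfter (in seconds), and expectedErrorMessage.
func check(name string, got result, wantDirty bool, wantAfterSec int, wantErr bool) error {
	if got.Dirty != wantDirty {
		return fmt.Errorf("%s: Dirty = %v, want %v", name, got.Dirty, wantDirty)
	}
	if want := time.Duration(wantAfterSec) * time.Second; got.RequeueAfter != want {
		return fmt.Errorf("%s: RequeueAfter = %v, want %v", name, got.RequeueAfter, want)
	}
	if (got.ErrorMessage != "") != wantErr {
		return fmt.Errorf("%s: ErrorMessage = %q, want error message: %v", name, got.ErrorMessage, wantErr)
	}
	return nil
}

func main() {
	// The "manageable state" case now expects dirty plus a 10s requeue.
	got := result{Dirty: true, RequeueAfter: 10 * time.Second}
	fmt.Println(check("manageable state", got, true, 10, false)) // <nil>
}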
