Skip to content

Commit

Permalink
Vsphere: Handle cloned instance with lost taskID
Browse files Browse the repository at this point in the history
This commit attempts to recover from a state when instance was
cloned the clone taskID was lost. In this state another clone
would be issued instead of poweron command.

TaskID can get lost when both machine status cannot be saved and
the infrastructure provider restarts.
  • Loading branch information
RadekManak committed Mar 26, 2024
1 parent 99b3ec4 commit 0a60232
Showing 1 changed file with 25 additions and 2 deletions.
27 changes: 25 additions & 2 deletions pkg/controller/vsphere/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,28 @@ func (r *Reconciler) create() error {
if !r.machineScope.session.IsVC() {
return fmt.Errorf("%v: not connected to a vCenter", r.machine.GetName())
}

// Attempt to power on instance in situation where we cloned the instance and lost taskRef.
if types.VirtualMachinePowerState(*r.machineScope.providerStatus.InstanceState) == types.VirtualMachinePowerStatePoweredOff {
klog.Infof("Powering on cloned machine without taskID: %v", r.machine.Name)
task, err := powerOn(r.machineScope)
if err != nil {
metrics.RegisterFailedInstanceCreate(&metrics.MachineLabels{
Name: r.machine.Name,
Namespace: r.machine.Namespace,
Reason: "PowerOn task finished with error",
})
conditionFailed := conditionFailed()
conditionFailed.Message = err.Error()
statusError := setProviderStatus(task, conditionFailed, r.machineScope, nil)
if statusError != nil {
return fmt.Errorf("failed to set provider status: %w", err)
}
return err
}
return setProviderStatus(task, conditionSuccess(), r.machineScope, nil)
}

klog.Infof("%v: cloning", r.machine.GetName())
task, err := clone(r.machineScope)
if err != nil {
Expand Down Expand Up @@ -287,7 +309,7 @@ func (r *Reconciler) exists() (bool, error) {
}

// Check if machine was powered on after clone.
// If it is powered off and in "Provisioning" phase, treat machine as non-existed yet and requeue for proceed
// If it is powered off and in "Provisioning" phase, treat machine as non-existed yet and proceed
// with creation procedure.
powerState := types.VirtualMachinePowerState(ptr.Deref(r.machineScope.providerStatus.InstanceState, ""))
if powerState == "" {
Expand All @@ -303,7 +325,8 @@ func (r *Reconciler) exists() (bool, error) {
}

if ptr.Deref(r.machine.Status.Phase, "") == machinev1.PhaseProvisioning && powerState == types.VirtualMachinePowerStatePoweredOff {
klog.Infof("%v: already exists, but was not powered on after clone, requeue ", r.machine.GetName())
klog.Infof("%v: already exists, but was not powered on after clone", r.machine.GetName())
r.machineScope.providerStatus.InstanceState = ptr.To(string(powerState))
return false, nil
}

Expand Down

0 comments on commit 0a60232

Please sign in to comment.