Skip to content

Commit

Permalink
Merge pull request kubernetes-sigs#10177 from g-gaston/race-condition…
Browse files Browse the repository at this point in the history
…-machine-controller-release-1.6

[release-1.6] 🐛 Watch external objects for machine before deleting
  • Loading branch information
k8s-ci-robot committed Feb 20, 2024
2 parents fc43a06 + a64cd41 commit da795db
Show file tree
Hide file tree
Showing 3 changed files with 241 additions and 35 deletions.
24 changes: 16 additions & 8 deletions internal/controllers/machine/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
if err != nil {
switch err {
case errNoControlPlaneNodes, errLastControlPlaneNode, errNilNodeRef, errClusterIsBeingDeleted, errControlPlaneIsBeingDeleted:
var nodeName = ""
nodeName := ""
if m.Status.NodeRef != nil {
nodeName = m.Status.NodeRef.Name
}
Expand Down Expand Up @@ -427,7 +427,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
}

infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, m)
infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, cluster, m)
if err != nil {
return ctrl.Result{}, err
}
Expand All @@ -436,7 +436,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
return ctrl.Result{}, nil
}

bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, m)
bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, cluster, m)
if err != nil {
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -723,8 +723,8 @@ func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster,
return nil
}

func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, m, m.Spec.Bootstrap.ConfigRef)
func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, m.Spec.Bootstrap.ConfigRef)
if err != nil {
return false, err
}
Expand All @@ -743,8 +743,8 @@ func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.
return false, nil
}

func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, m, &m.Spec.InfrastructureRef)
func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, &m.Spec.InfrastructureRef)
if err != nil {
return false, err
}
Expand All @@ -764,7 +764,7 @@ func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clust
}

// reconcileDeleteExternal tries to delete external references.
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
if ref == nil {
return nil, nil
}
Expand All @@ -777,6 +777,14 @@ func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.M
}

if obj != nil {
// reconcileExternal ensures that we set the object's OwnerReferences correctly and watch the object.
// The machine delete logic depends on reconciling the machine when the external objects are deleted.
// This avoids a race condition where the machine is deleted before the external objects are ever reconciled
// by this controller.
if _, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref); err != nil {
return nil, err
}

// Issue a delete request.
if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
return obj, errors.Wrapf(err,
Expand Down
56 changes: 35 additions & 21 deletions internal/controllers/machine/machine_controller_phases.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,7 @@ import (
"sigs.k8s.io/cluster-api/util/patch"
)

var (
externalReadyWait = 30 * time.Second
)
var externalReadyWait = 30 * time.Second

func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {
originalPhase := m.Status.Phase
Expand Down Expand Up @@ -89,12 +87,44 @@ func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {

// reconcileExternal handles generic unstructured objects referenced by a Machine.
func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
log := ctrl.LoggerFrom(ctx)

if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil {
return external.ReconcileOutput{}, err
}

result, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref)
if err != nil {
return external.ReconcileOutput{}, err
}
if result.RequeueAfter > 0 || result.Paused {
return result, nil
}

obj := result.Result

// Set failure reason and message, if any.
failureReason, failureMessage, err := external.FailuresFrom(obj)
if err != nil {
return external.ReconcileOutput{}, err
}
if failureReason != "" {
machineStatusError := capierrors.MachineStatusError(failureReason)
m.Status.FailureReason = &machineStatusError
}
if failureMessage != "" {
m.Status.FailureMessage = pointer.String(
fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s",
obj.GroupVersionKind(), obj.GetName(), failureMessage),
)
}

return external.ReconcileOutput{Result: obj}, nil
}

// ensureExternalOwnershipAndWatch ensures that only the Machine owns the external object,
// adds a watch to the external object if one does not already exist and adds the necessary labels.
func (r *Reconciler) ensureExternalOwnershipAndWatch(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
log := ctrl.LoggerFrom(ctx)

obj, err := external.Get(ctx, r.UnstructuredCachingClient, ref, m.Namespace)
if err != nil {
if apierrors.IsNotFound(errors.Cause(err)) {
Expand Down Expand Up @@ -146,22 +176,6 @@ func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.C
return external.ReconcileOutput{}, err
}

// Set failure reason and message, if any.
failureReason, failureMessage, err := external.FailuresFrom(obj)
if err != nil {
return external.ReconcileOutput{}, err
}
if failureReason != "" {
machineStatusError := capierrors.MachineStatusError(failureReason)
m.Status.FailureReason = &machineStatusError
}
if failureMessage != "" {
m.Status.FailureMessage = pointer.String(
fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s",
obj.GroupVersionKind(), obj.GetName(), failureMessage),
)
}

return external.ReconcileOutput{Result: obj}, nil
}

Expand Down

0 comments on commit da795db

Please sign in to comment.