From 721bdd84ba40e9f743501a601bfc17742d3f0305 Mon Sep 17 00:00:00 2001 From: Philip Gough Date: Fri, 21 Jan 2022 09:27:52 +0000 Subject: [PATCH] Avoid problematic race condition on statefulset recreation We have observed an issue where, when the operator fails to update a StatefulSet and must delete and recreate it, a race condition can occur. The delete is a foreground cascading delete, which causes the StatefulSet to remain visible via the API server for an unknown amount of time. We have observed that, for all resources, the operator receives a multitude of update requests before the delete is finalized. This causes a hot loop of delete/recreate cycles which in some cases appears never to resolve without manual intervention (i.e. deleting the StatefulSet in the background). Since the foreground delete adds a deletionTimestamp to the existing spec, we can check for its existence to determine that we should not continue processing the update. While this change does not guarantee zero downtime for the pods owned by the StatefulSet, it should at least minimize downtime and not exacerbate the problem. 
Co-authored-by: Simon Pasquier --- pkg/alertmanager/operator.go | 21 ++++++++++++--------- pkg/alertmanager/statefulset.go | 4 ++++ pkg/prometheus/operator.go | 24 ++++++++++++++++++------ pkg/prometheus/statefulset.go | 4 ++++ pkg/thanos/operator.go | 18 ++++++++++++------ pkg/thanos/statefulset.go | 4 ++++ 6 files changed, 54 insertions(+), 21 deletions(-) diff --git a/pkg/alertmanager/operator.go b/pkg/alertmanager/operator.go index f75abc9d1b..e923aab30f 100644 --- a/pkg/alertmanager/operator.go +++ b/pkg/alertmanager/operator.go @@ -763,13 +763,20 @@ func (c *Operator) sync(ctx context.Context, key string) error { return errors.Wrap(err, "failed to retrieve statefulset") } - oldSpec := appsv1.StatefulSetSpec{} + existingStatefulSet := &appsv1.StatefulSet{} if obj != nil { - ss := obj.(*appsv1.StatefulSet) - oldSpec = ss.Spec + existingStatefulSet = obj.(*appsv1.StatefulSet) + if existingStatefulSet.DeletionTimestamp != nil { + level.Info(logger).Log( + "msg", "halting update of StatefulSet", + "reason", "resource has been marked for deletion", + "resource_name", existingStatefulSet.GetName(), + ) + return nil + } } - newSSetInputHash, err := createSSetInputHash(*am, c.config, tlsAssets, oldSpec) + newSSetInputHash, err := createSSetInputHash(*am, c.config, tlsAssets, existingStatefulSet.Spec) if err != nil { return err } @@ -782,11 +789,7 @@ func (c *Operator) sync(ctx context.Context, key string) error { ssetClient := c.kclient.AppsV1().StatefulSets(am.Namespace) - var oldSSetInputHash string - if obj != nil { - oldSSetInputHash = obj.(*appsv1.StatefulSet).ObjectMeta.Annotations[sSetInputHashName] - } - if newSSetInputHash == oldSSetInputHash { + if newSSetInputHash == existingStatefulSet.ObjectMeta.Annotations[sSetInputHashName] { level.Debug(logger).Log("msg", "new statefulset generation inputs match current, skipping any actions") return nil } diff --git a/pkg/alertmanager/statefulset.go b/pkg/alertmanager/statefulset.go index 940d7a5ad0..7e9cc2b630 100644 
--- a/pkg/alertmanager/statefulset.go +++ b/pkg/alertmanager/statefulset.go @@ -321,6 +321,10 @@ func makeStatefulSetSpec(a *monitoringv1.Alertmanager, config Config, tlsAssetSe podLabels := map[string]string{ "app.kubernetes.io/version": version.String(), } + // In cases where an existing selector label is modified, or a new one is added, new sts cannot match existing pods. + // We should try to avoid removing such immutable fields whenever possible since doing + // so forces us to enter the 'recreate cycle' and can potentially lead to downtime. + // The requirement to make a change here should be carefully evaluated. podSelectorLabels := map[string]string{ "app.kubernetes.io/name": "alertmanager", "app.kubernetes.io/managed-by": "prometheus-operator", diff --git a/pkg/prometheus/operator.go b/pkg/prometheus/operator.go index 0dd0758c13..f24e2d4956 100644 --- a/pkg/prometheus/operator.go +++ b/pkg/prometheus/operator.go @@ -1256,12 +1256,24 @@ func (c *Operator) sync(ctx context.Context, key string) error { return errors.Wrap(err, "retrieving statefulset failed") } - spec := appsv1.StatefulSetSpec{} + existingStatefulSet := &appsv1.StatefulSet{} if obj != nil { - ss := obj.(*appsv1.StatefulSet) - spec = ss.Spec + existingStatefulSet = obj.(*appsv1.StatefulSet) + if existingStatefulSet.DeletionTimestamp != nil { + // We want to avoid entering a hot-loop of update/delete cycles here since the sts was + // marked for deletion in foreground, which means it may take some time before the finalizers complete and + // the resource disappears from the API. The deletion timestamp will have been set when the initial + // delete request was issued. In that case, we avoid further processing. 
+ level.Info(logger).Log( + "msg", "halting update of StatefulSet", + "reason", "resource has been marked for deletion", + "resource_name", existingStatefulSet.GetName(), + ) + return nil + } } - newSSetInputHash, err := createSSetInputHash(*p, c.config, ruleConfigMapNames, tlsAssets, spec) + + newSSetInputHash, err := createSSetInputHash(*p, c.config, ruleConfigMapNames, tlsAssets, existingStatefulSet.Spec) if err != nil { return err } @@ -1281,8 +1293,7 @@ func (c *Operator) sync(ctx context.Context, key string) error { return nil } - oldSSetInputHash := obj.(*appsv1.StatefulSet).ObjectMeta.Annotations[sSetInputHashName] - if newSSetInputHash == oldSSetInputHash { + if newSSetInputHash == existingStatefulSet.ObjectMeta.Annotations[sSetInputHashName] { level.Debug(logger).Log("msg", "new statefulset generation inputs match current, skipping any actions") continue } @@ -1302,6 +1313,7 @@ func (c *Operator) sync(ctx context.Context, key string) error { } level.Info(logger).Log("msg", "recreating StatefulSet because the update operation wasn't possible", "reason", strings.Join(failMsg, ", ")) + propagationPolicy := metav1.DeletePropagationForeground if err := ssetClient.Delete(ctx, sset.GetName(), metav1.DeleteOptions{PropagationPolicy: &propagationPolicy}); err != nil { return errors.Wrap(err, "failed to delete StatefulSet to avoid forbidden action") diff --git a/pkg/prometheus/statefulset.go b/pkg/prometheus/statefulset.go index 6282bdf817..8981949810 100644 --- a/pkg/prometheus/statefulset.go +++ b/pkg/prometheus/statefulset.go @@ -655,6 +655,10 @@ func makeStatefulSetSpec(p monitoringv1.Prometheus, c *operator.Config, shard in podLabels := map[string]string{ "app.kubernetes.io/version": version.String(), } + // In cases where an existing selector label is modified, or a new one is added, new sts cannot match existing pods. 
+ // We should try to avoid removing such immutable fields whenever possible since doing + // so forces us to enter the 'recreate cycle' and can potentially lead to downtime. + // The requirement to make a change here should be carefully evaluated. podSelectorLabels := map[string]string{ "app.kubernetes.io/name": "prometheus", "app.kubernetes.io/managed-by": "prometheus-operator", diff --git a/pkg/thanos/operator.go b/pkg/thanos/operator.go index 5ca0ff4f63..904aabb14c 100644 --- a/pkg/thanos/operator.go +++ b/pkg/thanos/operator.go @@ -675,13 +675,20 @@ func (o *Operator) sync(ctx context.Context, key string) error { return nil } - spec := appsv1.StatefulSetSpec{} + existingStatefulSet := &appsv1.StatefulSet{} if obj != nil { - ss := obj.(*appsv1.StatefulSet) - spec = ss.Spec + existingStatefulSet = obj.(*appsv1.StatefulSet) + if existingStatefulSet.DeletionTimestamp != nil { + level.Info(logger).Log( + "msg", "halting update of StatefulSet", + "reason", "resource has been marked for deletion", + "resource_name", existingStatefulSet.GetName(), + ) + return nil + } } - newSSetInputHash, err := createSSetInputHash(*tr, o.config, ruleConfigMapNames, spec) + newSSetInputHash, err := createSSetInputHash(*tr, o.config, ruleConfigMapNames, existingStatefulSet.Spec) if err != nil { return err } @@ -693,8 +700,7 @@ func (o *Operator) sync(ctx context.Context, key string) error { operator.SanitizeSTS(sset) - oldSSetInputHash := obj.(*appsv1.StatefulSet).ObjectMeta.Annotations[sSetInputHashName] - if newSSetInputHash == oldSSetInputHash { + if newSSetInputHash == existingStatefulSet.ObjectMeta.Annotations[sSetInputHashName] { level.Debug(logger).Log("msg", "new statefulset generation inputs match current, skipping any actions") return nil } diff --git a/pkg/thanos/statefulset.go b/pkg/thanos/statefulset.go index cedafb53dc..90755bfee0 100644 --- a/pkg/thanos/statefulset.go +++ b/pkg/thanos/statefulset.go @@ -380,6 +380,10 @@ func makeStatefulSetSpec(tr 
*monitoringv1.ThanosRuler, config Config, ruleConfig } } } + // In cases where an existing selector label is modified, or a new one is added, new sts cannot match existing pods. + // We should try to avoid removing such immutable fields whenever possible since doing + // so forces us to enter the 'recreate cycle' and can potentially lead to downtime. + // The requirement to make a change here should be carefully evaluated. podLabels["app.kubernetes.io/name"] = thanosRulerLabel podLabels["app.kubernetes.io/managed-by"] = "prometheus-operator" podLabels["app.kubernetes.io/instance"] = tr.Name