Skip to content

Commit

Permalink
Merge pull request #152 from PhilipGough/4.10-cp
Browse files Browse the repository at this point in the history
Bug 2030539: Address race condition in recreate flow for statefulset
  • Loading branch information
openshift-merge-robot committed Jan 28, 2022
2 parents f176cb1 + 721bdd8 commit 53d6d76
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 21 deletions.
21 changes: 12 additions & 9 deletions pkg/alertmanager/operator.go
Expand Up @@ -763,13 +763,20 @@ func (c *Operator) sync(ctx context.Context, key string) error {
return errors.Wrap(err, "failed to retrieve statefulset")
}

oldSpec := appsv1.StatefulSetSpec{}
existingStatefulSet := &appsv1.StatefulSet{}
if obj != nil {
ss := obj.(*appsv1.StatefulSet)
oldSpec = ss.Spec
existingStatefulSet = obj.(*appsv1.StatefulSet)
if existingStatefulSet.DeletionTimestamp != nil {
level.Info(logger).Log(
"msg", "halting update of StatefulSet",
"reason", "resource has been marked for deletion",
"resource_name", existingStatefulSet.GetName(),
)
return nil
}
}

newSSetInputHash, err := createSSetInputHash(*am, c.config, tlsAssets, oldSpec)
newSSetInputHash, err := createSSetInputHash(*am, c.config, tlsAssets, existingStatefulSet.Spec)
if err != nil {
return err
}
Expand All @@ -782,11 +789,7 @@ func (c *Operator) sync(ctx context.Context, key string) error {

ssetClient := c.kclient.AppsV1().StatefulSets(am.Namespace)

var oldSSetInputHash string
if obj != nil {
oldSSetInputHash = obj.(*appsv1.StatefulSet).ObjectMeta.Annotations[sSetInputHashName]
}
if newSSetInputHash == oldSSetInputHash {
if newSSetInputHash == existingStatefulSet.ObjectMeta.Annotations[sSetInputHashName] {
level.Debug(logger).Log("msg", "new statefulset generation inputs match current, skipping any actions")
return nil
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/alertmanager/statefulset.go
Expand Up @@ -321,6 +321,10 @@ func makeStatefulSetSpec(a *monitoringv1.Alertmanager, config Config, tlsAssetSe
podLabels := map[string]string{
"app.kubernetes.io/version": version.String(),
}
// In cases where an existing selector label is modified, or a new one is added, new sts cannot match existing pods.
// We should try to avoid removing such immutable fields whenever possible since doing
// so forces us to enter the 'recreate cycle' and can potentially lead to downtime.
// The requirement to make a change here should be carefully evaluated.
podSelectorLabels := map[string]string{
"app.kubernetes.io/name": "alertmanager",
"app.kubernetes.io/managed-by": "prometheus-operator",
Expand Down
24 changes: 18 additions & 6 deletions pkg/prometheus/operator.go
Expand Up @@ -1256,12 +1256,24 @@ func (c *Operator) sync(ctx context.Context, key string) error {
return errors.Wrap(err, "retrieving statefulset failed")
}

spec := appsv1.StatefulSetSpec{}
existingStatefulSet := &appsv1.StatefulSet{}
if obj != nil {
ss := obj.(*appsv1.StatefulSet)
spec = ss.Spec
existingStatefulSet = obj.(*appsv1.StatefulSet)
if existingStatefulSet.DeletionTimestamp != nil {
// We want to avoid entering a hot-loop of update/delete cycles here since the sts was
// marked for deletion in foreground, which means it may take some time before the finalizers complete and
// the resource disappears from the API. The deletion timestamp will have been set when the initial
// delete request was issued. In that case, we avoid further processing.
level.Info(logger).Log(
"msg", "halting update of StatefulSet",
"reason", "resource has been marked for deletion",
"resource_name", existingStatefulSet.GetName(),
)
return nil
}
}
newSSetInputHash, err := createSSetInputHash(*p, c.config, ruleConfigMapNames, tlsAssets, spec)

newSSetInputHash, err := createSSetInputHash(*p, c.config, ruleConfigMapNames, tlsAssets, existingStatefulSet.Spec)
if err != nil {
return err
}
Expand All @@ -1281,8 +1293,7 @@ func (c *Operator) sync(ctx context.Context, key string) error {
return nil
}

oldSSetInputHash := obj.(*appsv1.StatefulSet).ObjectMeta.Annotations[sSetInputHashName]
if newSSetInputHash == oldSSetInputHash {
if newSSetInputHash == existingStatefulSet.ObjectMeta.Annotations[sSetInputHashName] {
level.Debug(logger).Log("msg", "new statefulset generation inputs match current, skipping any actions")
continue
}
Expand All @@ -1302,6 +1313,7 @@ func (c *Operator) sync(ctx context.Context, key string) error {
}

level.Info(logger).Log("msg", "recreating StatefulSet because the update operation wasn't possible", "reason", strings.Join(failMsg, ", "))

propagationPolicy := metav1.DeletePropagationForeground
if err := ssetClient.Delete(ctx, sset.GetName(), metav1.DeleteOptions{PropagationPolicy: &propagationPolicy}); err != nil {
return errors.Wrap(err, "failed to delete StatefulSet to avoid forbidden action")
Expand Down
4 changes: 4 additions & 0 deletions pkg/prometheus/statefulset.go
Expand Up @@ -655,6 +655,10 @@ func makeStatefulSetSpec(p monitoringv1.Prometheus, c *operator.Config, shard in
podLabels := map[string]string{
"app.kubernetes.io/version": version.String(),
}
// In cases where an existing selector label is modified, or a new one is added, new sts cannot match existing pods.
// We should try to avoid removing such immutable fields whenever possible since doing
// so forces us to enter the 'recreate cycle' and can potentially lead to downtime.
// The requirement to make a change here should be carefully evaluated.
podSelectorLabels := map[string]string{
"app.kubernetes.io/name": "prometheus",
"app.kubernetes.io/managed-by": "prometheus-operator",
Expand Down
18 changes: 12 additions & 6 deletions pkg/thanos/operator.go
Expand Up @@ -675,13 +675,20 @@ func (o *Operator) sync(ctx context.Context, key string) error {
return nil
}

spec := appsv1.StatefulSetSpec{}
existingStatefulSet := &appsv1.StatefulSet{}
if obj != nil {
ss := obj.(*appsv1.StatefulSet)
spec = ss.Spec
existingStatefulSet = obj.(*appsv1.StatefulSet)
if existingStatefulSet.DeletionTimestamp != nil {
level.Info(logger).Log(
"msg", "halting update of StatefulSet",
"reason", "resource has been marked for deletion",
"resource_name", existingStatefulSet.GetName(),
)
return nil
}
}

newSSetInputHash, err := createSSetInputHash(*tr, o.config, ruleConfigMapNames, spec)
newSSetInputHash, err := createSSetInputHash(*tr, o.config, ruleConfigMapNames, existingStatefulSet.Spec)
if err != nil {
return err
}
Expand All @@ -693,8 +700,7 @@ func (o *Operator) sync(ctx context.Context, key string) error {

operator.SanitizeSTS(sset)

oldSSetInputHash := obj.(*appsv1.StatefulSet).ObjectMeta.Annotations[sSetInputHashName]
if newSSetInputHash == oldSSetInputHash {
if newSSetInputHash == existingStatefulSet.ObjectMeta.Annotations[sSetInputHashName] {
level.Debug(logger).Log("msg", "new statefulset generation inputs match current, skipping any actions")
return nil
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/thanos/statefulset.go
Expand Up @@ -380,6 +380,10 @@ func makeStatefulSetSpec(tr *monitoringv1.ThanosRuler, config Config, ruleConfig
}
}
}
// In cases where an existing selector label is modified, or a new one is added, new sts cannot match existing pods.
// We should try to avoid removing such immutable fields whenever possible since doing
// so forces us to enter the 'recreate cycle' and can potentially lead to downtime.
// The requirement to make a change here should be carefully evaluated.
podLabels["app.kubernetes.io/name"] = thanosRulerLabel
podLabels["app.kubernetes.io/managed-by"] = "prometheus-operator"
podLabels["app.kubernetes.io/instance"] = tr.Name
Expand Down

0 comments on commit 53d6d76

Please sign in to comment.