Skip to content

Commit

Permalink
CO health: only track current in-progress upgrade start
Browse files Browse the repository at this point in the history
Keeping the original upgrade start time may cause CVO spuriously warn
about waiting on a CO for over 40 minutes, in a scenario like this:

1. CVO upgrades etcd / KAS very early in the upgrade, noting the time when it started to do that
2. These two COs upgrade successfuly and upgrade proceeds
3. Eventually cluster starts rebooting masters and etcd/KAS go degraded
4. CVO compares current time against the noted time, discovers its more than 40 minutes and starts warning about it.
  • Loading branch information
petr-muller committed Dec 22, 2023
1 parent baef28d commit 65a23de
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 4 deletions.
11 changes: 9 additions & 2 deletions pkg/cvo/internal/operatorstatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func (b *clusterOperatorBuilder) Do(ctx context.Context) error {
os := readClusterOperatorV1OrDie(b.raw)

// add cluster operator's start time if not already there
payload.COUpdateStartTimesEnsureName(os.Name)
payload.COUpdateStartTimesEnsure(os.Name)

if b.modifier != nil {
b.modifier(os)
Expand All @@ -120,7 +120,14 @@ func (b *clusterOperatorBuilder) Do(ctx context.Context) error {
return nil
}

return checkOperatorHealth(ctx, b.client, os, b.mode)
err := checkOperatorHealth(ctx, b.client, os, b.mode)
if err == nil {
// Operator is updated to desired version and healthy. These times are now only used to compute
// the "waiting 40 minutes" message, so once we succeed, keeping time when we *first* started
// to upgrade this CO makes no sense. We need to track when the current ongoing upgrade started.
payload.COUpdateStartTimesRemove(os.Name)
}
return err
}

func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, expected *configv1.ClusterOperator, mode resourcebuilder.Mode) error {
Expand Down
11 changes: 9 additions & 2 deletions pkg/payload/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,23 @@ func InitCOUpdateStartTimes() {
clusterOperatorUpdateStartTimes.lock.Unlock()
}

// COUpdateStartTimesEnsureName adds name to clusterOperatorUpdateStartTimes map and sets to
// COUpdateStartTimesEnsure adds name to clusterOperatorUpdateStartTimes map and sets to
// current time if name does not already exist in map.
func COUpdateStartTimesEnsureName(name string) {
func COUpdateStartTimesEnsure(name string) {
clusterOperatorUpdateStartTimes.lock.Lock()
if _, ok := clusterOperatorUpdateStartTimes.m[name]; !ok {
clusterOperatorUpdateStartTimes.m[name] = time.Now()
}
clusterOperatorUpdateStartTimes.lock.Unlock()
}

// COUpdateStartTimesRemove removes name from clusterOperatorUpdateStartTimes
func COUpdateStartTimesRemove(name string) {
clusterOperatorUpdateStartTimes.lock.Lock()
delete(clusterOperatorUpdateStartTimes.m, name)
clusterOperatorUpdateStartTimes.lock.Unlock()
}

// COUpdateStartTimesGet returns name's value from clusterOperatorUpdateStartTimes map.
func COUpdateStartTimesGet(name string) time.Time {
clusterOperatorUpdateStartTimes.lock.Lock()
Expand Down

0 comments on commit 65a23de

Please sign in to comment.