Skip to content

Commit

Permalink
CO health: only track current in-progress upgrade start
Browse files Browse the repository at this point in the history
Keeping the original upgrade start time may cause CVO spuriously warn
about waiting on a CO for over 40 minutes, in a scenario like this:

1. CVO upgrades etcd / KAS very early in the upgrade, noting the time when it started to do that
2. These two COs upgrade successfuly and upgrade proceeds
3. Eventually cluster starts rebooting masters and etcd/KAS go degraded
4. CVO compares current time against the noted time, discovers its more than 40 minutes and starts warning about it.
  • Loading branch information
petr-muller authored and openshift-cherrypick-robot committed Jan 18, 2024
1 parent 2454e67 commit caa63e1
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 4 deletions.
11 changes: 9 additions & 2 deletions pkg/cvo/internal/operatorstatus.go
Expand Up @@ -99,7 +99,7 @@ func (b *clusterOperatorBuilder) Do(ctx context.Context) error {
co := readClusterOperatorV1OrDie(b.raw)

// add cluster operator's start time if not already there
payload.COUpdateStartTimesEnsureName(co.Name)
payload.COUpdateStartTimesEnsure(co.Name)

if b.modifier != nil {
b.modifier(co)
Expand Down Expand Up @@ -144,7 +144,14 @@ func (b *clusterOperatorBuilder) Do(ctx context.Context) error {
}
}

return checkOperatorHealth(ctx, b.client, co, b.mode)
err := checkOperatorHealth(ctx, b.client, co, b.mode)
if err == nil {
// Operator is updated to desired version and healthy. These times are now only used to compute
// the "waiting 40 minutes" message, so once we succeed, keeping time when we *first* started
// to upgrade this CO makes no sense. We need to track when the current ongoing upgrade started.
payload.COUpdateStartTimesRemove(co.Name)
}
return err
}

func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, expected *configv1.ClusterOperator, mode resourcebuilder.Mode) error {
Expand Down
11 changes: 9 additions & 2 deletions pkg/payload/task.go
Expand Up @@ -44,16 +44,23 @@ func InitCOUpdateStartTimes() {
clusterOperatorUpdateStartTimes.lock.Unlock()
}

// COUpdateStartTimesEnsureName adds name to clusterOperatorUpdateStartTimes map and sets to
// COUpdateStartTimesEnsure adds name to clusterOperatorUpdateStartTimes map and sets to
// current time if name does not already exist in map.
func COUpdateStartTimesEnsureName(name string) {
func COUpdateStartTimesEnsure(name string) {
clusterOperatorUpdateStartTimes.lock.Lock()
if _, ok := clusterOperatorUpdateStartTimes.m[name]; !ok {
clusterOperatorUpdateStartTimes.m[name] = time.Now()
}
clusterOperatorUpdateStartTimes.lock.Unlock()
}

// COUpdateStartTimesRemove removes name from clusterOperatorUpdateStartTimes
func COUpdateStartTimesRemove(name string) {
clusterOperatorUpdateStartTimes.lock.Lock()
delete(clusterOperatorUpdateStartTimes.m, name)
clusterOperatorUpdateStartTimes.lock.Unlock()
}

// COUpdateStartTimesGet returns name's value from clusterOperatorUpdateStartTimes map.
func COUpdateStartTimesGet(name string) time.Time {
clusterOperatorUpdateStartTimes.lock.Lock()
Expand Down

0 comments on commit caa63e1

Please sign in to comment.