OCPBUGS-30873: CEO aliveness check should only detect deadlocks
Currently we only detect whether a controller has been continuously
running into errors, whereas we actually want to detect real deadlock
situations. This change relaxes the aliveness check so that only
genuine locking situations are declared problematic.

Additionally, to avoid creating excessive amounts of log traffic, this
change throttles the stack dumping to once every 15 minutes. Previously
the dump would trigger on almost every health probe invocation and
create multi-megabyte log spam.

Signed-off-by: Thomas Jungblut <tjungblu@redhat.com>
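
For context, the sketch below shows one way Alive() can back a Kubernetes liveness probe over HTTP. This wiring is not part of this commit; the endpoint path, port, and import path are assumptions for illustration only.

// Illustrative only: expose MultiAlivenessChecker.Alive() as a liveness endpoint.
// The import path, controller name, and port are assumptions, not taken from this commit.
package main

import (
	"net/http"

	"github.com/openshift/cluster-etcd-operator/pkg/operator/health"
)

func main() {
	checker := health.NewMultiAlivenessChecker()
	// controllers register their sync wrappers here, e.g.:
	// checker.Add("EtcdEndpointsController", syncWrapper)

	http.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
		if checker.Alive() {
			w.WriteHeader(http.StatusOK)
			return
		}
		// a repeatedly failing probe makes the kubelet restart the container
		w.WriteHeader(http.StatusInternalServerError)
	})
	_ = http.ListenAndServe(":9980", nil)
}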
tjungblu authored and openshift-cherrypick-robot committed Mar 27, 2024
1 parent c40d710 commit 4517a47
Showing 3 changed files with 20 additions and 12 deletions.
20 changes: 14 additions & 6 deletions pkg/operator/health/aliveness_checker.go
@@ -3,6 +3,7 @@ package health
 import (
 	"runtime"
 	"sync"
+	"time"

 	"k8s.io/klog/v2"
 )
@@ -15,6 +16,8 @@ type MultiAlivenessChecker struct {
 	m sync.Mutex
 	// name -> checker
 	checkerMap map[string]AlivenessChecker
+
+	lastPrintTime time.Time
 }

 func (r *MultiAlivenessChecker) Add(name string, c AlivenessChecker) {
@@ -31,11 +34,15 @@ func (r *MultiAlivenessChecker) Alive() bool {
 	for s, checker := range r.checkerMap {
 		if !checker.Alive() {
 			klog.Warningf("Controller [%s] didn't sync for a long time, declaring unhealthy and dumping stack", s)
-			// 12 mb should be enough for a full goroutine dump
-			buf := make([]byte, 1024*1024*12)
-			n := runtime.Stack(buf, true)
-			klog.Warningf("%s", buf[:n])

+			// we throttle this to once every 15m because dumping 12mb of logs for every probe failure is very expensive
+			if r.lastPrintTime.Add(time.Minute * 15).Before(time.Now()) {
+				// 12 mb should be enough for a full goroutine dump
+				buf := make([]byte, 1024*1024*12)
+				n := runtime.Stack(buf, true)
+				klog.Warningf("%s", buf[:n])
+				r.lastPrintTime = time.Now()
+			}
 			return false
 		}
 	}
@@ -45,7 +52,8 @@ func (r *MultiAlivenessChecker) Alive() bool {

 func NewMultiAlivenessChecker() *MultiAlivenessChecker {
 	return &MultiAlivenessChecker{
-		m:          sync.Mutex{},
-		checkerMap: make(map[string]AlivenessChecker),
+		m:             sync.Mutex{},
+		checkerMap:    make(map[string]AlivenessChecker),
+		lastPrintTime: time.UnixMilli(0),
 	}
 }
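
The 15-minute guard above follows a common throttling pattern: remember when the expensive action last ran and skip it while the interval has not elapsed. A minimal, self-contained sketch of that pattern (names here are illustrative, not from the repository):

package main

import (
	"fmt"
	"time"
)

// throttled runs fn at most once per interval; a lastRun of the Unix epoch means
// "long ago", so the first call is allowed, mirroring the time.UnixMilli(0)
// initialization above.
type throttled struct {
	lastRun  time.Time
	interval time.Duration
}

func (t *throttled) maybeRun(fn func()) {
	if t.lastRun.Add(t.interval).Before(time.Now()) {
		fn()
		t.lastRun = time.Now()
	}
}

func main() {
	dumper := &throttled{lastRun: time.UnixMilli(0), interval: 15 * time.Minute}
	dumper.maybeRun(func() { fmt.Println("dumping goroutines") }) // runs: last run was long ago
	dumper.maybeRun(func() { fmt.Println("dumping goroutines") }) // skipped: within the 15m window
}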
6 changes: 3 additions & 3 deletions pkg/operator/health/checking_sync_wrapper.go
@@ -20,9 +20,9 @@ type CheckingSyncWrapper struct {

 func (r *CheckingSyncWrapper) Sync(ctx context.Context, controllerContext factory.SyncContext) error {
 	err := r.syncFunc(ctx, controllerContext)
-	if err == nil {
-		atomic.StoreInt64(&r.lastSuccessfulRun, time.Now().UnixMilli())
-	}
+	// we store the time regardless of success of the sync because some controllers error out when they have unhealthy members,
+	// whereas we actually want to detect deadlocks - which would entirely block the sync from returning.
+	atomic.StoreInt64(&r.lastSuccessfulRun, time.Now().UnixMilli())
 	return err
 }

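Alive() itself is not part of this diff. Judging from the threshold passed to NewCheckingSyncWrapper in the tests below, it presumably treats the wrapper as alive while the last recorded sync return is within that threshold. A hedged, self-contained sketch of that assumed check (field and method names are illustrative, not the actual implementation):

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// Assumed shape only: mirrors the fields implied by the diff and its tests,
// not the real CheckingSyncWrapper.
type syncLiveness struct {
	lastRun           int64         // unix milliseconds, stored on every sync return
	livenessThreshold time.Duration // e.g. the 1*time.Second used in the tests
}

// alive reports whether the sync returned (with or without error) recently enough;
// only a sync that never returns, i.e. a deadlock, would make this go false.
func (s *syncLiveness) alive() bool {
	last := atomic.LoadInt64(&s.lastRun)
	if last == 0 {
		return false // never ran yet
	}
	return time.Since(time.UnixMilli(last)) <= s.livenessThreshold
}

func main() {
	s := &syncLiveness{livenessThreshold: time.Second}
	atomic.StoreInt64(&s.lastRun, time.Now().UnixMilli())
	fmt.Println(s.alive()) // true: the sync just returned
}
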
6 changes: 3 additions & 3 deletions pkg/operator/health/checking_sync_wrapper_test.go
@@ -40,7 +40,7 @@ func TestHappyPathAliveness(t *testing.T) {

 }

-func TestErrorDoesNotUpdateSuccess(t *testing.T) {
+func TestErrorDoesUpdateSuccess(t *testing.T) {
 	syncer := NewCheckingSyncWrapper(func(ctx context.Context, controllerContext factory.SyncContext) error {
 		return fmt.Errorf("some")
 	}, 1*time.Second)
@@ -49,6 +49,6 @@ func TestErrorDoesNotUpdateSuccess(t *testing.T) {

 	err := syncer.Sync(context.Background(), nil)
 	require.Error(t, err)
-	require.Equal(t, int64(0), syncer.lastSuccessfulRun)
-	require.False(t, syncer.Alive())
+	require.NotEqual(t, int64(0), syncer.lastSuccessfulRun)
+	require.True(t, syncer.Alive())
 }
