OCPBUGS-30873: CEO aliveness check should only detect deadlocks
Currently we only detect whether a controller has been continuously
running into errors, whereas we actually want to detect real deadlock
situations. This change relaxes the aliveness check so that only
genuine locking situations are declared problematic.

Additionally, to avoid creating excessive amounts of log traffic, this
change throttles the stack dumping to once every 15 minutes. Previously
the dump would trigger on almost every health probe invocation and
create multi-megabyte log spam.

Signed-off-by: Thomas Jungblut <tjungblu@redhat.com>
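
For context, the sketch below shows one way Alive() can back a Kubernetes liveness probe over HTTP. This wiring is not part of this commit; the endpoint path, port, and import path are assumptions for illustration only.

// Illustrative only: expose MultiAlivenessChecker.Alive() as a liveness endpoint.
// The import path, controller name, and port are assumptions, not taken from this commit.
package main

import (
	"net/http"

	"github.com/openshift/cluster-etcd-operator/pkg/operator/health"
)

func main() {
	checker := health.NewMultiAlivenessChecker()
	// controllers register their sync wrappers here, e.g.:
	// checker.Add("EtcdEndpointsController", syncWrapper)

	http.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
		if checker.Alive() {
			w.WriteHeader(http.StatusOK)
			return
		}
		// a repeatedly failing probe makes the kubelet restart the container
		w.WriteHeader(http.StatusInternalServerError)
	})
	_ = http.ListenAndServe(":9980", nil)
}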
tjungblu authored and openshift-cherrypick-robot committed Mar 27, 2024
1 parent c40d710 commit 4517a47
Showing 3 changed files with 20 additions and 12 deletions.
20 changes: 14 additions & 6 deletions pkg/operator/health/aliveness_checker.go
@@ -3,6 +3,7 @@ package health
 import (
 	"runtime"
 	"sync"
+	"time"

 	"k8s.io/klog/v2"
 )
@@ -15,6 +16,8 @@ type MultiAlivenessChecker struct {
 	m sync.Mutex
 	// name -> checker
 	checkerMap map[string]AlivenessChecker
+
+	lastPrintTime time.Time
 }

 func (r *MultiAlivenessChecker) Add(name string, c AlivenessChecker) {
@@ -31,11 +34,15 @@ func (r *MultiAlivenessChecker) Alive() bool {
 	for s, checker := range r.checkerMap {
 		if !checker.Alive() {
 			klog.Warningf("Controller [%s] didn't sync for a long time, declaring unhealthy and dumping stack", s)
-			// 12 mb should be enough for a full goroutine dump
-			buf := make([]byte, 1024*1024*12)
-			n := runtime.Stack(buf, true)
-			klog.Warningf("%s", buf[:n])

+			// we throttle this to once every 15m because dumping 12mb of logs for every probe failure is very expensive
+			if r.lastPrintTime.Add(time.Minute * 15).Before(time.Now()) {
+				// 12 mb should be enough for a full goroutine dump
+				buf := make([]byte, 1024*1024*12)
+				n := runtime.Stack(buf, true)
+				klog.Warningf("%s", buf[:n])
+				r.lastPrintTime = time.Now()
+			}
 			return false
 		}
 	}
@@ -45,7 +52,8 @@ func (r *MultiAlivenessChecker) Alive() bool {

 func NewMultiAlivenessChecker() *MultiAlivenessChecker {
 	return &MultiAlivenessChecker{
-		m:          sync.Mutex{},
-		checkerMap: make(map[string]AlivenessChecker),
+		m:             sync.Mutex{},
+		checkerMap:    make(map[string]AlivenessChecker),
+		lastPrintTime: time.UnixMilli(0),
 	}
 }
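
The 15-minute guard above follows a common throttling pattern: remember when the expensive action last ran and skip it while the interval has not elapsed. A minimal, self-contained sketch of that pattern (names here are illustrative, not from the repository):

package main

import (
	"fmt"
	"time"
)

// throttled runs fn at most once per interval; a lastRun of the Unix epoch means
// "long ago", so the first call is allowed, mirroring the time.UnixMilli(0)
// initialization above.
type throttled struct {
	lastRun  time.Time
	interval time.Duration
}

func (t *throttled) maybeRun(fn func()) {
	if t.lastRun.Add(t.interval).Before(time.Now()) {
		fn()
		t.lastRun = time.Now()
	}
}

func main() {
	dumper := &throttled{lastRun: time.UnixMilli(0), interval: 15 * time.Minute}
	dumper.maybeRun(func() { fmt.Println("dumping goroutines") }) // runs: last run was long ago
	dumper.maybeRun(func() { fmt.Println("dumping goroutines") }) // skipped: within the 15m window
}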
6 changes: 3 additions & 3 deletions pkg/operator/health/checking_sync_wrapper.go
@@ -20,9 +20,9 @@ type CheckingSyncWrapper struct {

 func (r *CheckingSyncWrapper) Sync(ctx context.Context, controllerContext factory.SyncContext) error {
 	err := r.syncFunc(ctx, controllerContext)
-	if err == nil {
-		atomic.StoreInt64(&r.lastSuccessfulRun, time.Now().UnixMilli())
-	}
+	// we store the time regardless of success of the sync because some controllers error out when they have unhealthy members,
+	// whereas we actually want to detect deadlocks - which would entirely block the sync from returning.
+	atomic.StoreInt64(&r.lastSuccessfulRun, time.Now().UnixMilli())
 	return err
 }

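Alive() itself is not part of this diff. Judging from the threshold passed to NewCheckingSyncWrapper in the tests below, it presumably treats the wrapper as alive while the last recorded sync return is within that threshold. A hedged, self-contained sketch of that assumed check (field and method names are illustrative, not the actual implementation):

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// Assumed shape only: mirrors the fields implied by the diff and its tests,
// not the real CheckingSyncWrapper.
type syncLiveness struct {
	lastRun           int64         // unix milliseconds, stored on every sync return
	livenessThreshold time.Duration // e.g. the 1*time.Second used in the tests
}

// alive reports whether the sync returned (with or without error) recently enough;
// only a sync that never returns, i.e. a deadlock, would make this go false.
func (s *syncLiveness) alive() bool {
	last := atomic.LoadInt64(&s.lastRun)
	if last == 0 {
		return false // never ran yet
	}
	return time.Since(time.UnixMilli(last)) <= s.livenessThreshold
}

func main() {
	s := &syncLiveness{livenessThreshold: time.Second}
	atomic.StoreInt64(&s.lastRun, time.Now().UnixMilli())
	fmt.Println(s.alive()) // true: the sync just returned
}
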
6 changes: 3 additions & 3 deletions pkg/operator/health/checking_sync_wrapper_test.go
@@ -40,7 +40,7 @@ func TestHappyPathAliveness(t *testing.T) {

 }

-func TestErrorDoesNotUpdateSuccess(t *testing.T) {
+func TestErrorDoesUpdateSuccess(t *testing.T) {
 	syncer := NewCheckingSyncWrapper(func(ctx context.Context, controllerContext factory.SyncContext) error {
 		return fmt.Errorf("some")
 	}, 1*time.Second)
@@ -49,6 +49,6 @@ func TestErrorDoesNotUpdateSuccess(t *testing.T) {

 	err := syncer.Sync(context.Background(), nil)
 	require.Error(t, err)
-	require.Equal(t, int64(0), syncer.lastSuccessfulRun)
-	require.False(t, syncer.Alive())
+	require.NotEqual(t, int64(0), syncer.lastSuccessfulRun)
+	require.True(t, syncer.Alive())
 }
