[release-4.14] OCPBUGS-31428: CEO aliveness check should only detect deadlocks #1231

Merged
20 changes: 14 additions & 6 deletions pkg/operator/health/aliveness_checker.go
@@ -3,6 +3,7 @@ package health
import (
"runtime"
"sync"
"time"

"k8s.io/klog/v2"
)
@@ -15,6 +16,8 @@ type MultiAlivenessChecker struct {
m sync.Mutex
// name -> checker
checkerMap map[string]AlivenessChecker

lastPrintTime time.Time
}

func (r *MultiAlivenessChecker) Add(name string, c AlivenessChecker) {
@@ -31,11 +34,15 @@ func (r *MultiAlivenessChecker) Alive() bool {
for s, checker := range r.checkerMap {
if !checker.Alive() {
klog.Warningf("Controller [%s] didn't sync for a long time, declaring unhealthy and dumping stack", s)
// 12 mb should be enough for a full goroutine dump
buf := make([]byte, 1024*1024*12)
n := runtime.Stack(buf, true)
klog.Warningf("%s", buf[:n])

// we throttle this to once every 15m because dumping 12mb of logs for every probe failure is very expensive
if r.lastPrintTime.Add(time.Minute * 15).Before(time.Now()) {
// 12 mb should be enough for a full goroutine dump
buf := make([]byte, 1024*1024*12)
n := runtime.Stack(buf, true)
klog.Warningf("%s", buf[:n])
r.lastPrintTime = time.Now()
}
return false
}
}
@@ -45,7 +52,8 @@ func (r *MultiAlivenessChecker) Alive() bool {

func NewMultiAlivenessChecker() *MultiAlivenessChecker {
return &MultiAlivenessChecker{
m: sync.Mutex{},
checkerMap: make(map[string]AlivenessChecker),
m: sync.Mutex{},
checkerMap: make(map[string]AlivenessChecker),
lastPrintTime: time.UnixMilli(0),
}
}
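
For context on why the dump is throttled: a checker like this is typically served behind a container liveness probe, so a wedged controller means the probe keeps failing every few seconds until the kubelet restarts the pod, and dumping 12 MB of stack per failed probe would flood the logs. The sketch below shows one way such a checker can back a liveness endpoint; it is a self-contained illustration, and the handler path, port, and type names are assumptions rather than code from this repository.

// Minimal sketch (assumed wiring, not from this PR): expose an aggregate
// aliveness check as an HTTP liveness endpoint for a kubelet probe.
package main

import (
	"net/http"
)

// alivenessChecker mirrors the Alive() contract used by the operator's checker.
type alivenessChecker interface {
	Alive() bool
}

// alwaysAlive is a stand-in implementation so the sketch runs on its own.
type alwaysAlive struct{}

func (alwaysAlive) Alive() bool { return true }

// livenessHandler turns an aliveness check into an HTTP liveness endpoint.
func livenessHandler(checker alivenessChecker) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		if checker.Alive() {
			w.WriteHeader(http.StatusOK)
			return
		}
		// Repeated failures here eventually make the kubelet restart the pod,
		// which is the recovery path for a genuinely deadlocked controller.
		w.WriteHeader(http.StatusInternalServerError)
	}
}

func main() {
	http.Handle("/healthz", livenessHandler(alwaysAlive{}))
	_ = http.ListenAndServe(":8080", nil)
}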
6 changes: 3 additions & 3 deletions pkg/operator/health/checking_sync_wrapper.go
@@ -20,9 +20,9 @@ type CheckingSyncWrapper struct {

func (r *CheckingSyncWrapper) Sync(ctx context.Context, controllerContext factory.SyncContext) error {
err := r.syncFunc(ctx, controllerContext)
if err == nil {
atomic.StoreInt64(&r.lastSuccessfulRun, time.Now().UnixMilli())
}
// we store the time regardless of whether the sync succeeded, because some controllers error out while they have unhealthy members;
// what we actually want to detect is a deadlock, which would block the sync from returning at all.
atomic.StoreInt64(&r.lastSuccessfulRun, time.Now().UnixMilli())
return err
}

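The Alive() side of the wrapper is untouched by this change and therefore not shown in the diff. The sketch below illustrates the shape such a check plausibly has, given the lastSuccessfulRun field and the timeout passed to NewCheckingSyncWrapper in the tests; the type and field names here are illustrative stand-ins, not the repository's code.

package health

import (
	"sync/atomic"
	"time"
)

// checkingSyncWrapperSketch is an illustrative stand-in, not the repository's type.
type checkingSyncWrapperSketch struct {
	lastSuccessfulRun int64         // unix milliseconds of the last sync that returned
	alivenessTimeout  time.Duration // how long syncs may go without returning
}

// Alive reports whether a sync has returned within the timeout. Because Sync
// now stamps the time even when it returns an error, only a sync that never
// returns at all (a deadlock) can make this report false.
func (r *checkingSyncWrapperSketch) Alive() bool {
	last := time.UnixMilli(atomic.LoadInt64(&r.lastSuccessfulRun))
	return time.Since(last) <= r.alivenessTimeout
}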
6 changes: 3 additions & 3 deletions pkg/operator/health/checking_sync_wrapper_test.go
@@ -40,7 +40,7 @@ func TestHappyPathAliveness(t *testing.T) {

}

func TestErrorDoesNotUpdateSuccess(t *testing.T) {
func TestErrorDoesUpdatesSuccess(t *testing.T) {
syncer := NewCheckingSyncWrapper(func(ctx context.Context, controllerContext factory.SyncContext) error {
return fmt.Errorf("some")
}, 1*time.Second)
@@ -49,6 +49,6 @@ func TestErrorDoesNotUpdateSuccess(t *testing.T) {

err := syncer.Sync(context.Background(), nil)
require.Error(t, err)
require.Equal(t, int64(0), syncer.lastSuccessfulRun)
require.False(t, syncer.Alive())
require.NotEqual(t, int64(0), syncer.lastSuccessfulRun)
require.True(t, syncer.Alive())
}
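
The backport only flips the expectations of the existing error-path test. A sketch of a companion test for the deadlock case itself is shown below; it assumes the same imports and helpers as this test file, that the duration passed to NewCheckingSyncWrapper is the aliveness timeout, and that the wrapper reports not-alive until a sync returns, so treat it as an illustration rather than part of the PR.

// Illustrative sketch only, not part of this PR: a sync that never returns
// should eventually make the wrapper report not-alive.
func TestBlockedSyncReportsNotAlive(t *testing.T) {
	syncer := NewCheckingSyncWrapper(func(ctx context.Context, controllerContext factory.SyncContext) error {
		<-ctx.Done() // simulate a deadlocked sync that never returns on its own
		return ctx.Err()
	}, 50*time.Millisecond)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() { _ = syncer.Sync(ctx, nil) }()

	// Let the aliveness window pass while the sync is still stuck.
	time.Sleep(200 * time.Millisecond)
	require.False(t, syncer.Alive())
}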