[release-4.14] OCPBUGS-31428: CEO aliveness check should only detect deadlocks #1231

Merged
20 changes: 14 additions & 6 deletions pkg/operator/health/aliveness_checker.go
@@ -3,6 +3,7 @@ package health
import (
"runtime"
"sync"
"time"

"k8s.io/klog/v2"
)
@@ -15,6 +16,8 @@ type MultiAlivenessChecker struct {
m sync.Mutex
// name -> checker
checkerMap map[string]AlivenessChecker

lastPrintTime time.Time
}

func (r *MultiAlivenessChecker) Add(name string, c AlivenessChecker) {
@@ -31,11 +34,15 @@ func (r *MultiAlivenessChecker) Alive() bool {
for s, checker := range r.checkerMap {
if !checker.Alive() {
klog.Warningf("Controller [%s] didn't sync for a long time, declaring unhealthy and dumping stack", s)
// 12 mb should be enough for a full goroutine dump
buf := make([]byte, 1024*1024*12)
n := runtime.Stack(buf, true)
klog.Warningf("%s", buf[:n])

// we throttle this to once every 15m because dumping 12mb of logs for every probe failure is very expensive
if r.lastPrintTime.Add(time.Minute * 15).Before(time.Now()) {
// 12 mb should be enough for a full goroutine dump
buf := make([]byte, 1024*1024*12)
n := runtime.Stack(buf, true)
klog.Warningf("%s", buf[:n])
r.lastPrintTime = time.Now()
}
return false
}
}
@@ -45,7 +52,8 @@ func (r *MultiAlivenessChecker) Alive() bool {

func NewMultiAlivenessChecker() *MultiAlivenessChecker {
return &MultiAlivenessChecker{
m: sync.Mutex{},
checkerMap: make(map[string]AlivenessChecker),
m: sync.Mutex{},
checkerMap: make(map[string]AlivenessChecker),
lastPrintTime: time.UnixMilli(0),
}
}
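
For context on why the dump is throttled: a checker like this is typically served behind a container liveness probe, so a wedged controller means the probe keeps failing every few seconds until the kubelet restarts the pod, and dumping 12 MB of stack per failed probe would flood the logs. The sketch below shows one way such a checker can back a liveness endpoint; it is a self-contained illustration, and the handler path, port, and type names are assumptions rather than code from this repository.

// Minimal sketch (assumed wiring, not from this PR): expose an aggregate
// aliveness check as an HTTP liveness endpoint for a kubelet probe.
package main

import (
	"net/http"
)

// alivenessChecker mirrors the Alive() contract used by the operator's checker.
type alivenessChecker interface {
	Alive() bool
}

// alwaysAlive is a stand-in implementation so the sketch runs on its own.
type alwaysAlive struct{}

func (alwaysAlive) Alive() bool { return true }

// livenessHandler turns an aliveness check into an HTTP liveness endpoint.
func livenessHandler(checker alivenessChecker) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		if checker.Alive() {
			w.WriteHeader(http.StatusOK)
			return
		}
		// Repeated failures here eventually make the kubelet restart the pod,
		// which is the recovery path for a genuinely deadlocked controller.
		w.WriteHeader(http.StatusInternalServerError)
	}
}

func main() {
	http.Handle("/healthz", livenessHandler(alwaysAlive{}))
	_ = http.ListenAndServe(":8080", nil)
}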
6 changes: 3 additions & 3 deletions pkg/operator/health/checking_sync_wrapper.go
@@ -20,9 +20,9 @@ type CheckingSyncWrapper struct {

func (r *CheckingSyncWrapper) Sync(ctx context.Context, controllerContext factory.SyncContext) error {
err := r.syncFunc(ctx, controllerContext)
if err == nil {
atomic.StoreInt64(&r.lastSuccessfulRun, time.Now().UnixMilli())
}
// we store the time regardless of whether the sync succeeded, because some controllers error out while they have unhealthy members;
// what we actually want to detect is a deadlock, which would block the sync from returning at all.
atomic.StoreInt64(&r.lastSuccessfulRun, time.Now().UnixMilli())
return err
}

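The Alive() side of the wrapper is untouched by this change and therefore not shown in the diff. The sketch below illustrates the shape such a check plausibly has, given the lastSuccessfulRun field and the timeout passed to NewCheckingSyncWrapper in the tests; the type and field names here are illustrative stand-ins, not the repository's code.

package health

import (
	"sync/atomic"
	"time"
)

// checkingSyncWrapperSketch is an illustrative stand-in, not the repository's type.
type checkingSyncWrapperSketch struct {
	lastSuccessfulRun int64         // unix milliseconds of the last sync that returned
	alivenessTimeout  time.Duration // how long syncs may go without returning
}

// Alive reports whether a sync has returned within the timeout. Because Sync
// now stamps the time even when it returns an error, only a sync that never
// returns at all (a deadlock) can make this report false.
func (r *checkingSyncWrapperSketch) Alive() bool {
	last := time.UnixMilli(atomic.LoadInt64(&r.lastSuccessfulRun))
	return time.Since(last) <= r.alivenessTimeout
}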
6 changes: 3 additions & 3 deletions pkg/operator/health/checking_sync_wrapper_test.go
@@ -40,7 +40,7 @@ func TestHappyPathAliveness(t *testing.T) {

}

func TestErrorDoesNotUpdateSuccess(t *testing.T) {
func TestErrorDoesUpdatesSuccess(t *testing.T) {
syncer := NewCheckingSyncWrapper(func(ctx context.Context, controllerContext factory.SyncContext) error {
return fmt.Errorf("some")
}, 1*time.Second)
@@ -49,6 +49,6 @@ func TestErrorDoesNotUpdateSuccess(t *testing.T) {

err := syncer.Sync(context.Background(), nil)
require.Error(t, err)
require.Equal(t, int64(0), syncer.lastSuccessfulRun)
require.False(t, syncer.Alive())
require.NotEqual(t, int64(0), syncer.lastSuccessfulRun)
require.True(t, syncer.Alive())
}
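
The backport only flips the expectations of the existing error-path test. A sketch of a companion test for the deadlock case itself is shown below; it assumes the same imports and helpers as this test file, that the duration passed to NewCheckingSyncWrapper is the aliveness timeout, and that the wrapper reports not-alive until a sync returns, so treat it as an illustration rather than part of the PR.

// Illustrative sketch only, not part of this PR: a sync that never returns
// should eventually make the wrapper report not-alive.
func TestBlockedSyncReportsNotAlive(t *testing.T) {
	syncer := NewCheckingSyncWrapper(func(ctx context.Context, controllerContext factory.SyncContext) error {
		<-ctx.Done() // simulate a deadlocked sync that never returns on its own
		return ctx.Err()
	}, 50*time.Millisecond)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() { _ = syncer.Sync(ctx, nil) }()

	// Let the aliveness window pass while the sync is still stuck.
	time.Sleep(200 * time.Millisecond)
	require.False(t, syncer.Alive())
}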