From 5ccbe1446bef03b64b6aa558df939c3eb4cd4ff4 Mon Sep 17 00:00:00 2001 From: Corey Larson Date: Tue, 9 Aug 2022 14:11:48 -0600 Subject: [PATCH] Add config, logging for healthcheck Signed-off-by: Corey Larson Signed-off-by: Tonis Tiigi (cherry picked from commit b63786184b8a6df37366117c49ceb8c9d826225d) Signed-off-by: Tonis Tiigi --- session/grpc.go | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/session/grpc.go b/session/grpc.go index a7237ac35046..dd67c69b6466 100644 --- a/session/grpc.go +++ b/session/grpc.go @@ -2,6 +2,7 @@ package session import ( "context" + "math" "net" "sync/atomic" "time" @@ -10,6 +11,7 @@ import ( "github.com/moby/buildkit/util/bklog" "github.com/moby/buildkit/util/grpcerrors" "github.com/pkg/errors" + "github.com/sirupsen/logrus" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "go.opentelemetry.io/otel/trace" "golang.org/x/net/http2" @@ -79,21 +81,55 @@ func monitorHealth(ctx context.Context, cc *grpc.ClientConn, cancelConn func()) defer cancelConn() defer cc.Close() - ticker := time.NewTicker(1 * time.Second) + ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() healthClient := grpc_health_v1.NewHealthClient(cc) + failedBefore := false + consecutiveSuccessful := 0 + defaultHealthcheckDuration := 30 * time.Second + lastHealthcheckDuration := time.Duration(0) + for { select { case <-ctx.Done(): return case <-ticker.C: - ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + // This healthcheck can erroneously fail in some instances, such as receiving lots of data in a low-bandwidth scenario or too many concurrent builds. + // So the healthcheck timeout is deliberately long, and a single failure is tolerated: only a second failure before five consecutive successes is fatal.
+ + healthcheckStart := time.Now() + + timeout := time.Duration(math.Max(float64(defaultHealthcheckDuration), float64(lastHealthcheckDuration)*1.5)) + ctx, cancel := context.WithTimeout(ctx, timeout) _, err := healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{}) cancel() + + lastHealthcheckDuration = time.Since(healthcheckStart) + logFields := logrus.Fields{ + "timeout": timeout, + "actualDuration": lastHealthcheckDuration, + } + if err != nil { - return + if failedBefore { + bklog.G(ctx).Error("healthcheck failed fatally") + return + } + + failedBefore = true + consecutiveSuccessful = 0 + bklog.G(ctx).WithFields(logFields).Warn("healthcheck failed") + } else { + consecutiveSuccessful++ + + if consecutiveSuccessful >= 5 && failedBefore { + failedBefore = false + bklog.G(ctx).WithFields(logFields).Debug("reset healthcheck failure") + } } + + bklog.G(ctx).WithFields(logFields).Debug("healthcheck completed") } } }