Skip to content

Commit

Permalink
Add config, logging for healthcheck
Browse files Browse the repository at this point in the history
Signed-off-by: Corey Larson <corey@earthly.dev>
Signed-off-by: Tonis Tiigi <tonistiigi@gmail.com>
(cherry picked from commit b637861)
Signed-off-by: Tonis Tiigi <tonistiigi@gmail.com>
  • Loading branch information
dchw authored and tonistiigi committed Aug 22, 2022
1 parent b8cdffd commit 5ccbe14
Showing 1 changed file with 39 additions and 3 deletions.
42 changes: 39 additions & 3 deletions session/grpc.go
Expand Up @@ -2,6 +2,7 @@ package session

import (
"context"
"math"
"net"
"sync/atomic"
"time"
Expand All @@ -10,6 +11,7 @@ import (
"github.com/moby/buildkit/util/bklog"
"github.com/moby/buildkit/util/grpcerrors"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"go.opentelemetry.io/otel/trace"
"golang.org/x/net/http2"
Expand Down Expand Up @@ -79,21 +81,55 @@ func monitorHealth(ctx context.Context, cc *grpc.ClientConn, cancelConn func())
defer cancelConn()
defer cc.Close()

ticker := time.NewTicker(1 * time.Second)
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
healthClient := grpc_health_v1.NewHealthClient(cc)

failedBefore := false
consecutiveSuccessful := 0
defaultHealthcheckDuration := 30 * time.Second
lastHealthcheckDuration := time.Duration(0)

for {
select {
case <-ctx.Done():
return
case <-ticker.C:
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
// This healthcheck can erroneously fail in some instances, such as when receiving lots of data in a low-bandwidth scenario or when running too many concurrent builds.
// So, the healthcheck timeout is purposely long, and an occasional failure is tolerated rather than immediately closing the connection.

healthcheckStart := time.Now()

timeout := time.Duration(math.Max(float64(defaultHealthcheckDuration), float64(lastHealthcheckDuration)*1.5))
ctx, cancel := context.WithTimeout(ctx, timeout)
_, err := healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{})
cancel()

lastHealthcheckDuration = time.Since(healthcheckStart)
logFields := logrus.Fields{
"timeout": timeout,
"actualDuration": lastHealthcheckDuration,
}

if err != nil {
return
if failedBefore {
bklog.G(ctx).Error("healthcheck failed fatally")
return
}

failedBefore = true
consecutiveSuccessful = 0
bklog.G(ctx).WithFields(logFields).Warn("healthcheck failed")
} else {
consecutiveSuccessful++

if consecutiveSuccessful >= 5 && failedBefore {
failedBefore = false
bklog.G(ctx).WithFields(logFields).Debug("reset healthcheck failure")
}
}

bklog.G(ctx).WithFields(logFields).Debug("healthcheck completed")
}
}
}

0 comments on commit 5ccbe14

Please sign in to comment.