diff --git a/backend/postgres/health_postgres.go b/backend/postgres/health_postgres.go index 45052422..3d1f2a78 100644 --- a/backend/postgres/health_postgres.go +++ b/backend/postgres/health_postgres.go @@ -6,7 +6,7 @@ package postgres import ( "time" - "github.com/go-pg/pg" + "github.com/go-pg/pg/orm" "github.com/pace/bricks/maintenance/health/servicehealthcheck" ) @@ -14,7 +14,11 @@ import ( // after it was registered as a health check. type HealthCheck struct { state servicehealthcheck.ConnectionState - Pool *pg.DB + Pool postgresQueryExecutor +} + +type postgresQueryExecutor interface { + Exec(query interface{}, params ...interface{}) (res orm.Result, err error) } // Init initialises the test table diff --git a/backend/postgres/health_postgres_test.go b/backend/postgres/health_postgres_test.go index 589eff93..d0b45bc6 100644 --- a/backend/postgres/health_postgres_test.go +++ b/backend/postgres/health_postgres_test.go @@ -9,9 +9,14 @@ import ( "net/http/httptest" "strings" "testing" + "time" + "github.com/go-pg/pg/orm" http2 "github.com/pace/bricks/http" + "github.com/pace/bricks/maintenance/errors" + "github.com/pace/bricks/maintenance/health/servicehealthcheck" "github.com/pace/bricks/maintenance/log" + "github.com/stretchr/testify/require" ) func setup() *http.Response { @@ -41,3 +46,33 @@ func TestIntegrationHealthCheck(t *testing.T) { t.Errorf("Expected /health/check to return OK, got: %q", string(data[:])) } } + +type testPool struct { + err error +} + +func (t *testPool) Exec(query interface{}, params ...interface{}) (res orm.Result, err error) { + return nil, t.err +} + +func TestHealthCheckCaching(t *testing.T) { + // set the TTL to a minute because this is long enough to test that the result is cached + cfg.HealthCheckResultTTL = time.Minute + requiredErr := errors.New("TestHealthCheckCaching") + pool := &testPool{err: requiredErr} + h := &HealthCheck{Pool: pool} + res := h.HealthCheck() + // get the error for the first time + require.Equal(t, servicehealthcheck.Err, res.State) + require.Equal(t, "TestHealthCheckCaching", res.Msg) + res = h.HealthCheck() + pool.err = nil + // getting the cached error + require.Equal(t, servicehealthcheck.Err, res.State) + require.Equal(t, "TestHealthCheckCaching", res.Msg) + // Resetting the TTL to get a uncached result + cfg.HealthCheckResultTTL = 0 + res = h.HealthCheck() + require.Equal(t, servicehealthcheck.Ok, res.State) + require.Equal(t, "", res.Msg) +} diff --git a/maintenance/health/servicehealthcheck/README.md b/maintenance/health/servicehealthcheck/README.md index 29b90357..c7cbdd34 100644 --- a/maintenance/health/servicehealthcheck/README.md +++ b/maintenance/health/servicehealthcheck/README.md @@ -1,4 +1,5 @@ # Health Checks + * Makes it possible to add a health check for a service (e.g. postgres). The list of checks is checked for the routes `/health` and `/health/check` @@ -42,3 +43,5 @@ for caching * `/health` => OK and WARN means the service is healthy. * `/health/check` => the complete result of the check is added to the response +## Environment Variables +`HEALTH_CHECK_INIT_RESULT_ERROR_TTL` : Amount of time to cache the errors that occur in the initialisation of the HealthCheck \ No newline at end of file diff --git a/maintenance/health/servicehealthcheck/healthchecker.go b/maintenance/health/servicehealthcheck/healthchecker.go index a9edd9b8..aa4c0423 100644 --- a/maintenance/health/servicehealthcheck/healthchecker.go +++ b/maintenance/health/servicehealthcheck/healthchecker.go @@ -7,7 +7,9 @@ import ( "fmt" "net/http" "sync" + "time" + "github.com/caarlos0/env" "github.com/pace/bricks/maintenance/log" ) @@ -22,13 +24,20 @@ type Initialisable interface { Init() error } +type config struct { + // Amount of time to cache the last init + HealthCheckInitResultErrorTTL time.Duration `env:"HEALTH_CHECK_INIT_RESULT_ERROR_TTL" envDefault:"10s"` +} + +var cfg config + // requiredChecks contains all required registered Health Checks - key:Name var requiredChecks sync.Map // optionalChecks contains all optional registered Health Checks - key:Name var optionalChecks sync.Map -// initErrors map with all errors that happened in the initialisation of any health check - key:Name +// initErrors map with all err ConnectionState that happened in the initialisation of any health check - key:Name var initErrors sync.Map // HealthState describes if a any error or warning occurred during the health check of a service @@ -51,6 +60,13 @@ type HealthCheckResult struct { Msg string } +func init() { + err := env.Parse(&cfg) + if err != nil { + log.Fatalf("Failed to parse health check environment: %v", err) + } +} + func check(hcs *sync.Map) map[string]HealthCheckResult { result := make(map[string]HealthCheckResult) hcs.Range(func(key, value interface{}) bool { @@ -58,9 +74,9 @@ func check(hcs *sync.Map) map[string]HealthCheckResult { hc := value.(HealthCheck) // If it was not possible to initialise this health check, then show the initialisation error message if val, isIn := initErrors.Load(name); isIn { - err := val.(error) - result[name] = HealthCheckResult{State: Err, Msg: err.Error()} - return true + if done := reInitHealthCheck(val.(*ConnectionState), result, name, hc.(Initialisable)); done { + return true + } } // this is the actual health check result[name] = hc.HealthCheck() @@ -69,6 +85,21 @@ func check(hcs *sync.Map) map[string]HealthCheckResult { return result } +func reInitHealthCheck(conState *ConnectionState, result map[string]HealthCheckResult, name string, initHc Initialisable) bool { + if time.Since(conState.LastChecked()) < cfg.HealthCheckInitResultErrorTTL { + result[name] = conState.GetState() + return true + } + err := initHc.Init() + if err != nil { + conState.SetErrorState(err) + result[name] = conState.GetState() + return true + } + initErrors.Delete(name) + return false +} + func writeResult(w http.ResponseWriter, status int, body string) { w.Header().Set("Content-Type", "text/plain") w.WriteHeader(status) @@ -104,7 +135,13 @@ func registerHealthCheck(checks *sync.Map, hc HealthCheck, name string) { if initHC, ok := hc.(Initialisable); ok { if err := initHC.Init(); err != nil { log.Warnf("error initialising health check %q: %s", name, err) - initErrors.Store(name, err) + initErrors.Store(name, &ConnectionState{ + lastCheck: time.Now(), + result: HealthCheckResult{ + State: Err, + Msg: err.Error(), + }, + }) } } // save the length of the longest health check name, for the width of the column in /health/check diff --git a/maintenance/health/servicehealthcheck/healthchecker_test.go b/maintenance/health/servicehealthcheck/healthchecker_test.go index 14902728..ccff7cad 100644 --- a/maintenance/health/servicehealthcheck/healthchecker_test.go +++ b/maintenance/health/servicehealthcheck/healthchecker_test.go @@ -12,6 +12,7 @@ import ( "strings" "sync" "testing" + "time" "github.com/stretchr/testify/require" ) @@ -77,10 +78,7 @@ func TestHandlerHealthCheck(t *testing.T) { } for _, tc := range testCases { t.Run(tc.title, func(t *testing.T) { - //remove all previous health checks - requiredChecks = sync.Map{} - optionalChecks = sync.Map{} - initErrors = sync.Map{} + resetHealthChecks() rec := httptest.NewRecorder() RegisterHealthCheck(tc.check, tc.check.name) req := httptest.NewRequest("GET", "/health/", nil) @@ -96,27 +94,80 @@ func TestHandlerHealthCheck(t *testing.T) { } } -func TestHandlerHealthCheckOptional(t *testing.T) { - checkOpt := &testHealthChecker{name: "TestHandlerHealthCheckErr", healthCheckErr: true} - checkReq := &testHealthChecker{name: "TestOk"} - //remove all previous health checks - requiredChecks = sync.Map{} - optionalChecks = sync.Map{} - initErrors = sync.Map{} +func TestInitErrorRetry(t *testing.T) { + // No caching of the init results + cfg.HealthCheckInitResultErrorTTL = 0 - rec := httptest.NewRecorder() - RegisterHealthCheck(checkReq, checkReq.name) - RegisterOptionalHealthCheck(checkOpt, checkOpt.name) + resetHealthChecks() + // Create Check with initErr + checker := &testHealthChecker{ + initErr: true, + healthCheckErr: false, + healthCheckWarn: false, + name: "initRetry", + } + RegisterHealthCheck(checker, checker.name) + + testRequest(t, http.StatusServiceUnavailable, "ERR") + + // remove initErr + checker.initErr = false + testRequest(t, http.StatusOK, "OK") +} + +func testRequest(t *testing.T, expCode int, expBody string) { req := httptest.NewRequest("GET", "/health/", nil) + rec := httptest.NewRecorder() HealthHandler().ServeHTTP(rec, req) resp := rec.Result() - - require.Equal(t, http.StatusOK, resp.StatusCode) + require.Equal(t, expCode, resp.StatusCode) data, err := ioutil.ReadAll(resp.Body) require.NoError(t, err) - require.Equal(t, "OK", string(data)) + require.Equal(t, expBody, string(data)) +} + +func resetHealthChecks() { + //remove all previous health checks + requiredChecks = sync.Map{} + optionalChecks = sync.Map{} + initErrors = sync.Map{} +} + +func TestInitErrorCaching(t *testing.T) { + cfg.HealthCheckInitResultErrorTTL = time.Hour + hc := &testHealthChecker{ + initErr: true, + healthCheckErr: false, + healthCheckWarn: false, + name: "initErrorCaching", + } + + resetHealthChecks() + RegisterHealthCheck(hc, hc.name) + + testRequest(t, http.StatusServiceUnavailable, "ERR") + hc.initErr = false + testRequest(t, http.StatusServiceUnavailable, "ERR") + + cfg.HealthCheckInitResultErrorTTL = 0 + resetHealthChecks() + hc.initErr = true + RegisterHealthCheck(hc, hc.name) + testRequest(t, http.StatusServiceUnavailable, "ERR") + hc.initErr = false + testRequest(t, http.StatusOK, "OK") + +} +func TestHandlerHealthCheckOptional(t *testing.T) { + checkOpt := &testHealthChecker{name: "TestHandlerHealthCheckErr", healthCheckErr: true} + checkReq := &testHealthChecker{name: "TestOk"} + resetHealthChecks() + + RegisterHealthCheck(checkReq, checkReq.name) + RegisterOptionalHealthCheck(checkOpt, checkOpt.name) + testRequest(t, http.StatusOK, "OK") } func TestHandlerReadableHealthCheck(t *testing.T) { @@ -166,10 +217,7 @@ func TestHandlerReadableHealthCheck(t *testing.T) { } for _, tc := range testcases { t.Run(tc.title, func(t *testing.T) { - //remove all previous health checks - requiredChecks = sync.Map{} - optionalChecks = sync.Map{} - initErrors = sync.Map{} + resetHealthChecks() rec := httptest.NewRecorder() for _, hc := range tc.req {