Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Health Check retry Initialisation #155

Merged
merged 7 commits into from Jan 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 6 additions & 2 deletions backend/postgres/health_postgres.go
Expand Up @@ -6,15 +6,19 @@ package postgres
import (
"time"

"github.com/go-pg/pg"
"github.com/go-pg/pg/orm"
"github.com/pace/bricks/maintenance/health/servicehealthcheck"
)

// HealthCheck checks the state of a postgres connection. It must not be changed
// after it was registered as a health check.
type HealthCheck struct {
state servicehealthcheck.ConnectionState
Pool *pg.DB
Pool postgresQueryExecutor
}

type postgresQueryExecutor interface {
Exec(query interface{}, params ...interface{}) (res orm.Result, err error)
}

// Init initialises the test table
Expand Down
35 changes: 35 additions & 0 deletions backend/postgres/health_postgres_test.go
Expand Up @@ -9,9 +9,14 @@ import (
"net/http/httptest"
"strings"
"testing"
"time"

"github.com/go-pg/pg/orm"
http2 "github.com/pace/bricks/http"
"github.com/pace/bricks/maintenance/errors"
"github.com/pace/bricks/maintenance/health/servicehealthcheck"
"github.com/pace/bricks/maintenance/log"
"github.com/stretchr/testify/require"
)

func setup() *http.Response {
Expand Down Expand Up @@ -41,3 +46,33 @@ func TestIntegrationHealthCheck(t *testing.T) {
t.Errorf("Expected /health/check to return OK, got: %q", string(data[:]))
}
}

type testPool struct {
err error
}

func (t *testPool) Exec(query interface{}, params ...interface{}) (res orm.Result, err error) {
return nil, t.err
}

func TestHealthCheckCaching(t *testing.T) {
// set the TTL to a minute because this is long enough to test that the result is cached
cfg.HealthCheckResultTTL = time.Minute
requiredErr := errors.New("TestHealthCheckCaching")
pool := &testPool{err: requiredErr}
h := &HealthCheck{Pool: pool}
res := h.HealthCheck()
// get the error for the first time
require.Equal(t, servicehealthcheck.Err, res.State)
require.Equal(t, "TestHealthCheckCaching", res.Msg)
res = h.HealthCheck()
pool.err = nil
// getting the cached error
require.Equal(t, servicehealthcheck.Err, res.State)
require.Equal(t, "TestHealthCheckCaching", res.Msg)
// Resetting the TTL to get a uncached result
cfg.HealthCheckResultTTL = 0
res = h.HealthCheck()
require.Equal(t, servicehealthcheck.Ok, res.State)
require.Equal(t, "", res.Msg)
}
3 changes: 3 additions & 0 deletions maintenance/health/servicehealthcheck/README.md
@@ -1,4 +1,5 @@
# Health Checks

* Makes it possible to add a health check for a service (e.g. postgres). The list of checks is checked for the routes
`/health` and `/health/check`

Expand Down Expand Up @@ -42,3 +43,5 @@ for caching
* `/health` => OK and WARN means the service is healthy.
* `/health/check` => the complete result of the check is added to the response

## Environment Variables
`HEALTH_CHECK_INIT_RESULT_ERROR_TTL` : Amount of time to cache the errors that occur in the initialisation of the HealthCheck
47 changes: 42 additions & 5 deletions maintenance/health/servicehealthcheck/healthchecker.go
Expand Up @@ -7,7 +7,9 @@ import (
"fmt"
"net/http"
"sync"
"time"

"github.com/caarlos0/env"
"github.com/pace/bricks/maintenance/log"
)

Expand All @@ -22,13 +24,20 @@ type Initialisable interface {
Init() error
}

type config struct {
// Amount of time to cache the last init
HealthCheckInitResultErrorTTL time.Duration `env:"HEALTH_CHECK_INIT_RESULT_ERROR_TTL" envDefault:"10s"`
}

var cfg config

// requiredChecks contains all required registered Health Checks - key:Name
var requiredChecks sync.Map

// optionalChecks contains all optional registered Health Checks - key:Name
var optionalChecks sync.Map

// initErrors map with all errors that happened in the initialisation of any health check - key:Name
// initErrors map with all err ConnectionState that happened in the initialisation of any health check - key:Name
var initErrors sync.Map

// HealthState describes if a any error or warning occurred during the health check of a service
Expand All @@ -51,16 +60,23 @@ type HealthCheckResult struct {
Msg string
}

func init() {
err := env.Parse(&cfg)
if err != nil {
log.Fatalf("Failed to parse health check environment: %v", err)
}
}

func check(hcs *sync.Map) map[string]HealthCheckResult {
result := make(map[string]HealthCheckResult)
hcs.Range(func(key, value interface{}) bool {
name := key.(string)
hc := value.(HealthCheck)
// If it was not possible to initialise this health check, then show the initialisation error message
if val, isIn := initErrors.Load(name); isIn {
err := val.(error)
result[name] = HealthCheckResult{State: Err, Msg: err.Error()}
return true
if done := reInitHealthCheck(val.(*ConnectionState), result, name, hc.(Initialisable)); done {
return true
}
}
// this is the actual health check
result[name] = hc.HealthCheck()
Expand All @@ -69,6 +85,21 @@ func check(hcs *sync.Map) map[string]HealthCheckResult {
return result
}

func reInitHealthCheck(conState *ConnectionState, result map[string]HealthCheckResult, name string, initHc Initialisable) bool {
if time.Since(conState.LastChecked()) < cfg.HealthCheckInitResultErrorTTL {
result[name] = conState.GetState()
return true
}
err := initHc.Init()
if err != nil {
conState.SetErrorState(err)
result[name] = conState.GetState()
return true
}
initErrors.Delete(name)
return false
}

func writeResult(w http.ResponseWriter, status int, body string) {
w.Header().Set("Content-Type", "text/plain")
w.WriteHeader(status)
Expand Down Expand Up @@ -104,7 +135,13 @@ func registerHealthCheck(checks *sync.Map, hc HealthCheck, name string) {
if initHC, ok := hc.(Initialisable); ok {
if err := initHC.Init(); err != nil {
log.Warnf("error initialising health check %q: %s", name, err)
initErrors.Store(name, err)
initErrors.Store(name, &ConnectionState{
lastCheck: time.Now(),
result: HealthCheckResult{
State: Err,
Msg: err.Error(),
},
})
}
}
// save the length of the longest health check name, for the width of the column in /health/check
Expand Down
90 changes: 69 additions & 21 deletions maintenance/health/servicehealthcheck/healthchecker_test.go
Expand Up @@ -12,6 +12,7 @@ import (
"strings"
"sync"
"testing"
"time"

"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -77,10 +78,7 @@ func TestHandlerHealthCheck(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.title, func(t *testing.T) {
//remove all previous health checks
requiredChecks = sync.Map{}
optionalChecks = sync.Map{}
initErrors = sync.Map{}
resetHealthChecks()
rec := httptest.NewRecorder()
RegisterHealthCheck(tc.check, tc.check.name)
req := httptest.NewRequest("GET", "/health/", nil)
Expand All @@ -96,27 +94,80 @@ func TestHandlerHealthCheck(t *testing.T) {
}
}

func TestHandlerHealthCheckOptional(t *testing.T) {
checkOpt := &testHealthChecker{name: "TestHandlerHealthCheckErr", healthCheckErr: true}
checkReq := &testHealthChecker{name: "TestOk"}
//remove all previous health checks
requiredChecks = sync.Map{}
optionalChecks = sync.Map{}
initErrors = sync.Map{}
func TestInitErrorRetry(t *testing.T) {
// No caching of the init results
cfg.HealthCheckInitResultErrorTTL = 0

rec := httptest.NewRecorder()
RegisterHealthCheck(checkReq, checkReq.name)
RegisterOptionalHealthCheck(checkOpt, checkOpt.name)
resetHealthChecks()

// Create Check with initErr
checker := &testHealthChecker{
initErr: true,
healthCheckErr: false,
healthCheckWarn: false,
name: "initRetry",
}
RegisterHealthCheck(checker, checker.name)

testRequest(t, http.StatusServiceUnavailable, "ERR")

// remove initErr
checker.initErr = false
testRequest(t, http.StatusOK, "OK")
}

func testRequest(t *testing.T, expCode int, expBody string) {
req := httptest.NewRequest("GET", "/health/", nil)
rec := httptest.NewRecorder()
HealthHandler().ServeHTTP(rec, req)
resp := rec.Result()

require.Equal(t, http.StatusOK, resp.StatusCode)
require.Equal(t, expCode, resp.StatusCode)
data, err := ioutil.ReadAll(resp.Body)
require.NoError(t, err)
require.Equal(t, "OK", string(data))
require.Equal(t, expBody, string(data))
}

func resetHealthChecks() {
//remove all previous health checks
requiredChecks = sync.Map{}
optionalChecks = sync.Map{}
initErrors = sync.Map{}
}

func TestInitErrorCaching(t *testing.T) {
cfg.HealthCheckInitResultErrorTTL = time.Hour
hc := &testHealthChecker{
initErr: true,
healthCheckErr: false,
healthCheckWarn: false,
name: "initErrorCaching",
}

resetHealthChecks()
RegisterHealthCheck(hc, hc.name)

testRequest(t, http.StatusServiceUnavailable, "ERR")
hc.initErr = false
testRequest(t, http.StatusServiceUnavailable, "ERR")

cfg.HealthCheckInitResultErrorTTL = 0
resetHealthChecks()
hc.initErr = true
RegisterHealthCheck(hc, hc.name)
testRequest(t, http.StatusServiceUnavailable, "ERR")
hc.initErr = false
testRequest(t, http.StatusOK, "OK")

}
func TestHandlerHealthCheckOptional(t *testing.T) {
checkOpt := &testHealthChecker{name: "TestHandlerHealthCheckErr", healthCheckErr: true}
checkReq := &testHealthChecker{name: "TestOk"}
resetHealthChecks()

RegisterHealthCheck(checkReq, checkReq.name)
RegisterOptionalHealthCheck(checkOpt, checkOpt.name)

testRequest(t, http.StatusOK, "OK")
}

func TestHandlerReadableHealthCheck(t *testing.T) {
Expand Down Expand Up @@ -166,10 +217,7 @@ func TestHandlerReadableHealthCheck(t *testing.T) {
}
for _, tc := range testcases {
t.Run(tc.title, func(t *testing.T) {
//remove all previous health checks
requiredChecks = sync.Map{}
optionalChecks = sync.Map{}
initErrors = sync.Map{}
resetHealthChecks()

rec := httptest.NewRecorder()
for _, hc := range tc.req {
Expand Down