Skip to content

Commit

Permalink
Include recent errors in canary status when failing
Browse files Browse the repository at this point in the history
When the canary is failing, put up to 3 of the most recent errors into the status message to aid in debugging. If the errors have occurred more than once since the canary was last passing, also include the number of occurrences in the status.
  • Loading branch information
rfredette committed Feb 8, 2024
1 parent d31546e commit 6557c06
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 3 deletions.
71 changes: 68 additions & 3 deletions pkg/operator/controller/canary/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"strconv"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -43,6 +44,9 @@ const (
// canaryCheckFailureCount is how many successive failing canary checks should
// be observed before the default ingress controller goes degraded.
canaryCheckFailureCount = 5
// canaryFailingNumErrors is how many error messages to include in the
// CanaryChecksSucceeding status condition when checks are failing
canaryFailingNumErrors = 3

// CanaryRouteRotationAnnotation is an annotation on the default ingress controller
// that specifies whether or not the canary check loop should periodically rotate
Expand Down Expand Up @@ -221,6 +225,15 @@ func (r *reconciler) isCanaryRouteRotationEnabled() bool {
return r.enableCanaryRouteRotation
}

type timestampedError struct {
timestamp time.Time
err error
}

func (err timestampedError) Error() string {
return err.err.Error()
}

func (r *reconciler) startCanaryRoutePolling(stop <-chan struct{}) error {
// Keep track of how many canary checks have passed
// so the route endpoint can be periodically cycled
Expand All @@ -231,6 +244,8 @@ func (r *reconciler) startCanaryRoutePolling(stop <-chan struct{}) error {
// for status reporting.
successiveFail := 0

errors := []timestampedError{}

go wait.Until(func() {
// Get the current canary route every iteration in case it has been modified
haveRoute, route, err := r.currentCanaryRoute()
Expand Down Expand Up @@ -282,9 +297,10 @@ func (r *reconciler) startCanaryRoutePolling(stop <-chan struct{}) error {
log.Error(err, "error performing canary route check")
SetCanaryRouteReachableMetric(getRouteHost(route), false)
successiveFail += 1
errors = append(errors, timestampedError{err: err, timestamp: time.Now()})
// Mark the default ingress controller degraded after 5 successive canary check failures
if successiveFail >= canaryCheckFailureCount {
if err := r.setCanaryFailingStatusCondition(); err != nil {
if err := r.setCanaryFailingStatusCondition(errors); err != nil {
log.Error(err, "error updating canary status condition")
}
}
Expand All @@ -296,6 +312,7 @@ func (r *reconciler) startCanaryRoutePolling(stop <-chan struct{}) error {
log.Error(err, "error updating canary status condition")
}
successiveFail = 0
errors = []timestampedError{}
// Only increment checkCount if periodic canary route
// endpoint rotation is enabled to prevent unbounded
// integer growth.
Expand All @@ -307,17 +324,65 @@ func (r *reconciler) startCanaryRoutePolling(stop <-chan struct{}) error {
return nil
}

func (r *reconciler) setCanaryFailingStatusCondition() error {
func (r *reconciler) setCanaryFailingStatusCondition(errors []timestampedError) error {
errorStrings := deduplicateErrorStrings(errors, time.Now())
if len(errorStrings) > canaryFailingNumErrors {
errorStrings = errorStrings[len(errorStrings)-canaryFailingNumErrors:]
}
cond := operatorv1.OperatorCondition{
Type: ingresscontroller.IngressControllerCanaryCheckSuccessConditionType,
Status: operatorv1.ConditionFalse,
Reason: "CanaryChecksRepetitiveFailures",
Message: "Canary route checks for the default ingress controller are failing",
Message: fmt.Sprintf("Canary route checks for the default ingress controller are failing. Last %d error messages:\n%s", len(errorStrings), strings.Join(errorStrings, "\n")),
}

return r.setCanaryStatusCondition(cond)
}

type dedupCounter struct {
Count int
FirstOccurrence time.Time
}

// deduplicateErrorStrings takes a slice of errors in chronological order, prunes the list for unique error messages
// It returns a slice of the error message as strings, with any string seen more than once incuding how many times it
// was seen.
// The chronological order is preserved, based on the last time each error message was seen
func deduplicateErrorStrings(errors []timestampedError, now time.Time) []string {
encountered := map[string]dedupCounter{}
uniqueErrors := []string{}
// Iterate over the list of errors from newest to oldest, keeping track of how many times each error message was
// seen. This iteration order makes sure we preserve order based on the newest time each message was seen.
for i := len(errors) - 1; i >= 0; i-- {
err := errors[i]
if errCounter := encountered[err.Error()]; errCounter.Count >= 1 {
errCounter.Count += 1
if errCounter.FirstOccurrence.After(err.timestamp) {
errCounter.FirstOccurrence = err.timestamp
}
encountered[err.Error()] = errCounter
continue
}
encountered[err.Error()] = dedupCounter{
Count: 1,
FirstOccurrence: err.timestamp,
}
uniqueErrors = append(uniqueErrors, err.Error())
}
ret, j := make([]string, len(uniqueErrors)), len(uniqueErrors)-1
// Now that all error messages have been ordered by their most recent occurrence, reverse the list to switch it to
// cronological order, and append a message indicating how often an error has occurred if it has more than one
// occurrence.
for _, err := range uniqueErrors {
if counter := encountered[err]; counter.Count > 1 {
err = fmt.Sprintf("%s (x%d over %v)", err, counter.Count, now.Sub(counter.FirstOccurrence).Round(time.Second))
}
ret[j] = err
j--
}
return ret
}

func (r *reconciler) setCanaryPassingStatusCondition() error {
cond := operatorv1.OperatorCondition{
Type: ingresscontroller.IngressControllerCanaryCheckSuccessConditionType,
Expand Down
91 changes: 91 additions & 0 deletions pkg/operator/controller/canary/controller_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package canary

import (
"fmt"
"strings"
"testing"
"time"

"github.com/google/go-cmp/cmp"

Expand Down Expand Up @@ -145,3 +148,91 @@ func Test_cycleServicePort(t *testing.T) {
})
}
}

func Test_deduplicateErrorStrings(t *testing.T) {
now := time.Now()
testCases := []struct {
Name string
ErrorMessages []timestampedError
ExpectedResult []string
}{
{
Name: "Empty input",
ErrorMessages: []timestampedError{},
ExpectedResult: []string{},
},
{
Name: "Multiple errors, no repetition",
ErrorMessages: []timestampedError{
{err: fmt.Errorf("foo"), timestamp: now.Add(-10 * time.Second)},
{err: fmt.Errorf("bar"), timestamp: now.Add(-9 * time.Second)},
{err: fmt.Errorf("baz"), timestamp: now.Add(-8 * time.Second)},
{err: fmt.Errorf("quux"), timestamp: now.Add(-7 * time.Second)},
},
ExpectedResult: []string{
"foo",
"bar",
"baz",
"quux",
},
},
{
Name: "All identical errors",
ErrorMessages: []timestampedError{
{err: fmt.Errorf("foo"), timestamp: now.Add(-10 * time.Second)},
{err: fmt.Errorf("foo"), timestamp: now.Add(-9 * time.Second)},
{err: fmt.Errorf("foo"), timestamp: now.Add(-8 * time.Second)},
{err: fmt.Errorf("foo"), timestamp: now.Add(-7 * time.Second)},
},
ExpectedResult: []string{
"foo (x4 over 10s)",
},
},
{
Name: "Multiple errors, with repetition",
ErrorMessages: []timestampedError{
{err: fmt.Errorf("foo"), timestamp: now.Add(-10 * time.Second)},
{err: fmt.Errorf("bar"), timestamp: now.Add(-9 * time.Second)},
{err: fmt.Errorf("foo"), timestamp: now.Add(-8 * time.Second)},
{err: fmt.Errorf("baz"), timestamp: now.Add(-7 * time.Second)},
},
ExpectedResult: []string{
"bar",
"foo (x2 over 10s)",
"baz",
},
},
{
Name: "Many errors, with repetition",
ErrorMessages: []timestampedError{
{err: fmt.Errorf("foo"), timestamp: now.Add(-20 * time.Second)},
{err: fmt.Errorf("bar"), timestamp: now.Add(-19 * time.Second)},
{err: fmt.Errorf("foo"), timestamp: now.Add(-18 * time.Second)},
{err: fmt.Errorf("bar"), timestamp: now.Add(-17 * time.Second)},
{err: fmt.Errorf("baz"), timestamp: now.Add(-16 * time.Second)},
{err: fmt.Errorf("foo"), timestamp: now.Add(-15 * time.Second)},
{err: fmt.Errorf("foo"), timestamp: now.Add(-14 * time.Second)},
{err: fmt.Errorf("quux"), timestamp: now.Add(-13 * time.Second)},
{err: fmt.Errorf("quux"), timestamp: now.Add(-12 * time.Second)},
{err: fmt.Errorf("foo"), timestamp: now.Add(-11 * time.Second)},
{err: fmt.Errorf("foo"), timestamp: now.Add(-10 * time.Second)},
{err: fmt.Errorf("quux"), timestamp: now.Add(-9 * time.Second)},
},
ExpectedResult: []string{
"bar (x2 over 19s)",
"baz",
"foo (x6 over 20s)",
"quux (x3 over 13s)",
},
},
}

for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
result := deduplicateErrorStrings(tc.ErrorMessages, now)
if !cmp.Equal(tc.ExpectedResult, result) {
t.Errorf("Expected result:\n%s\nbut got:\n%s", strings.Join(tc.ExpectedResult, "\n"), strings.Join(result, "\n"))
}
})
}
}

0 comments on commit 6557c06

Please sign in to comment.