Merge pull request #28332 from dgoodwin/alert-pin-tests

Improvements and Additions to Alert Testing Stack

openshift-ci[bot] committed Oct 20, 2023
2 parents abb76df + dc3c6fc, commit 617bc41
Showing 24 changed files with 383 additions and 304 deletions.
152 changes: 0 additions & 152 deletions pkg/alerts/check.go

This file was deleted.

2 changes: 1 addition & 1 deletion pkg/cmd/openshift-tests/dev/dev.go
@@ -135,7 +135,7 @@ func readIntervalsFromFile(intervalsFile string) (monitorapi.Intervals, error) {
 		return nil, err
 	}
 
-	return monitorserialization.EventsFromJSON(jsonBytes)
+	return monitorserialization.IntervalsFromJSON(jsonBytes)
 }
 
 func newRunDisruptionInvariantsCommand() *cobra.Command {

@@ -57,7 +57,7 @@ func NewTimelineOptions(ioStreams genericclioptions.IOStreams) *TimelineOptions
 
 		IOStreams: ioStreams,
 		KnownRenderers: map[string]RenderFunc{
-			"json": monitorserialization.EventsToJSON,
+			"json": monitorserialization.IntervalsToJSON,
			"html": renderHTML,
		},
		KnownTimelines: map[string]monitorapi.EventIntervalMatchesFunc{
2 changes: 1 addition & 1 deletion pkg/disruption/backend/sampler/remote.go
@@ -380,7 +380,7 @@ func fetchEventsFromFileOnNode(ctx context.Context, clientset *kubernetes.Clientset
 	if err != nil {
 		return filteredEvents, fmt.Errorf("failed to fetch file %s on node %s: %v", filePath, nodeName, err)
 	}
-	allEvents, err := monitorserialization.EventsFromJSON(allBytes)
+	allEvents, err := monitorserialization.IntervalsFromJSON(allBytes)
 	if err != nil {
 		return nil, fmt.Errorf("failed to convert file %s from node %s to intervals: %v", filePath, nodeName, err)
 	}
2 changes: 2 additions & 0 deletions pkg/monitor/monitorapi/types.go
@@ -199,6 +199,8 @@ const (
 type AnnotationKey string
 
 const (
+	AnnotationAlertState        AnnotationKey = "alertstate"
+	AnnotationSeverity          AnnotationKey = "severity"
 	AnnotationReason            AnnotationKey = "reason"
 	AnnotationContainerExitCode AnnotationKey = "code"
 	AnnotationCause             AnnotationKey = "cause"
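
The two new annotation keys let alert intervals carry the Prometheus alert state and severity as structured data rather than burying them in the message text. A minimal sketch of how a consumer might read them back, assuming the interval's structured message exposes an Annotations map keyed by AnnotationKey and that the import path follows the file layout (neither is shown in this diff):

    package example

    import (
    	monitorapi "github.com/openshift/origin/pkg/monitor/monitorapi"
    )

    // alertStateAndSeverity pulls the new annotations off an alert interval.
    // The Annotations map on StructuredMessage is an assumption, not confirmed by this diff.
    func alertStateAndSeverity(interval monitorapi.Interval) (state, severity string) {
    	annotations := interval.StructuredMessage.Annotations
    	return annotations[monitorapi.AnnotationAlertState], annotations[monitorapi.AnnotationSeverity]
    }
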
19 changes: 10 additions & 9 deletions pkg/monitor/serialization/serialize.go
@@ -37,7 +37,7 @@ type EventIntervalList struct {
 }
 
 func EventsToFile(filename string, events monitorapi.Intervals) error {
-	json, err := EventsToJSON(events)
+	json, err := IntervalsToJSON(events)
 	if err != nil {
 		return err
 	}
@@ -49,10 +49,10 @@ func EventsFromFile(filename string) (monitorapi.Intervals, error) {
 	if err != nil {
 		return nil, err
 	}
-	return EventsFromJSON(data)
+	return IntervalsFromJSON(data)
 }
 
-func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
+func IntervalsFromJSON(data []byte) (monitorapi.Intervals, error) {
 	var list EventIntervalList
 	if err := json.Unmarshal(data, &list); err != nil {
 		return nil, err
@@ -68,8 +68,8 @@ func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
 			Condition: monitorapi.Condition{
 				Level:   level,
 				Locator: interval.Locator,
-				Message: interval.Message,
 				StructuredLocator: interval.StructuredLocator,
+				Message:           interval.Message,
 				StructuredMessage: interval.StructuredMessage,
 			},
 
@@ -120,9 +120,9 @@ func IntervalToOneLineJSON(interval monitorapi.Interval) ([]byte, error) {
 	return buf.Bytes(), nil
 }
 
-func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
+func IntervalsToJSON(intervals monitorapi.Intervals) ([]byte, error) {
 	outputEvents := []EventInterval{}
-	for _, curr := range events {
+	for _, curr := range intervals {
 		outputEvents = append(outputEvents, monitorEventIntervalToEventInterval(curr))
 	}
 
@@ -131,14 +131,16 @@ func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
 	return json.MarshalIndent(list, "", " ")
 }
 
-func EventsIntervalsToFile(filename string, events monitorapi.Intervals) error {
-	json, err := EventsIntervalsToJSON(events)
+func IntervalsToFile(filename string, intervals monitorapi.Intervals) error {
+	json, err := EventsIntervalsToJSON(intervals)
 	if err != nil {
 		return err
 	}
 	return ioutil.WriteFile(filename, json, 0644)
 }
 
+// TODO: this is very similar but subtly different to the function above, what is the purpose of skipping those
+// with from/to equal or empty to?
 func EventsIntervalsToJSON(events monitorapi.Intervals) ([]byte, error) {
 	outputEvents := []EventInterval{}
 	for _, curr := range events {
@@ -165,7 +167,6 @@ func monitorEventIntervalToEventInterval(interval monitorapi.Interval) EventInterval {
 		From: metav1.Time{Time: interval.From},
 		To:   metav1.Time{Time: interval.To},
 	}
-
 	return ret
 }
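
The renames make the intent explicit: these helpers serialize monitorapi.Intervals rather than raw events. A small round-trip sketch against the renamed functions, assuming the monitorserialization import alias used elsewhere in this diff:

    // roundTrip re-parses serialized intervals; a sanity check, not part of this commit.
    func roundTrip(intervals monitorapi.Intervals) (monitorapi.Intervals, error) {
    	data, err := monitorserialization.IntervalsToJSON(intervals)
    	if err != nil {
    		return nil, err
    	}
    	return monitorserialization.IntervalsFromJSON(data)
    }
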
6 changes: 6 additions & 0 deletions pkg/monitortestlibrary/allowedalerts/basic_alert.go
@@ -31,6 +31,7 @@ type AlertTest interface {
 
 // AlertState is the state of the alert. They are logically ordered, so if a test says it limits on "pending", then
 // any state above pending (like info or warning) will cause the test to fail.
+// TODO this looks wrong, AlertState (pending|firing) and AlertLevel (info|warning|critical) are different things, but they seem lumped together here.
 type AlertState string
 
 const (
@@ -111,6 +112,11 @@ func (a *alertBuilder) neverFail() *alertBuilder {
 	return a
 }
 
+func (a *alertBuilder) alwaysFlake() *alertBuilder {
+	a.allowanceCalculator = alwaysFlake()
+	return a
+}
+
 func (a *alertBuilder) toTests() []AlertTest {
 	if !a.divideByNamespaces {
 		return []AlertTest{
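
The new builder method routes an alert through the alwaysFlake allowance defined in matches.go instead of the percentile-based calculator. A hedged sketch of how a test author might opt an alert into flake-only behavior; the newAlert constructor name and its arguments are assumptions, not shown in this diff:

    // Illustrative registration only: constructor name and signature are assumed.
    tests := newAlert("openshift-monitoring", "TelemeterClientFailures").
    	alwaysFlake(). // report a flake when the alert fires, never a hard failure
    	toTests()
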
19 changes: 18 additions & 1 deletion pkg/monitortestlibrary/allowedalerts/matches.go
@@ -52,5 +52,22 @@ func (d *percentileAllowances) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration {
 // getClosestPercentilesValues uses the backend and information about the cluster to choose the best historical p99 to operate against.
 // We enforce "don't get worse" for disruption by watching the aggregate data in CI over many runs.
 func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicaldata2.StatisticalDuration, string, error) {
-	return getCurrentResults().BestMatchDuration(key)
+	return GetHistoricalData().BestMatchDuration(key)
 }
+
+func alwaysFlake() AlertTestAllowanceCalculator {
+	return &alwaysFlakeAllowance{}
+}
+
+// alwaysFlakeAllowance is for alerts we want to flake a test if they occur at all.
+type alwaysFlakeAllowance struct {
+}
+
+func (d *alwaysFlakeAllowance) FailAfter(key historicaldata2.AlertDataKey) (time.Duration, error) {
+	// make it effectively impossible for a test failure here, we only want flakes
+	return 24 * time.Hour, nil
+}
+
+func (d *alwaysFlakeAllowance) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration {
+	return 1 * time.Second
+}
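
FailAfter and FlakeAfter are the two thresholds an alert test compares observed firing time against: exceed FlakeAfter and the test flakes, exceed FailAfter and it fails, so a 24h FailAfter paired with a 1s FlakeAfter yields flake-only behavior. A sketch of that decision logic written as if it lived in this package; the evaluate helper is illustrative and not part of this diff:

    // evaluate is an illustrative consumer of AlertTestAllowanceCalculator.
    func evaluate(calc AlertTestAllowanceCalculator, key historicaldata2.AlertDataKey, firingFor time.Duration) (string, error) {
    	failAfter, err := calc.FailAfter(key)
    	if err != nil {
    		return "", err
    	}
    	switch {
    	case firingFor > failAfter:
    		return "fail", nil
    	case firingFor > calc.FlakeAfter(key):
    		return "flake", nil
    	default:
    		return "pass", nil
    	}
    }
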
2 changes: 1 addition & 1 deletion pkg/monitortestlibrary/allowedalerts/matches_test.go
@@ -171,7 +171,7 @@ func TestGetClosestP99Value(t *testing.T) {
 // from bigquery and commit into origin. Test ensures we can parse it and the data looks sane.
 func TestAlertDataFileParsing(t *testing.T) {
 
-	alertMatcher := getCurrentResults()
+	alertMatcher := GetHistoricalData()
 
 	// The list of known alerts that goes into this file is composed of everything we've ever
 	// seen fire in that release. As such it can change from one release to the next as alerts
13 changes: 12 additions & 1 deletion pkg/monitortestlibrary/allowedalerts/types.go
@@ -22,7 +22,7 @@ var (
 	historicalData *historicaldata.AlertBestMatcher
 )
 
-func getCurrentResults() *historicaldata.AlertBestMatcher {
+func GetHistoricalData() *historicaldata.AlertBestMatcher {
 	readResults.Do(
 		func() {
 			var err error
@@ -34,3 +34,14 @@ func getCurrentResults() *historicaldata.AlertBestMatcher {
 
 	return historicalData
 }
+
+// AllowedAlertNames is a list of alerts we do not test against.
+var AllowedAlertNames = []string{
+	"Watchdog",
+	"AlertmanagerReceiversNotConfigured",
+	"PrometheusRemoteWriteDesiredShards",
+	"KubeJobFailed", // this is a result of bug https://bugzilla.redhat.com/show_bug.cgi?id=2054426 . We should catch these in the prometheus tests.
+
+	// indicates a problem in the external Telemeter service, presently very common, does not impact our ability to e2e test:
+	"TelemeterClientFailures",
+}
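
AllowedAlertNames acts as a simple skip list for alert names that should never produce a test result. A hedged sketch of the membership check a caller might do; only the slice itself is added by this diff, and the import path is inferred from the file layout:

    import "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"

    // isAllowedAlert reports whether an alert is on the do-not-test list.
    func isAllowedAlert(alertName string) bool {
    	for _, allowed := range allowedalerts.AllowedAlertNames {
    		if allowed == alertName {
    			return true
    		}
    	}
    	return false
    }
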
30 changes: 19 additions & 11 deletions pkg/monitortestlibrary/historicaldata/alert_types.go
@@ -12,11 +12,15 @@ import (
 )
 
 type AlertStatisticalData struct {
-	AlertDataKey `json:",inline"`
-	Name         string
-	P95          float64
-	P99          float64
-	JobRuns      int64
+	AlertDataKey  `json:",inline"`
+	Name          string
+	P50           float64
+	P75           float64
+	P95           float64
+	P99           float64
+	FirstObserved time.Time
+	LastObserved  time.Time
+	JobRuns       int64
 }
 
 type AlertDataKey struct {
@@ -80,11 +84,11 @@ func NewAlertMatcherWithHistoricalData(data map[AlertDataKey]AlertStatisticalData
 
 func (b *AlertBestMatcher) bestMatch(key AlertDataKey) (AlertStatisticalData, string, error) {
 	exactMatchKey := key
-	logrus.WithField("alertName", key.AlertName).Infof("searching for bestMatch for %+v", key.JobType)
-	logrus.Infof("historicalData has %d entries", len(b.HistoricalData))
+	logrus.WithField("alertName", key.AlertName).WithField("entries", len(b.HistoricalData)).
+		Debugf("searching for best match for %+v", key.JobType)
 
 	if percentiles, ok := b.HistoricalData[exactMatchKey]; ok {
-		if percentiles.JobRuns > minJobRuns {
+		if percentiles.JobRuns >= minJobRuns {
 			logrus.Infof("found exact match: %+v", percentiles)
 			return percentiles, "", nil
 		}
@@ -159,8 +163,12 @@ func (b *AlertBestMatcher) BestMatchP99(key AlertDataKey) (*time.Duration, string
 
 func toAlertStatisticalDuration(in AlertStatisticalData) StatisticalDuration {
 	return StatisticalDuration{
-		JobType: in.AlertDataKey.JobType,
-		P95:     DurationOrDie(in.P95),
-		P99:     DurationOrDie(in.P99),
+		JobType:       in.AlertDataKey.JobType,
+		P50:           DurationOrDie(in.P50),
+		P75:           DurationOrDie(in.P75),
+		P95:           DurationOrDie(in.P95),
+		P99:           DurationOrDie(in.P99),
+		FirstObserved: in.FirstObserved,
+		LastObserved:  in.LastObserved,
 	}
 }
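
Carrying P50/P75 and the first/last observed timestamps alongside P95/P99 gives callers more to work with than a single "fail past P99" rule. One illustrative policy, assuming the percentile fields on StatisticalDuration are time.Duration values and the observed fields are time.Time (types not shown in this diff):

    // thresholdsFor is an illustrative policy: flake past the historical P95,
    // fail only past the P99, and fall back to fixed allowances when the data
    // has not been observed recently enough to trust.
    func thresholdsFor(d historicaldata.StatisticalDuration, now time.Time) (flakeAfter, failAfter time.Duration) {
    	if now.Sub(d.LastObserved) > 30*24*time.Hour {
    		return 1 * time.Second, 1 * time.Hour // assumed stale-data fallback
    	}
    	return d.P95, d.P99
    }
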
