Merge pull request #28332 from dgoodwin/alert-pin-tests

Improvements and Additions to Alert Testing Stack

openshift-ci[bot] committed Oct 20, 2023
2 parents abb76df + dc3c6fc, commit 617bc41
Showing 24 changed files with 383 additions and 304 deletions.
152 changes: 0 additions & 152 deletions pkg/alerts/check.go

This file was deleted.

2 changes: 1 addition & 1 deletion pkg/cmd/openshift-tests/dev/dev.go
@@ -135,7 +135,7 @@ func readIntervalsFromFile(intervalsFile string) (monitorapi.Intervals, error) {
 		return nil, err
 	}
 
-	return monitorserialization.EventsFromJSON(jsonBytes)
+	return monitorserialization.IntervalsFromJSON(jsonBytes)
 }
 
 func newRunDisruptionInvariantsCommand() *cobra.Command {

@@ -57,7 +57,7 @@ func NewTimelineOptions(ioStreams genericclioptions.IOStreams) *TimelineOptions
 
 		IOStreams: ioStreams,
 		KnownRenderers: map[string]RenderFunc{
-			"json": monitorserialization.EventsToJSON,
+			"json": monitorserialization.IntervalsToJSON,
			"html": renderHTML,
		},
		KnownTimelines: map[string]monitorapi.EventIntervalMatchesFunc{
2 changes: 1 addition & 1 deletion pkg/disruption/backend/sampler/remote.go
@@ -380,7 +380,7 @@ func fetchEventsFromFileOnNode(ctx context.Context, clientset *kubernetes.Clientset
 	if err != nil {
 		return filteredEvents, fmt.Errorf("failed to fetch file %s on node %s: %v", filePath, nodeName, err)
 	}
-	allEvents, err := monitorserialization.EventsFromJSON(allBytes)
+	allEvents, err := monitorserialization.IntervalsFromJSON(allBytes)
 	if err != nil {
 		return nil, fmt.Errorf("failed to convert file %s from node %s to intervals: %v", filePath, nodeName, err)
 	}
2 changes: 2 additions & 0 deletions pkg/monitor/monitorapi/types.go
@@ -199,6 +199,8 @@ const (
 type AnnotationKey string
 
 const (
+	AnnotationAlertState        AnnotationKey = "alertstate"
+	AnnotationSeverity          AnnotationKey = "severity"
 	AnnotationReason            AnnotationKey = "reason"
 	AnnotationContainerExitCode AnnotationKey = "code"
 	AnnotationCause             AnnotationKey = "cause"
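
The two new annotation keys let alert intervals carry the Prometheus alert state and severity as structured data rather than burying them in the message text. A minimal sketch of how a consumer might read them back, assuming the interval's structured message exposes an Annotations map keyed by AnnotationKey and that the import path follows the file layout (neither is shown in this diff):

    package example

    import (
    	monitorapi "github.com/openshift/origin/pkg/monitor/monitorapi"
    )

    // alertStateAndSeverity pulls the new annotations off an alert interval.
    // The Annotations map on StructuredMessage is an assumption, not confirmed by this diff.
    func alertStateAndSeverity(interval monitorapi.Interval) (state, severity string) {
    	annotations := interval.StructuredMessage.Annotations
    	return annotations[monitorapi.AnnotationAlertState], annotations[monitorapi.AnnotationSeverity]
    }
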
19 changes: 10 additions & 9 deletions pkg/monitor/serialization/serialize.go
@@ -37,7 +37,7 @@ type EventIntervalList struct {
 }
 
 func EventsToFile(filename string, events monitorapi.Intervals) error {
-	json, err := EventsToJSON(events)
+	json, err := IntervalsToJSON(events)
 	if err != nil {
 		return err
 	}
@@ -49,10 +49,10 @@ func EventsFromFile(filename string) (monitorapi.Intervals, error) {
 	if err != nil {
 		return nil, err
 	}
-	return EventsFromJSON(data)
+	return IntervalsFromJSON(data)
 }
 
-func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
+func IntervalsFromJSON(data []byte) (monitorapi.Intervals, error) {
 	var list EventIntervalList
 	if err := json.Unmarshal(data, &list); err != nil {
 		return nil, err
@@ -68,8 +68,8 @@ func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
 			Condition: monitorapi.Condition{
 				Level:   level,
 				Locator: interval.Locator,
-				Message: interval.Message,
 				StructuredLocator: interval.StructuredLocator,
+				Message:           interval.Message,
 				StructuredMessage: interval.StructuredMessage,
 			},
 
@@ -120,9 +120,9 @@ func IntervalToOneLineJSON(interval monitorapi.Interval) ([]byte, error) {
 	return buf.Bytes(), nil
 }
 
-func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
+func IntervalsToJSON(intervals monitorapi.Intervals) ([]byte, error) {
 	outputEvents := []EventInterval{}
-	for _, curr := range events {
+	for _, curr := range intervals {
 		outputEvents = append(outputEvents, monitorEventIntervalToEventInterval(curr))
 	}
 
@@ -131,14 +131,16 @@ func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
 	return json.MarshalIndent(list, "", " ")
 }
 
-func EventsIntervalsToFile(filename string, events monitorapi.Intervals) error {
-	json, err := EventsIntervalsToJSON(events)
+func IntervalsToFile(filename string, intervals monitorapi.Intervals) error {
+	json, err := EventsIntervalsToJSON(intervals)
 	if err != nil {
 		return err
 	}
 	return ioutil.WriteFile(filename, json, 0644)
 }
 
+// TODO: this is very similar but subtly different to the function above, what is the purpose of skipping those
+// with from/to equal or empty to?
 func EventsIntervalsToJSON(events monitorapi.Intervals) ([]byte, error) {
 	outputEvents := []EventInterval{}
 	for _, curr := range events {
@@ -165,7 +167,6 @@ func monitorEventIntervalToEventInterval(interval monitorapi.Interval) EventInterval {
 		From: metav1.Time{Time: interval.From},
 		To:   metav1.Time{Time: interval.To},
 	}
-
 	return ret
 }
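
The renames make the intent explicit: these helpers serialize monitorapi.Intervals rather than raw events. A small round-trip sketch against the renamed functions, assuming the monitorserialization import alias used elsewhere in this diff:

    // roundTrip re-parses serialized intervals; a sanity check, not part of this commit.
    func roundTrip(intervals monitorapi.Intervals) (monitorapi.Intervals, error) {
    	data, err := monitorserialization.IntervalsToJSON(intervals)
    	if err != nil {
    		return nil, err
    	}
    	return monitorserialization.IntervalsFromJSON(data)
    }
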
6 changes: 6 additions & 0 deletions pkg/monitortestlibrary/allowedalerts/basic_alert.go
@@ -31,6 +31,7 @@ type AlertTest interface {
 
 // AlertState is the state of the alert. They are logically ordered, so if a test says it limits on "pending", then
 // any state above pending (like info or warning) will cause the test to fail.
+// TODO this looks wrong, AlertState (pending|firing) and AlertLevel (info|warning|critical) are different things, but they seem lumped together here.
 type AlertState string
 
 const (
@@ -111,6 +112,11 @@ func (a *alertBuilder) neverFail() *alertBuilder {
 	return a
 }
 
+func (a *alertBuilder) alwaysFlake() *alertBuilder {
+	a.allowanceCalculator = alwaysFlake()
+	return a
+}
+
 func (a *alertBuilder) toTests() []AlertTest {
 	if !a.divideByNamespaces {
 		return []AlertTest{
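
The new builder method routes an alert through the alwaysFlake allowance defined in matches.go instead of the percentile-based calculator. A hedged sketch of how a test author might opt an alert into flake-only behavior; the newAlert constructor name and its arguments are assumptions, not shown in this diff:

    // Illustrative registration only: constructor name and signature are assumed.
    tests := newAlert("openshift-monitoring", "TelemeterClientFailures").
    	alwaysFlake(). // report a flake when the alert fires, never a hard failure
    	toTests()
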
19 changes: 18 additions & 1 deletion pkg/monitortestlibrary/allowedalerts/matches.go
@@ -52,5 +52,22 @@ func (d *percentileAllowances) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration {
 // getClosestPercentilesValues uses the backend and information about the cluster to choose the best historical p99 to operate against.
 // We enforce "don't get worse" for disruption by watching the aggregate data in CI over many runs.
 func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicaldata2.StatisticalDuration, string, error) {
-	return getCurrentResults().BestMatchDuration(key)
+	return GetHistoricalData().BestMatchDuration(key)
 }
+
+func alwaysFlake() AlertTestAllowanceCalculator {
+	return &alwaysFlakeAllowance{}
+}
+
+// alwaysFlakeAllowance is for alerts we want to flake a test if they occur at all.
+type alwaysFlakeAllowance struct {
+}
+
+func (d *alwaysFlakeAllowance) FailAfter(key historicaldata2.AlertDataKey) (time.Duration, error) {
+	// make it effectively impossible for a test failure here, we only want flakes
+	return 24 * time.Hour, nil
+}
+
+func (d *alwaysFlakeAllowance) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration {
+	return 1 * time.Second
+}
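
FailAfter and FlakeAfter are the two thresholds an alert test compares observed firing time against: exceed FlakeAfter and the test flakes, exceed FailAfter and it fails, so a 24h FailAfter paired with a 1s FlakeAfter yields flake-only behavior. A sketch of that decision logic written as if it lived in this package; the evaluate helper is illustrative and not part of this diff:

    // evaluate is an illustrative consumer of AlertTestAllowanceCalculator.
    func evaluate(calc AlertTestAllowanceCalculator, key historicaldata2.AlertDataKey, firingFor time.Duration) (string, error) {
    	failAfter, err := calc.FailAfter(key)
    	if err != nil {
    		return "", err
    	}
    	switch {
    	case firingFor > failAfter:
    		return "fail", nil
    	case firingFor > calc.FlakeAfter(key):
    		return "flake", nil
    	default:
    		return "pass", nil
    	}
    }
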
2 changes: 1 addition & 1 deletion pkg/monitortestlibrary/allowedalerts/matches_test.go
@@ -171,7 +171,7 @@ func TestGetClosestP99Value(t *testing.T) {
 // from bigquery and commit into origin. Test ensures we can parse it and the data looks sane.
 func TestAlertDataFileParsing(t *testing.T) {
 
-	alertMatcher := getCurrentResults()
+	alertMatcher := GetHistoricalData()
 
 	// The list of known alerts that goes into this file is composed of everything we've ever
 	// seen fire in that release. As such it can change from one release to the next as alerts
13 changes: 12 additions & 1 deletion pkg/monitortestlibrary/allowedalerts/types.go
@@ -22,7 +22,7 @@ var (
 	historicalData *historicaldata.AlertBestMatcher
 )
 
-func getCurrentResults() *historicaldata.AlertBestMatcher {
+func GetHistoricalData() *historicaldata.AlertBestMatcher {
 	readResults.Do(
 		func() {
 			var err error
@@ -34,3 +34,14 @@ func getCurrentResults() *historicaldata.AlertBestMatcher {
 
 	return historicalData
 }
+
+// AllowedAlertNames is a list of alerts we do not test against.
+var AllowedAlertNames = []string{
+	"Watchdog",
+	"AlertmanagerReceiversNotConfigured",
+	"PrometheusRemoteWriteDesiredShards",
+	"KubeJobFailed", // this is a result of bug https://bugzilla.redhat.com/show_bug.cgi?id=2054426 . We should catch these in the prometheus tests.
+
+	// indicates a problem in the external Telemeter service, presently very common, does not impact our ability to e2e test:
+	"TelemeterClientFailures",
+}
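
AllowedAlertNames acts as a simple skip list for alert names that should never produce a test result. A hedged sketch of the membership check a caller might do; only the slice itself is added by this diff, and the import path is inferred from the file layout:

    import "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"

    // isAllowedAlert reports whether an alert is on the do-not-test list.
    func isAllowedAlert(alertName string) bool {
    	for _, allowed := range allowedalerts.AllowedAlertNames {
    		if allowed == alertName {
    			return true
    		}
    	}
    	return false
    }
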
30 changes: 19 additions & 11 deletions pkg/monitortestlibrary/historicaldata/alert_types.go
@@ -12,11 +12,15 @@ import (
 )
 
 type AlertStatisticalData struct {
-	AlertDataKey `json:",inline"`
-	Name         string
-	P95          float64
-	P99          float64
-	JobRuns      int64
+	AlertDataKey  `json:",inline"`
+	Name          string
+	P50           float64
+	P75           float64
+	P95           float64
+	P99           float64
+	FirstObserved time.Time
+	LastObserved  time.Time
+	JobRuns       int64
 }
 
 type AlertDataKey struct {
@@ -80,11 +84,11 @@ func NewAlertMatcherWithHistoricalData(data map[AlertDataKey]AlertStatisticalData
 
 func (b *AlertBestMatcher) bestMatch(key AlertDataKey) (AlertStatisticalData, string, error) {
 	exactMatchKey := key
-	logrus.WithField("alertName", key.AlertName).Infof("searching for bestMatch for %+v", key.JobType)
-	logrus.Infof("historicalData has %d entries", len(b.HistoricalData))
+	logrus.WithField("alertName", key.AlertName).WithField("entries", len(b.HistoricalData)).
+		Debugf("searching for best match for %+v", key.JobType)
 
 	if percentiles, ok := b.HistoricalData[exactMatchKey]; ok {
-		if percentiles.JobRuns > minJobRuns {
+		if percentiles.JobRuns >= minJobRuns {
 			logrus.Infof("found exact match: %+v", percentiles)
 			return percentiles, "", nil
 		}
@@ -159,8 +163,12 @@ func (b *AlertBestMatcher) BestMatchP99(key AlertDataKey) (*time.Duration, string
 
 func toAlertStatisticalDuration(in AlertStatisticalData) StatisticalDuration {
 	return StatisticalDuration{
-		JobType: in.AlertDataKey.JobType,
-		P95:     DurationOrDie(in.P95),
-		P99:     DurationOrDie(in.P99),
+		JobType:       in.AlertDataKey.JobType,
+		P50:           DurationOrDie(in.P50),
+		P75:           DurationOrDie(in.P75),
+		P95:           DurationOrDie(in.P95),
+		P99:           DurationOrDie(in.P99),
+		FirstObserved: in.FirstObserved,
+		LastObserved:  in.LastObserved,
 	}
 }
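
Carrying P50/P75 and the first/last observed timestamps alongside P95/P99 gives callers more to work with than a single "fail past P99" rule. One illustrative policy, assuming the percentile fields on StatisticalDuration are time.Duration values and the observed fields are time.Time (types not shown in this diff):

    // thresholdsFor is an illustrative policy: flake past the historical P95,
    // fail only past the P99, and fall back to fixed allowances when the data
    // has not been observed recently enough to trust.
    func thresholdsFor(d historicaldata.StatisticalDuration, now time.Time) (flakeAfter, failAfter time.Duration) {
    	if now.Sub(d.LastObserved) > 30*24*time.Hour {
    		return 1 * time.Second, 1 * time.Hour // assumed stale-data fallback
    	}
    	return d.P95, d.P99
    }
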
