Add new test that will fail on new alerts firing
Attempts to be forgiving: scan the entire data file, regardless of NURP, looking to see whether we've seen this alert name before anywhere. If we have not, or we have but its first-observed time is less than two weeks ago, fail the test. This approach should help us survive the cutover to new releases.
dgoodwin committed Oct 19, 2023
1 parent 3ba4913 commit 5d28369
Showing 20 changed files with 328 additions and 82 deletions.
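
Before the file-by-file diff, a rough sketch of the decision the commit message describes. This is an illustration only, not the committed code: the helper, the map shapes, and the hard-coded two-week window are assumptions drawn from the description above.

```go
// Sketch of the "fail on new alerts firing" decision described in the commit
// message. All names here are hypothetical; the real test is added elsewhere
// in this commit.
package example

import (
	"fmt"
	"time"
)

type observedAlert struct {
	Name          string
	FirstObserved time.Time // earliest time this alert has ever been seen firing in CI
}

// checkFiringAlert fails (returns an error) when an alert that fired during the
// run is either absent from the historical data entirely, or was first observed
// less than two weeks ago -- regardless of which NURP (job variant) it was seen in.
func checkFiringAlert(firing string, historical map[string]observedAlert, allowed map[string]bool, now time.Time) error {
	if allowed[firing] {
		return nil // explicitly allowed alerts are never flagged
	}
	seen, ok := historical[firing]
	if !ok {
		return fmt.Errorf("alert %s has never been observed before: a new alert is firing", firing)
	}
	if now.Sub(seen.FirstObserved) < 14*24*time.Hour {
		return fmt.Errorf("alert %s was first observed %s, less than two weeks ago", firing, seen.FirstObserved)
	}
	return nil // known, long-standing alert; other alert tests judge how long it fired
}
```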
2 changes: 1 addition & 1 deletion pkg/cmd/openshift-tests/dev/dev.go
@@ -135,7 +135,7 @@ func readIntervalsFromFile(intervalsFile string) (monitorapi.Intervals, error) {
return nil, err
}

return monitorserialization.EventsFromJSON(jsonBytes)
return monitorserialization.IntervalsFromJSON(jsonBytes)
}

func newRunDisruptionInvariantsCommand() *cobra.Command {
@@ -57,7 +57,7 @@ func NewTimelineOptions(ioStreams genericclioptions.IOStreams) *TimelineOptions

IOStreams: ioStreams,
KnownRenderers: map[string]RenderFunc{
"json": monitorserialization.EventsToJSON,
"json": monitorserialization.IntervalsToJSON,
"html": renderHTML,
},
KnownTimelines: map[string]monitorapi.EventIntervalMatchesFunc{
2 changes: 1 addition & 1 deletion pkg/disruption/backend/sampler/remote.go
@@ -380,7 +380,7 @@ func fetchEventsFromFileOnNode(ctx context.Context, clientset *kubernetes.Client
if err != nil {
return filteredEvents, fmt.Errorf("failed to fetch file %s on node %s: %v", filePath, nodeName, err)
}
allEvents, err := monitorserialization.EventsFromJSON(allBytes)
allEvents, err := monitorserialization.IntervalsFromJSON(allBytes)
if err != nil {
return nil, fmt.Errorf("failed to convert file %s from node %s to intervals: %v", filePath, nodeName, err)
}
2 changes: 2 additions & 0 deletions pkg/monitor/monitorapi/types.go
@@ -193,6 +193,8 @@ const (
type AnnotationKey string

const (
AnnotationAlertState AnnotationKey = "alertstate"
AnnotationSeverity AnnotationKey = "severity"
AnnotationReason AnnotationKey = "reason"
AnnotationContainerExitCode AnnotationKey = "code"
AnnotationCause AnnotationKey = "cause"
35 changes: 21 additions & 14 deletions pkg/monitor/serialization/serialize.go
@@ -21,7 +21,8 @@ type EventInterval struct {

// TODO: Remove the omitempty, just here to keep from having to repeatedly updated the json
// files used in some new tests
Source string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive
Source string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive
Display bool `json:"display"`

// TODO: we're hoping to move these to just locator/message when everything is ready.
StructuredLocator monitorapi.Locator `json:"tempStructuredLocator"`
@@ -37,7 +38,7 @@ type EventIntervalList struct {
}

func EventsToFile(filename string, events monitorapi.Intervals) error {
json, err := EventsToJSON(events)
json, err := IntervalsToJSON(events)
if err != nil {
return err
}
@@ -49,10 +50,10 @@ func EventsFromFile(filename string) (monitorapi.Intervals, error) {
if err != nil {
return nil, err
}
return EventsFromJSON(data)
return IntervalsFromJSON(data)
}

func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
func IntervalsFromJSON(data []byte) (monitorapi.Intervals, error) {
var list EventIntervalList
if err := json.Unmarshal(data, &list); err != nil {
return nil, err
@@ -64,11 +65,14 @@ func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
return nil, err
}
events = append(events, monitorapi.Interval{
Source: monitorapi.IntervalSource(interval.Source),
Source: monitorapi.IntervalSource(interval.Source),
Display: interval.Display,
Condition: monitorapi.Condition{
Level: level,
Locator: interval.Locator,
Message: interval.Message,
Level: level,
Locator: interval.Locator,
StructuredLocator: interval.StructuredLocator,
Message: interval.Message,
StructuredMessage: interval.StructuredMessage,
},

From: interval.From.Time,
@@ -89,7 +93,8 @@ func IntervalFromJSON(data []byte) (*monitorapi.Interval, error) {
return nil, err
}
return &monitorapi.Interval{
Source: monitorapi.IntervalSource(serializedInterval.Source),
Source: monitorapi.IntervalSource(serializedInterval.Source),
Display: serializedInterval.Display,
Condition: monitorapi.Condition{
Level: level,
Locator: serializedInterval.Locator,
@@ -118,9 +123,9 @@ func IntervalToOneLineJSON(interval monitorapi.Interval) ([]byte, error) {
return buf.Bytes(), nil
}

func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
func IntervalsToJSON(intervals monitorapi.Intervals) ([]byte, error) {
outputEvents := []EventInterval{}
for _, curr := range events {
for _, curr := range intervals {
outputEvents = append(outputEvents, monitorEventIntervalToEventInterval(curr))
}

@@ -129,14 +134,16 @@ func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
return json.MarshalIndent(list, "", " ")
}

func EventsIntervalsToFile(filename string, events monitorapi.Intervals) error {
json, err := EventsIntervalsToJSON(events)
func IntervalsToFile(filename string, intervals monitorapi.Intervals) error {
json, err := EventsIntervalsToJSON(intervals)
if err != nil {
return err
}
return ioutil.WriteFile(filename, json, 0644)
}

// TODO: this is very similar but subtly different to the function above, what is the purpose of skipping those
// with from/to equal or empty to?
func EventsIntervalsToJSON(events monitorapi.Intervals) ([]byte, error) {
outputEvents := []EventInterval{}
for _, curr := range events {
@@ -159,11 +166,11 @@ func monitorEventIntervalToEventInterval(interval monitorapi.Interval) EventInte
StructuredLocator: interval.StructuredLocator,
StructuredMessage: interval.StructuredMessage,
Source: string(interval.Source),
Display: interval.Display,

From: metav1.Time{Time: interval.From},
To: metav1.Time{Time: interval.To},
}

return ret
}

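
The serializer renames (EventsToJSON/EventsFromJSON become IntervalsToJSON/IntervalsFromJSON) also carry the new Display flag and the structured locator/message through deserialization. A minimal round-trip sketch, assuming only the signatures shown in this diff and the usual github.com/openshift/origin import paths:

```go
// Minimal round-trip sketch for the renamed serializers shown above.
package example

import (
	"fmt"

	"github.com/openshift/origin/pkg/monitor/monitorapi"
	monitorserialization "github.com/openshift/origin/pkg/monitor/serialization"
)

func roundTrip(intervals monitorapi.Intervals) (monitorapi.Intervals, error) {
	data, err := monitorserialization.IntervalsToJSON(intervals)
	if err != nil {
		return nil, fmt.Errorf("serialize: %w", err)
	}
	// Display, tempStructuredLocator and tempStructuredMessage now survive the
	// round trip; the old EventsFromJSON did not copy them back into the interval.
	return monitorserialization.IntervalsFromJSON(data)
}
```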
1 change: 1 addition & 0 deletions pkg/monitortestlibrary/allowedalerts/basic_alert.go
@@ -31,6 +31,7 @@ type AlertTest interface {

// AlertState is the state of the alert. They are logically ordered, so if a test says it limits on "pending", then
// any state above pending (like info or warning) will cause the test to fail.
// TODO this looks wrong, AlertState (pending|firing) and AlertLevel (info|warning|critical) are different things, but they seem lumped together here.
type AlertState string

const (
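
The TODO above points at two dimensions that are easy to conflate. In Prometheus, alert state (pending or firing) is reported through the alertstate label on the ALERTS metric, while severity (info, warning, critical) is a label chosen by the alert's author. A small illustration, not code from this commit:

```go
// Illustration of the two dimensions the TODO above distinguishes, e.g.
// ALERTS{alertname="KubePodNotReady", alertstate="firing", severity="warning"}.
package example

// Alert state: where the alert is in its lifecycle.
const (
	statePending = "pending" // the alert expression is true, but its `for:` duration has not yet elapsed
	stateFiring  = "firing"  // the expression has been true for at least the `for:` duration
)

// Severity: set by the alert's author, independent of state.
const (
	severityInfo     = "info"
	severityWarning  = "warning"
	severityCritical = "critical"
)
```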
2 changes: 1 addition & 1 deletion pkg/monitortestlibrary/allowedalerts/matches.go
@@ -52,7 +52,7 @@ func (d *percentileAllowances) FlakeAfter(key historicaldata2.AlertDataKey) time
// getClosestPercentilesValues uses the backend and information about the cluster to choose the best historical p99 to operate against.
// We enforce "don't get worse" for disruption by watching the aggregate data in CI over many runs.
func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicaldata2.StatisticalDuration, string, error) {
return getCurrentResults().BestMatchDuration(key)
return GetHistoricalData().BestMatchDuration(key)
}

func alwaysFlake() AlertTestAllowanceCalculator {
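
The comment above summarizes the existing contract: an alert's accumulated firing time in a run is held against thresholds derived from the historical percentile data. A rough sketch of that comparison; FlakeAfter is visible in this diff, a matching FailAfter is assumed, and the result strings are illustrative only.

```go
// Rough sketch of the "don't get worse" comparison. The thresholds would come
// from an AlertTestAllowanceCalculator (FlakeAfter appears in this diff; a
// matching FailAfter is assumed), which in turn is fed by BestMatchDuration.
package example

import "time"

func evaluateAlertDuration(firedFor, failAfter, flakeAfter time.Duration) string {
	switch {
	case firedFor > failAfter:
		return "fail" // the alert fired longer than history says it should
	case firedFor > flakeAfter:
		return "flake" // worse than typical, but not a hard failure
	default:
		return "pass"
	}
}
```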
2 changes: 1 addition & 1 deletion pkg/monitortestlibrary/allowedalerts/matches_test.go
@@ -171,7 +171,7 @@ func TestGetClosestP99Value(t *testing.T) {
// from bigquery and commit into origin. Test ensures we can parse it and the data looks sane.
func TestAlertDataFileParsing(t *testing.T) {

alertMatcher := getCurrentResults()
alertMatcher := GetHistoricalData()

// The list of known alerts that goes into this file is composed of everything we've ever
// seen fire in that release. As such it can change from one release to the next as alerts
13 changes: 12 additions & 1 deletion pkg/monitortestlibrary/allowedalerts/types.go
@@ -22,7 +22,7 @@ var (
historicalData *historicaldata.AlertBestMatcher
)

func getCurrentResults() *historicaldata.AlertBestMatcher {
func GetHistoricalData() *historicaldata.AlertBestMatcher {
readResults.Do(
func() {
var err error
@@ -34,3 +34,14 @@ func getCurrentResults() *historicaldata.AlertBestMatcher {

return historicalData
}

// AllowedAlertNames is a list of alerts we do not test against.
var AllowedAlertNames = []string{
"Watchdog",
"AlertmanagerReceiversNotConfigured",
"PrometheusRemoteWriteDesiredShards",
"KubeJobFailed", // this is a result of bug https://bugzilla.redhat.com/show_bug.cgi?id=2054426 . We should catch these in the prometheus tests.

// indicates a problem in the external Telemeter service, presently very common, does not impact our ability to e2e test:
"TelemeterClientFailures",
}
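
getCurrentResults is renamed and exported as GetHistoricalData so that code outside this package (such as the new monitor test) can reach the full historical alert data set, and AllowedAlertNames gives that code a shared skip list. Below is a sketch of how a caller might combine the two to answer "have we ever seen this alert, and since when": it relies only on the exported names visible in this diff, while the helper itself and the module import paths are assumptions.

```go
// Sketch: collect every alert name ever observed, across all NURPs, with the
// earliest FirstObserved time, skipping the AllowedAlertNames entries. Uses only
// GetHistoricalData(), the exported HistoricalData map referenced in bestMatch(),
// and the Name/FirstObserved fields added in this commit.
package example

import (
	"time"

	"github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
	"k8s.io/apimachinery/pkg/util/sets"
)

func earliestObservedAlerts() map[string]time.Time {
	skip := sets.NewString(allowedalerts.AllowedAlertNames...)
	earliest := map[string]time.Time{}
	for _, stats := range allowedalerts.GetHistoricalData().HistoricalData {
		if skip.Has(stats.Name) {
			continue
		}
		if cur, ok := earliest[stats.Name]; !ok || stats.FirstObserved.Before(cur) {
			earliest[stats.Name] = stats.FirstObserved
		}
	}
	return earliest
}
```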
26 changes: 17 additions & 9 deletions pkg/monitortestlibrary/historicaldata/alert_types.go
@@ -12,11 +12,15 @@ import (
)

type AlertStatisticalData struct {
AlertDataKey `json:",inline"`
Name string
P95 float64
P99 float64
JobRuns int64
AlertDataKey `json:",inline"`
Name string
P50 float64
P75 float64
P95 float64
P99 float64
FirstObserved time.Time
LastObserved time.Time
JobRuns int64
}

type AlertDataKey struct {
@@ -84,7 +88,7 @@ func (b *AlertBestMatcher) bestMatch(key AlertDataKey) (AlertStatisticalData, st
Debugf("searching for best match for %+v", key.JobType)

if percentiles, ok := b.HistoricalData[exactMatchKey]; ok {
if percentiles.JobRuns > minJobRuns {
if percentiles.JobRuns >= minJobRuns {
logrus.Infof("found exact match: %+v", percentiles)
return percentiles, "", nil
}
@@ -159,8 +163,12 @@ func (b *AlertBestMatcher) BestMatchP99(key AlertDataKey) (*time.Duration, strin

func toAlertStatisticalDuration(in AlertStatisticalData) StatisticalDuration {
return StatisticalDuration{
JobType: in.AlertDataKey.JobType,
P95: DurationOrDie(in.P95),
P99: DurationOrDie(in.P99),
JobType: in.AlertDataKey.JobType,
P50: DurationOrDie(in.P50),
P75: DurationOrDie(in.P75),
P95: DurationOrDie(in.P95),
P99: DurationOrDie(in.P99),
FirstObserved: in.FirstObserved,
LastObserved: in.LastObserved,
}
}
26 changes: 19 additions & 7 deletions pkg/monitortestlibrary/historicaldata/disruption_types.go
@@ -20,15 +20,23 @@ const minJobRuns = 100

type StatisticalDuration struct {
platformidentification.JobType `json:",inline"`
P50 time.Duration
P75 time.Duration
P95 time.Duration
P99 time.Duration
FirstObserved time.Time
LastObserved time.Time
}

type DisruptionStatisticalData struct {
DataKey `json:",inline"`
P95 float64
P99 float64
JobRuns int64
DataKey `json:",inline"`
P50 float64
P75 float64
P95 float64
P99 float64
FirstObserved time.Time
LastObserved time.Time
JobRuns int64
}

type DataKey struct {
@@ -172,9 +180,13 @@ func (b *DisruptionBestMatcher) BestMatchP99(name string, jobType platformidenti

func toStatisticalDuration(in DisruptionStatisticalData) StatisticalDuration {
return StatisticalDuration{
JobType: in.DataKey.JobType,
P95: DurationOrDie(in.P95),
P99: DurationOrDie(in.P99),
JobType: in.DataKey.JobType,
P50: DurationOrDie(in.P50),
P75: DurationOrDie(in.P75),
P95: DurationOrDie(in.P95),
P99: DurationOrDie(in.P99),
FirstObserved: in.FirstObserved,
LastObserved: in.LastObserved,
}
}

4 changes: 2 additions & 2 deletions pkg/monitortests/node/nodestateanalyzer/node_test.go
@@ -75,7 +75,7 @@ type nodeIntervalTest struct {
}

func (p nodeIntervalTest) test(t *testing.T) {
inputIntervals, err := monitorserialization.EventsFromJSON(p.events)
inputIntervals, err := monitorserialization.IntervalsFromJSON(p.events)
if err != nil {
t.Fatal(err)
}
@@ -89,7 +89,7 @@ func (p nodeIntervalTest) test(t *testing.T) {
}
result := intervalsFromEvents_NodeChanges(inputIntervals, nil, startTime, endTime)

resultBytes, err := monitorserialization.EventsToJSON(result)
resultBytes, err := monitorserialization.IntervalsToJSON(result)
if err != nil {
t.Fatal(err)
}
4 changes: 2 additions & 2 deletions pkg/monitortests/node/watchpods/compute_intervals_test.go
@@ -84,7 +84,7 @@ func (p podIntervalTest) test(t *testing.T) {
resourceMap["pods"] = podMap
}

inputIntervals, err := monitorserialization.EventsFromJSON(p.events)
inputIntervals, err := monitorserialization.IntervalsFromJSON(p.events)
if err != nil {
t.Fatal(err)
}
@@ -98,7 +98,7 @@ }
}
result := createPodIntervalsFromInstants(inputIntervals, resourceMap, startTime, endTime)

resultBytes, err := monitorserialization.EventsToJSON(result)
resultBytes, err := monitorserialization.IntervalsToJSON(result)
if err != nil {
t.Fatal(err)
}
11 changes: 9 additions & 2 deletions pkg/monitortests/testframework/alertanalyzer/alerts.go
@@ -312,15 +312,22 @@ func createEventIntervalsForAlerts(ctx context.Context, alerts prometheustypes.V
level = monitorapi.Warning
case alert.Metric["severity"] == "critical":
level = monitorapi.Error
case alert.Metric["severity"] == "info":
case alert.Metric["severity"] == "info": // this case may not exist
level = monitorapi.Info
default:
level = monitorapi.Error
}
msg := monitorapi.NewMessage().HumanMessage(alert.Metric.String())
if len(string(alert.Metric["alertstate"])) > 0 {
msg = msg.WithAnnotation(monitorapi.AnnotationAlertState, string(alert.Metric["alertstate"]))
}
if len(string(alert.Metric["severity"])) > 0 {
msg = msg.WithAnnotation(monitorapi.AnnotationSeverity, string(alert.Metric["severity"]))
}
alertIntervalTemplate :=
monitorapi.NewInterval(monitorapi.SourceAlert, level).
Locator(lb).
Message(monitorapi.NewMessage().HumanMessage(alert.Metric.String()))
Message(msg)

var alertStartTime *time.Time
var lastTime *time.Time
Expand Down
