Add new test that will fail on new alerts firing
Attempts to be forgiving: scan the entire data file, regardless of NURP, looking to see whether we've seen this alert name before anywhere. If we have not, or we have but its first-observed time is less than two weeks ago, fail the test. This approach should help us survive the cutover to new releases.
dgoodwin committed Oct 19, 2023
1 parent 3ba4913 commit 5d28369
Showing 20 changed files with 328 additions and 82 deletions.
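
Before the file-by-file diff, a rough sketch of the decision the commit message describes. This is an illustration only, not the committed code: the helper, the map shapes, and the hard-coded two-week window are assumptions drawn from the description above.

```go
// Sketch of the "fail on new alerts firing" decision described in the commit
// message. All names here are hypothetical; the real test is added elsewhere
// in this commit.
package example

import (
	"fmt"
	"time"
)

type observedAlert struct {
	Name          string
	FirstObserved time.Time // earliest time this alert has ever been seen firing in CI
}

// checkFiringAlert fails (returns an error) when an alert that fired during the
// run is either absent from the historical data entirely, or was first observed
// less than two weeks ago -- regardless of which NURP (job variant) it was seen in.
func checkFiringAlert(firing string, historical map[string]observedAlert, allowed map[string]bool, now time.Time) error {
	if allowed[firing] {
		return nil // explicitly allowed alerts are never flagged
	}
	seen, ok := historical[firing]
	if !ok {
		return fmt.Errorf("alert %s has never been observed before: a new alert is firing", firing)
	}
	if now.Sub(seen.FirstObserved) < 14*24*time.Hour {
		return fmt.Errorf("alert %s was first observed %s, less than two weeks ago", firing, seen.FirstObserved)
	}
	return nil // known, long-standing alert; other alert tests judge how long it fired
}
```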
2 changes: 1 addition & 1 deletion pkg/cmd/openshift-tests/dev/dev.go
@@ -135,7 +135,7 @@ func readIntervalsFromFile(intervalsFile string) (monitorapi.Intervals, error) {
return nil, err
}

return monitorserialization.EventsFromJSON(jsonBytes)
return monitorserialization.IntervalsFromJSON(jsonBytes)
}

func newRunDisruptionInvariantsCommand() *cobra.Command {
@@ -57,7 +57,7 @@ func NewTimelineOptions(ioStreams genericclioptions.IOStreams) *TimelineOptions

IOStreams: ioStreams,
KnownRenderers: map[string]RenderFunc{
"json": monitorserialization.EventsToJSON,
"json": monitorserialization.IntervalsToJSON,
"html": renderHTML,
},
KnownTimelines: map[string]monitorapi.EventIntervalMatchesFunc{
2 changes: 1 addition & 1 deletion pkg/disruption/backend/sampler/remote.go
@@ -380,7 +380,7 @@ func fetchEventsFromFileOnNode(ctx context.Context, clientset *kubernetes.Client
if err != nil {
return filteredEvents, fmt.Errorf("failed to fetch file %s on node %s: %v", filePath, nodeName, err)
}
allEvents, err := monitorserialization.EventsFromJSON(allBytes)
allEvents, err := monitorserialization.IntervalsFromJSON(allBytes)
if err != nil {
return nil, fmt.Errorf("failed to convert file %s from node %s to intervals: %v", filePath, nodeName, err)
}
2 changes: 2 additions & 0 deletions pkg/monitor/monitorapi/types.go
@@ -193,6 +193,8 @@ const (
type AnnotationKey string

const (
AnnotationAlertState AnnotationKey = "alertstate"
AnnotationSeverity AnnotationKey = "severity"
AnnotationReason AnnotationKey = "reason"
AnnotationContainerExitCode AnnotationKey = "code"
AnnotationCause AnnotationKey = "cause"
35 changes: 21 additions & 14 deletions pkg/monitor/serialization/serialize.go
@@ -21,7 +21,8 @@ type EventInterval struct {

// TODO: Remove the omitempty, just here to keep from having to repeatedly updated the json
// files used in some new tests
Source string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive
Source string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive
Display bool `json:"display"`

// TODO: we're hoping to move these to just locator/message when everything is ready.
StructuredLocator monitorapi.Locator `json:"tempStructuredLocator"`
@@ -37,7 +38,7 @@ type EventIntervalList struct {
}

func EventsToFile(filename string, events monitorapi.Intervals) error {
json, err := EventsToJSON(events)
json, err := IntervalsToJSON(events)
if err != nil {
return err
}
@@ -49,10 +50,10 @@ func EventsFromFile(filename string) (monitorapi.Intervals, error) {
if err != nil {
return nil, err
}
return EventsFromJSON(data)
return IntervalsFromJSON(data)
}

func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
func IntervalsFromJSON(data []byte) (monitorapi.Intervals, error) {
var list EventIntervalList
if err := json.Unmarshal(data, &list); err != nil {
return nil, err
@@ -64,11 +65,14 @@ func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
return nil, err
}
events = append(events, monitorapi.Interval{
Source: monitorapi.IntervalSource(interval.Source),
Source: monitorapi.IntervalSource(interval.Source),
Display: interval.Display,
Condition: monitorapi.Condition{
Level: level,
Locator: interval.Locator,
Message: interval.Message,
Level: level,
Locator: interval.Locator,
StructuredLocator: interval.StructuredLocator,
Message: interval.Message,
StructuredMessage: interval.StructuredMessage,
},

From: interval.From.Time,
@@ -89,7 +93,8 @@ func IntervalFromJSON(data []byte) (*monitorapi.Interval, error) {
return nil, err
}
return &monitorapi.Interval{
Source: monitorapi.IntervalSource(serializedInterval.Source),
Source: monitorapi.IntervalSource(serializedInterval.Source),
Display: serializedInterval.Display,
Condition: monitorapi.Condition{
Level: level,
Locator: serializedInterval.Locator,
@@ -118,9 +123,9 @@ func IntervalToOneLineJSON(interval monitorapi.Interval) ([]byte, error) {
return buf.Bytes(), nil
}

func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
func IntervalsToJSON(intervals monitorapi.Intervals) ([]byte, error) {
outputEvents := []EventInterval{}
for _, curr := range events {
for _, curr := range intervals {
outputEvents = append(outputEvents, monitorEventIntervalToEventInterval(curr))
}

@@ -129,14 +134,16 @@ func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
return json.MarshalIndent(list, "", " ")
}

func EventsIntervalsToFile(filename string, events monitorapi.Intervals) error {
json, err := EventsIntervalsToJSON(events)
func IntervalsToFile(filename string, intervals monitorapi.Intervals) error {
json, err := EventsIntervalsToJSON(intervals)
if err != nil {
return err
}
return ioutil.WriteFile(filename, json, 0644)
}

// TODO: this is very similar but subtly different to the function above, what is the purpose of skipping those
// with from/to equal or empty to?
func EventsIntervalsToJSON(events monitorapi.Intervals) ([]byte, error) {
outputEvents := []EventInterval{}
for _, curr := range events {
@@ -159,11 +166,11 @@ func monitorEventIntervalToEventInterval(interval monitorapi.Interval) EventInte
StructuredLocator: interval.StructuredLocator,
StructuredMessage: interval.StructuredMessage,
Source: string(interval.Source),
Display: interval.Display,

From: metav1.Time{Time: interval.From},
To: metav1.Time{Time: interval.To},
}

return ret
}

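
The serializer renames (EventsToJSON/EventsFromJSON become IntervalsToJSON/IntervalsFromJSON) also carry the new Display flag and the structured locator/message through deserialization. A minimal round-trip sketch, assuming only the signatures shown in this diff and the usual github.com/openshift/origin import paths:

```go
// Minimal round-trip sketch for the renamed serializers shown above.
package example

import (
	"fmt"

	"github.com/openshift/origin/pkg/monitor/monitorapi"
	monitorserialization "github.com/openshift/origin/pkg/monitor/serialization"
)

func roundTrip(intervals monitorapi.Intervals) (monitorapi.Intervals, error) {
	data, err := monitorserialization.IntervalsToJSON(intervals)
	if err != nil {
		return nil, fmt.Errorf("serialize: %w", err)
	}
	// Display, tempStructuredLocator and tempStructuredMessage now survive the
	// round trip; the old EventsFromJSON did not copy them back into the interval.
	return monitorserialization.IntervalsFromJSON(data)
}
```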
1 change: 1 addition & 0 deletions pkg/monitortestlibrary/allowedalerts/basic_alert.go
@@ -31,6 +31,7 @@ type AlertTest interface {

// AlertState is the state of the alert. They are logically ordered, so if a test says it limits on "pending", then
// any state above pending (like info or warning) will cause the test to fail.
// TODO this looks wrong, AlertState (pending|firing) and AlertLevel (info|warning|critical) are different things, but they seem lumped together here.
type AlertState string

const (
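
The TODO above points at two dimensions that are easy to conflate. In Prometheus, alert state (pending or firing) is reported through the alertstate label on the ALERTS metric, while severity (info, warning, critical) is a label chosen by the alert's author. A small illustration, not code from this commit:

```go
// Illustration of the two dimensions the TODO above distinguishes, e.g.
// ALERTS{alertname="KubePodNotReady", alertstate="firing", severity="warning"}.
package example

// Alert state: where the alert is in its lifecycle.
const (
	statePending = "pending" // the alert expression is true, but its `for:` duration has not yet elapsed
	stateFiring  = "firing"  // the expression has been true for at least the `for:` duration
)

// Severity: set by the alert's author, independent of state.
const (
	severityInfo     = "info"
	severityWarning  = "warning"
	severityCritical = "critical"
)
```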
2 changes: 1 addition & 1 deletion pkg/monitortestlibrary/allowedalerts/matches.go
@@ -52,7 +52,7 @@ func (d *percentileAllowances) FlakeAfter(key historicaldata2.AlertDataKey) time
// getClosestPercentilesValues uses the backend and information about the cluster to choose the best historical p99 to operate against.
// We enforce "don't get worse" for disruption by watching the aggregate data in CI over many runs.
func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicaldata2.StatisticalDuration, string, error) {
return getCurrentResults().BestMatchDuration(key)
return GetHistoricalData().BestMatchDuration(key)
}

func alwaysFlake() AlertTestAllowanceCalculator {
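
The comment above summarizes the existing contract: an alert's accumulated firing time in a run is held against thresholds derived from the historical percentile data. A rough sketch of that comparison; FlakeAfter is visible in this diff, a matching FailAfter is assumed, and the result strings are illustrative only.

```go
// Rough sketch of the "don't get worse" comparison. The thresholds would come
// from an AlertTestAllowanceCalculator (FlakeAfter appears in this diff; a
// matching FailAfter is assumed), which in turn is fed by BestMatchDuration.
package example

import "time"

func evaluateAlertDuration(firedFor, failAfter, flakeAfter time.Duration) string {
	switch {
	case firedFor > failAfter:
		return "fail" // the alert fired longer than history says it should
	case firedFor > flakeAfter:
		return "flake" // worse than typical, but not a hard failure
	default:
		return "pass"
	}
}
```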
2 changes: 1 addition & 1 deletion pkg/monitortestlibrary/allowedalerts/matches_test.go
@@ -171,7 +171,7 @@ func TestGetClosestP99Value(t *testing.T) {
// from bigquery and commit into origin. Test ensures we can parse it and the data looks sane.
func TestAlertDataFileParsing(t *testing.T) {

alertMatcher := getCurrentResults()
alertMatcher := GetHistoricalData()

// The list of known alerts that goes into this file is composed of everything we've ever
// seen fire in that release. As such it can change from one release to the next as alerts
13 changes: 12 additions & 1 deletion pkg/monitortestlibrary/allowedalerts/types.go
@@ -22,7 +22,7 @@ var (
historicalData *historicaldata.AlertBestMatcher
)

func getCurrentResults() *historicaldata.AlertBestMatcher {
func GetHistoricalData() *historicaldata.AlertBestMatcher {
readResults.Do(
func() {
var err error
@@ -34,3 +34,14 @@ func getCurrentResults() *historicaldata.AlertBestMatcher {

return historicalData
}

// AllowedAlertNames is a list of alerts we do not test against.
var AllowedAlertNames = []string{
"Watchdog",
"AlertmanagerReceiversNotConfigured",
"PrometheusRemoteWriteDesiredShards",
"KubeJobFailed", // this is a result of bug https://bugzilla.redhat.com/show_bug.cgi?id=2054426 . We should catch these in the prometheus tests.

// indicates a problem in the external Telemeter service, presently very common, does not impact our ability to e2e test:
"TelemeterClientFailures",
}
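
getCurrentResults is renamed and exported as GetHistoricalData so that code outside this package (such as the new monitor test) can reach the full historical alert data set, and AllowedAlertNames gives that code a shared skip list. Below is a sketch of how a caller might combine the two to answer "have we ever seen this alert, and since when": it relies only on the exported names visible in this diff, while the helper itself and the module import paths are assumptions.

```go
// Sketch: collect every alert name ever observed, across all NURPs, with the
// earliest FirstObserved time, skipping the AllowedAlertNames entries. Uses only
// GetHistoricalData(), the exported HistoricalData map referenced in bestMatch(),
// and the Name/FirstObserved fields added in this commit.
package example

import (
	"time"

	"github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
	"k8s.io/apimachinery/pkg/util/sets"
)

func earliestObservedAlerts() map[string]time.Time {
	skip := sets.NewString(allowedalerts.AllowedAlertNames...)
	earliest := map[string]time.Time{}
	for _, stats := range allowedalerts.GetHistoricalData().HistoricalData {
		if skip.Has(stats.Name) {
			continue
		}
		if cur, ok := earliest[stats.Name]; !ok || stats.FirstObserved.Before(cur) {
			earliest[stats.Name] = stats.FirstObserved
		}
	}
	return earliest
}
```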
26 changes: 17 additions & 9 deletions pkg/monitortestlibrary/historicaldata/alert_types.go
@@ -12,11 +12,15 @@ import (
)

type AlertStatisticalData struct {
AlertDataKey `json:",inline"`
Name string
P95 float64
P99 float64
JobRuns int64
AlertDataKey `json:",inline"`
Name string
P50 float64
P75 float64
P95 float64
P99 float64
FirstObserved time.Time
LastObserved time.Time
JobRuns int64
}

type AlertDataKey struct {
@@ -84,7 +88,7 @@ func (b *AlertBestMatcher) bestMatch(key AlertDataKey) (AlertStatisticalData, st
Debugf("searching for best match for %+v", key.JobType)

if percentiles, ok := b.HistoricalData[exactMatchKey]; ok {
if percentiles.JobRuns > minJobRuns {
if percentiles.JobRuns >= minJobRuns {
logrus.Infof("found exact match: %+v", percentiles)
return percentiles, "", nil
}
@@ -159,8 +163,12 @@ func (b *AlertBestMatcher) BestMatchP99(key AlertDataKey) (*time.Duration, strin

func toAlertStatisticalDuration(in AlertStatisticalData) StatisticalDuration {
return StatisticalDuration{
JobType: in.AlertDataKey.JobType,
P95: DurationOrDie(in.P95),
P99: DurationOrDie(in.P99),
JobType: in.AlertDataKey.JobType,
P50: DurationOrDie(in.P50),
P75: DurationOrDie(in.P75),
P95: DurationOrDie(in.P95),
P99: DurationOrDie(in.P99),
FirstObserved: in.FirstObserved,
LastObserved: in.LastObserved,
}
}
26 changes: 19 additions & 7 deletions pkg/monitortestlibrary/historicaldata/disruption_types.go
@@ -20,15 +20,23 @@ const minJobRuns = 100

type StatisticalDuration struct {
platformidentification.JobType `json:",inline"`
P50 time.Duration
P75 time.Duration
P95 time.Duration
P99 time.Duration
FirstObserved time.Time
LastObserved time.Time
}

type DisruptionStatisticalData struct {
DataKey `json:",inline"`
P95 float64
P99 float64
JobRuns int64
DataKey `json:",inline"`
P50 float64
P75 float64
P95 float64
P99 float64
FirstObserved time.Time
LastObserved time.Time
JobRuns int64
}

type DataKey struct {
@@ -172,9 +180,13 @@ func (b *DisruptionBestMatcher) BestMatchP99(name string, jobType platformidenti

func toStatisticalDuration(in DisruptionStatisticalData) StatisticalDuration {
return StatisticalDuration{
JobType: in.DataKey.JobType,
P95: DurationOrDie(in.P95),
P99: DurationOrDie(in.P99),
JobType: in.DataKey.JobType,
P50: DurationOrDie(in.P50),
P75: DurationOrDie(in.P75),
P95: DurationOrDie(in.P95),
P99: DurationOrDie(in.P99),
FirstObserved: in.FirstObserved,
LastObserved: in.LastObserved,
}
}

4 changes: 2 additions & 2 deletions pkg/monitortests/node/nodestateanalyzer/node_test.go
@@ -75,7 +75,7 @@ type nodeIntervalTest struct {
}

func (p nodeIntervalTest) test(t *testing.T) {
inputIntervals, err := monitorserialization.EventsFromJSON(p.events)
inputIntervals, err := monitorserialization.IntervalsFromJSON(p.events)
if err != nil {
t.Fatal(err)
}
@@ -89,7 +89,7 @@ func (p nodeIntervalTest) test(t *testing.T) {
}
result := intervalsFromEvents_NodeChanges(inputIntervals, nil, startTime, endTime)

resultBytes, err := monitorserialization.EventsToJSON(result)
resultBytes, err := monitorserialization.IntervalsToJSON(result)
if err != nil {
t.Fatal(err)
}
4 changes: 2 additions & 2 deletions pkg/monitortests/node/watchpods/compute_intervals_test.go
@@ -84,7 +84,7 @@ func (p podIntervalTest) test(t *testing.T) {
resourceMap["pods"] = podMap
}

inputIntervals, err := monitorserialization.EventsFromJSON(p.events)
inputIntervals, err := monitorserialization.IntervalsFromJSON(p.events)
if err != nil {
t.Fatal(err)
}
@@ -98,7 +98,7 @@ }
}
result := createPodIntervalsFromInstants(inputIntervals, resourceMap, startTime, endTime)

resultBytes, err := monitorserialization.EventsToJSON(result)
resultBytes, err := monitorserialization.IntervalsToJSON(result)
if err != nil {
t.Fatal(err)
}
11 changes: 9 additions & 2 deletions pkg/monitortests/testframework/alertanalyzer/alerts.go
@@ -312,15 +312,22 @@ func createEventIntervalsForAlerts(ctx context.Context, alerts prometheustypes.V
level = monitorapi.Warning
case alert.Metric["severity"] == "critical":
level = monitorapi.Error
case alert.Metric["severity"] == "info":
case alert.Metric["severity"] == "info": // this case may not exist
level = monitorapi.Info
default:
level = monitorapi.Error
}
msg := monitorapi.NewMessage().HumanMessage(alert.Metric.String())
if len(string(alert.Metric["alertstate"])) > 0 {
msg = msg.WithAnnotation(monitorapi.AnnotationAlertState, string(alert.Metric["alertstate"]))
}
if len(string(alert.Metric["severity"])) > 0 {
msg = msg.WithAnnotation(monitorapi.AnnotationSeverity, string(alert.Metric["severity"]))
}
alertIntervalTemplate :=
monitorapi.NewInterval(monitorapi.SourceAlert, level).
Locator(lb).
Message(monitorapi.NewMessage().HumanMessage(alert.Metric.String()))
Message(msg)

var alertStartTime *time.Time
var lastTime *time.Time
Expand Down
