From d90ca1084da128201bf3070d317e08264638b258 Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Tue, 17 Oct 2023 10:59:05 -0300 Subject: [PATCH 1/8] Support pinning alerts that we want to fire on any occurrence New initiative to allow pinning alerts that are simply not allowed to fire, regardless of historical data. --- pkg/alerts/check.go | 18 ++++++------------ pkg/monitortestlibrary/allowedalerts/all.go | 9 +++++++++ .../allowedalerts/basic_alert.go | 5 +++++ .../allowedalerts/matches.go | 18 ++++++++++++++++++ test/e2e/upgrade/alert/alert.go | 3 +-- test/extended/prometheus/prometheus.go | 2 +- 6 files changed, 40 insertions(+), 15 deletions(-) diff --git a/pkg/alerts/check.go b/pkg/alerts/check.go index 554b1782c333..ef8d0fb03bbc 100644 --- a/pkg/alerts/check.go +++ b/pkg/alerts/check.go @@ -6,7 +6,7 @@ import ( "strings" "time" - allowedalerts2 "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts" + allowedalerts "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts" "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification" o "github.com/onsi/gomega" @@ -18,7 +18,6 @@ import ( prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/client-go/rest" "k8s.io/kubernetes/test/e2e/framework" ) @@ -26,12 +25,7 @@ type AllowedAlertsFunc func(featureSet configv1.FeatureSet) (allowedFiringWithBu // CheckAlerts will query prometheus and ensure no-unexpected alerts were pending or firing. // Used by both upgrade and conformance suites, with different allowances for each. -func CheckAlerts(allowancesFunc AllowedAlertsFunc, - restConfig *rest.Config, - prometheusClient prometheusv1.API, // TODO: remove - configClient configclient.Interface, // TODO: remove - testDuration time.Duration, - f *framework.Framework) { +func CheckAlerts(allowancesFunc AllowedAlertsFunc, prometheusClient prometheusv1.API, configClient configclient.Interface, testDuration time.Duration, f *framework.Framework) { featureSet := configv1.Default featureGate, err := configClient.ConfigV1().FeatureGates().Get(context.TODO(), "cluster", metav1.GetOptions{}) @@ -46,11 +40,11 @@ func CheckAlerts(allowancesFunc AllowedAlertsFunc, // In addition to the alert allowances passed in (which can differ for upgrades vs conformance), // we also exclude alerts that have their own separate tests codified. This is a backstop test for // everything else. 
- for _, alertTest := range allowedalerts2.AllAlertTests(&platformidentification.JobType{}, - allowedalerts2.DefaultAllowances) { + for _, alertTest := range allowedalerts.AllAlertTests(&platformidentification.JobType{}, + allowedalerts.DefaultAllowances) { switch alertTest.AlertState() { - case allowedalerts2.AlertPending: + case allowedalerts.AlertPending: // a pending test covers pending and everything above (firing) allowedPendingAlerts = append(allowedPendingAlerts, helper.MetricCondition{ @@ -64,7 +58,7 @@ func CheckAlerts(allowancesFunc AllowedAlertsFunc, Text: "has a separate e2e test", }, ) - case allowedalerts2.AlertInfo: + case allowedalerts.AlertInfo: // an info test covers all firing allowedFiringAlerts = append(allowedFiringAlerts, helper.MetricCondition{ diff --git a/pkg/monitortestlibrary/allowedalerts/all.go b/pkg/monitortestlibrary/allowedalerts/all.go index 9a4506b654f0..dbb483774977 100644 --- a/pkg/monitortestlibrary/allowedalerts/all.go +++ b/pkg/monitortestlibrary/allowedalerts/all.go @@ -15,6 +15,15 @@ func AllAlertTests(jobType *platformidentification.JobType, etcdAllowance AlertT ret = append(ret, newNamespacedAlert("KubePodNotReady", jobType).pending().neverFail().toTests()...) ret = append(ret, newNamespacedAlert("KubePodNotReady", jobType).firing().toTests()...) + // ClusterOperatorDown and ClusterOperatorDegraded should not occur during a normal upgrade. + // In CI firing is rare, but does happen a few times a month. Pending however occurs all the time, which implies operators are routinely in + // a state they're not supposed to be during upgrade. + // https://github.com/openshift/enhancements/blob/cb81452fddf86c1099acd87610b88369cd6192db/dev-guide/cluster-version-operator/dev/clusteroperator.md#there-are-a-set-of-guarantees-components-are-expected-to-honor-in-return + ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).pending().alwaysFail().toTests()...) + ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).pending().alwaysFail().toTests()...) + ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).firing().alwaysFail().toTests()...) + ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).firing().alwaysFail().toTests()...) + ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).pending().neverFail().toTests()...) ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).firing().toTests()...) ret = append(ret, newAlert("etcd", "etcdGRPCRequestsSlow", jobType).pending().neverFail().toTests()...) 
diff --git a/pkg/monitortestlibrary/allowedalerts/basic_alert.go b/pkg/monitortestlibrary/allowedalerts/basic_alert.go index 8eb9133cd4bc..84e676863f82 100644 --- a/pkg/monitortestlibrary/allowedalerts/basic_alert.go +++ b/pkg/monitortestlibrary/allowedalerts/basic_alert.go @@ -111,6 +111,11 @@ func (a *alertBuilder) neverFail() *alertBuilder { return a } +func (a *alertBuilder) alwaysFail() *alertBuilder { + a.allowanceCalculator = alwaysFail() + return a +} + func (a *alertBuilder) toTests() []AlertTest { if !a.divideByNamespaces { return []AlertTest{ diff --git a/pkg/monitortestlibrary/allowedalerts/matches.go b/pkg/monitortestlibrary/allowedalerts/matches.go index b4771d6d0d63..93eebf66285d 100644 --- a/pkg/monitortestlibrary/allowedalerts/matches.go +++ b/pkg/monitortestlibrary/allowedalerts/matches.go @@ -54,3 +54,21 @@ func (d *percentileAllowances) FlakeAfter(key historicaldata2.AlertDataKey) time func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicaldata2.StatisticalDuration, string, error) { return getCurrentResults().BestMatchDuration(key) } + +func alwaysFail() AlertTestAllowanceCalculator { + return &alwaysFailAllowance{} +} + +// alwaysFailAllowance is for alerts we want to fail a test if they occur at all. +type alwaysFailAllowance struct { +} + +func (d *alwaysFailAllowance) FailAfter(key historicaldata2.AlertDataKey) (time.Duration, error) { + // TODO: right now we're just flaking until we're certain this doesn't happen too often. Once we're sure, + // change to 1 second. + return 24 * time.Hour, nil +} + +func (d *alwaysFailAllowance) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration { + return 1 * time.Second +} diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go index 77cb11c7ee89..e2242866f2ac 100644 --- a/test/e2e/upgrade/alert/alert.go +++ b/test/e2e/upgrade/alert/alert.go @@ -48,8 +48,7 @@ func (t *UpgradeTest) Test(ctx context.Context, f *framework.Framework, done <-c testDuration := time.Now().Sub(startTime).Round(time.Second) - alerts.CheckAlerts(alerts.AllowedAlertsDuringUpgrade, t.oc.AdminConfig(), - t.oc.NewPrometheusClient(context.TODO()), t.oc.AdminConfigClient(), testDuration, f) + alerts.CheckAlerts(alerts.AllowedAlertsDuringUpgrade, t.oc.NewPrometheusClient(context.TODO()), t.oc.AdminConfigClient(), testDuration, f) } // Teardown cleans up any remaining resources. 
diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go
index de83aeb55af9..8ba84ca8f2c8 100644
--- a/test/extended/prometheus/prometheus.go
+++ b/test/extended/prometheus/prometheus.go
@@ -255,7 +255,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 	g.It("shouldn't report any unexpected alerts in firing or pending state", func() {
 		// we only consider samples since the beginning of the test
 		testDuration := exutil.DurationSinceStartInSeconds()
-		alerts.CheckAlerts(alerts.AllowedAlertsDuringConformance, oc.AdminConfig(), oc.NewPrometheusClient(context.TODO()), oc.AdminConfigClient(), testDuration, nil)
+		alerts.CheckAlerts(alerts.AllowedAlertsDuringConformance, oc.NewPrometheusClient(context.TODO()), oc.AdminConfigClient(), testDuration, nil)
 	})
 
 	g.It("shouldn't exceed the series limit of total series sent via telemetry from each cluster", func() {

From 713410ab98eb557986b94dd1b62e3fe74534c194 Mon Sep 17 00:00:00 2001
From: Devan Goodwin
Date: Tue, 17 Oct 2023 13:41:55 -0300
Subject: [PATCH 2/8] Remove duplicated alert testing stack

This is confusing, but this commit removes two alert tests:

[sig-instrumentation][Late] Alerts shouldn't report any unexpected alerts in firing or pending state [apigroup:config.openshift.io] [Suite:openshift/conformance/parallel]

- Ginkgo test run Late during e2e, live-queries prometheus for the duration of the run, skips any alerts that have their own test, and flakes if any are found. It cannot fail.
- Flake rate is about 80%.
- This test is being removed in favor of the new one based on intervals observed (see pkg/monitortests/testframework/legacytestframeworkmonitortests/monitortest.go).

[sig-arch] Check if alerts are firing during or after upgrade success

- Monitor test run after upgrade.
- Uses basically the same approach/code as above.
- Flake rate is 97%.
- This test is being removed in favor of the new one based on intervals observed (see pkg/monitortests/testframework/legacytestframeworkmonitortests/monitortest.go).

Removing these two tests dramatically simplifies our stack so we can begin working on resurrecting some form of alert testing in just one spot.

The remaining backstop test is:

[sig-trt][invariant] No alerts without an explicit test should be firing/pending more than historically

It is a 100% flake.

NOTE: A similarly named test that can fail and does catch real problems is NOT being removed: [sig-instrumentation] Prometheus [apigroup:image.openshift.io] when installed on the cluster shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early][apigroup:config.openshift.io] [Skipped:Disconnected] [Suite:openshift/conformance/parallel] --- pkg/alerts/check.go | 146 ------------------ .../legacytestframeworkmonitortests/alerts.go | 9 +- test/e2e/upgrade/alert/alert.go | 57 ------- test/e2e/upgrade/upgrade.go | 2 - test/extended/prometheus/prometheus.go | 7 - .../generated/zz_generated.annotations.go | 2 - 6 files changed, 5 insertions(+), 218 deletions(-) delete mode 100644 pkg/alerts/check.go delete mode 100644 test/e2e/upgrade/alert/alert.go diff --git a/pkg/alerts/check.go b/pkg/alerts/check.go deleted file mode 100644 index ef8d0fb03bbc..000000000000 --- a/pkg/alerts/check.go +++ /dev/null @@ -1,146 +0,0 @@ -package alerts - -import ( - "context" - "fmt" - "strings" - "time" - - allowedalerts "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts" - "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification" - - o "github.com/onsi/gomega" - configv1 "github.com/openshift/api/config/v1" - configclient "github.com/openshift/client-go/config/clientset/versioned" - testresult "github.com/openshift/origin/pkg/test/ginkgo/result" - "github.com/openshift/origin/test/extended/util/disruption" - helper "github.com/openshift/origin/test/extended/util/prometheus" - prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/kubernetes/test/e2e/framework" -) - -type AllowedAlertsFunc func(featureSet configv1.FeatureSet) (allowedFiringWithBugs, allowedFiring, allowedPendingWithBugs, allowedPending helper.MetricConditions) - -// CheckAlerts will query prometheus and ensure no-unexpected alerts were pending or firing. -// Used by both upgrade and conformance suites, with different allowances for each. -func CheckAlerts(allowancesFunc AllowedAlertsFunc, prometheusClient prometheusv1.API, configClient configclient.Interface, testDuration time.Duration, f *framework.Framework) { - - featureSet := configv1.Default - featureGate, err := configClient.ConfigV1().FeatureGates().Get(context.TODO(), "cluster", metav1.GetOptions{}) - if err != nil { - framework.Logf("ERROR: error checking feature gates in cluster, ignoring: %v", err) - } else { - featureSet = featureGate.Spec.FeatureSet - } - firingAlertsWithBugs, allowedFiringAlerts, pendingAlertsWithBugs, allowedPendingAlerts := - allowancesFunc(featureSet) - - // In addition to the alert allowances passed in (which can differ for upgrades vs conformance), - // we also exclude alerts that have their own separate tests codified. This is a backstop test for - // everything else. 
- for _, alertTest := range allowedalerts.AllAlertTests(&platformidentification.JobType{}, - allowedalerts.DefaultAllowances) { - - switch alertTest.AlertState() { - case allowedalerts.AlertPending: - // a pending test covers pending and everything above (firing) - allowedPendingAlerts = append(allowedPendingAlerts, - helper.MetricCondition{ - Selector: map[string]string{"alertname": alertTest.AlertName()}, - Text: "has a separate e2e test", - }, - ) - allowedFiringAlerts = append(allowedFiringAlerts, - helper.MetricCondition{ - Selector: map[string]string{"alertname": alertTest.AlertName()}, - Text: "has a separate e2e test", - }, - ) - case allowedalerts.AlertInfo: - // an info test covers all firing - allowedFiringAlerts = append(allowedFiringAlerts, - helper.MetricCondition{ - Selector: map[string]string{"alertname": alertTest.AlertName()}, - Text: "has a separate e2e test", - }, - ) - } - } - - knownViolations := sets.NewString() - unexpectedViolations := sets.NewString() - unexpectedViolationsAsFlakes := sets.NewString() - debug := sets.NewString() - - // Invariant: No non-info level alerts should have fired - firingAlertQuery := fmt.Sprintf(` -sort_desc( -count_over_time(ALERTS{alertstate="firing",severity!="info",alertname!~"Watchdog|AlertmanagerReceiversNotConfigured"}[%[1]s:1s]) -) > 0 -`, testDuration) - result, err := helper.RunQuery(context.TODO(), prometheusClient, firingAlertQuery) - o.Expect(err).NotTo(o.HaveOccurred(), "unable to check firing alerts") - for _, series := range result.Data.Result { - labels := helper.StripLabels(series.Metric, "alertname", "alertstate", "prometheus") - violation := fmt.Sprintf("alert %s fired for %s seconds with labels: %s", series.Metric["alertname"], series.Value, helper.LabelsAsSelector(labels)) - if cause := allowedFiringAlerts.Matches(series); cause != nil { - // TODO: this seems to never be firing? no search.ci results show allowed - debug.Insert(fmt.Sprintf("%s result=allow (%s)", violation, cause.Text)) - continue - } - if cause := firingAlertsWithBugs.Matches(series); cause != nil { - knownViolations.Insert(fmt.Sprintf("%s result=allow bug=%s", violation, cause.Text)) - } else { - unexpectedViolations.Insert(fmt.Sprintf("%s result=reject", violation)) - } - } - - // Invariant: There should be no pending alerts - pendingAlertQuery := fmt.Sprintf(` -sort_desc( - time() * ALERTS + 1 - - - last_over_time(( - time() * ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured",alertstate="pending",severity!="info"} - unless - ALERTS offset 1s - )[%[1]s:1s]) -) -`, testDuration) - result, err = helper.RunQuery(context.TODO(), prometheusClient, pendingAlertQuery) - o.Expect(err).NotTo(o.HaveOccurred(), "unable to retrieve pending alerts") - for _, series := range result.Data.Result { - labels := helper.StripLabels(series.Metric, "alertname", "alertstate", "prometheus") - violation := fmt.Sprintf("alert %s pending for %s seconds with labels: %s", series.Metric["alertname"], series.Value, helper.LabelsAsSelector(labels)) - if cause := allowedPendingAlerts.Matches(series); cause != nil { - // TODO: this seems to never be firing? 
no search.ci results show allowed - debug.Insert(fmt.Sprintf("%s result=allow (%s)", violation, cause.Text)) - continue - } - if cause := pendingAlertsWithBugs.Matches(series); cause != nil { - knownViolations.Insert(fmt.Sprintf("%s result=allow bug=%s", violation, cause.Text)) - } else { - // treat pending errors as a flake right now because we are still trying to determine the scope - // TODO: move this to unexpectedViolations later - unexpectedViolationsAsFlakes.Insert(fmt.Sprintf("%s result=allow", violation)) - } - } - - if len(debug) > 0 { - framework.Logf("Alerts were detected which are allowed:\n\n%s", strings.Join(debug.List(), "\n")) - } - if flakes := sets.NewString().Union(knownViolations).Union(unexpectedViolations).Union(unexpectedViolationsAsFlakes); len(flakes) > 0 { - // The two duplicated code paths merged together here had slightly different ways of reporting flakes: - if f != nil { - // when called from alert.go within an UpgradeTest with a framework available - disruption.FrameworkFlakef(f, "Unexpected alert behavior:\n\n%s", strings.Join(flakes.List(), "\n")) - } else { - // when called from prometheus.go with no framework available - testresult.Flakef("Unexpected alert behavior:\n\n%s", strings.Join(flakes.List(), "\n")) - } - } - framework.Logf("No alerts fired") - -} diff --git a/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go index 39ece2e5ac79..b04fd727ebfe 100644 --- a/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go +++ b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go @@ -12,7 +12,6 @@ import ( configv1 "github.com/openshift/api/config/v1" "github.com/sirupsen/logrus" - "github.com/openshift/origin/pkg/alerts" "github.com/openshift/origin/pkg/monitor/monitorapi" "github.com/openshift/origin/pkg/test/ginkgo/junitapi" helper "github.com/openshift/origin/test/extended/util/prometheus" @@ -26,8 +25,10 @@ import ( "k8s.io/kubernetes/test/e2e/framework" ) +type AllowedAlertsFunc func(featureSet configv1.FeatureSet) (allowedFiringWithBugs, allowedFiring, allowedPendingWithBugs, allowedPending helper.MetricConditions) + func testAlerts(events monitorapi.Intervals, - allowancesFunc alerts.AllowedAlertsFunc, + allowancesFunc AllowedAlertsFunc, jobType *platformidentification.JobType, restConfig *rest.Config, duration time.Duration, @@ -74,7 +75,7 @@ func testAlerts(events monitorapi.Intervals, } func RunAlertTests(jobType *platformidentification.JobType, - allowancesFunc alerts.AllowedAlertsFunc, + allowancesFunc AllowedAlertsFunc, featureSet configv1.FeatureSet, etcdAllowance allowedalerts2.AlertTestAllowanceCalculator, events monitorapi.Intervals, @@ -109,7 +110,7 @@ func RunAlertTests(jobType *platformidentification.JobType, // runBackstopTest will process the intervals for any alerts which do not have their own explicit test, // and look for any pending/firing intervals that are not within sufficient range. 
func runBackstopTest( - allowancesFunc alerts.AllowedAlertsFunc, + allowancesFunc AllowedAlertsFunc, featureSet configv1.FeatureSet, alertIntervals monitorapi.Intervals, alertTests []allowedalerts2.AlertTest) []*junitapi.JUnitTestCase { diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go deleted file mode 100644 index e2242866f2ac..000000000000 --- a/test/e2e/upgrade/alert/alert.go +++ /dev/null @@ -1,57 +0,0 @@ -package alert - -import ( - "context" - "time" - - g "github.com/onsi/ginkgo/v2" - "github.com/openshift/origin/pkg/alerts" - exutil "github.com/openshift/origin/test/extended/util" - "k8s.io/kubernetes/test/e2e/framework" - "k8s.io/kubernetes/test/e2e/upgrades" -) - -// UpgradeTest runs verifies invariants regarding what alerts are allowed to fire -// during the upgrade process. -type UpgradeTest struct { - oc *exutil.CLI -} - -func (UpgradeTest) Name() string { return "check-for-alerts" } -func (UpgradeTest) DisplayName() string { - return "[sig-arch] Check if alerts are firing during or after upgrade success" -} - -// Setup creates parameters to query Prometheus -func (t *UpgradeTest) Setup(ctx context.Context, f *framework.Framework) { - g.By("Setting up upgrade alert test") - - t.oc = exutil.NewCLIWithFramework(f) - - framework.Logf("Post-upgrade alert test setup complete") -} - -// Test checks if alerts are firing at various points during upgrade. -// An alert firing during an upgrade is a high severity bug - it either points to a real issue in -// a dependency, or a failure of the component, and therefore must be fixed. -func (t *UpgradeTest) Test(ctx context.Context, f *framework.Framework, done <-chan struct{}, upgrade upgrades.UpgradeType) { - g.By("Checking for alerts") - startTime := time.Now() - - // Block until upgrade is done - g.By("Waiting for upgrade to finish before checking for alerts") - <-done - - // Additonal delay after upgrade completion to allow pending alerts to settle - g.By("Waiting before checking for alerts") - time.Sleep(1 * time.Minute) - - testDuration := time.Now().Sub(startTime).Round(time.Second) - - alerts.CheckAlerts(alerts.AllowedAlertsDuringUpgrade, t.oc.NewPrometheusClient(context.TODO()), t.oc.AdminConfigClient(), testDuration, f) -} - -// Teardown cleans up any remaining resources. 
-func (t *UpgradeTest) Teardown(ctx context.Context, f *framework.Framework) { - // rely on the namespace deletion to clean up everything -} diff --git a/test/e2e/upgrade/upgrade.go b/test/e2e/upgrade/upgrade.go index 6ae2c39a10f0..0bf484c74c65 100644 --- a/test/e2e/upgrade/upgrade.go +++ b/test/e2e/upgrade/upgrade.go @@ -38,7 +38,6 @@ import ( "k8s.io/kubernetes/test/e2e/upgrades/node" "github.com/openshift/origin/test/e2e/upgrade/adminack" - "github.com/openshift/origin/test/e2e/upgrade/alert" "github.com/openshift/origin/test/e2e/upgrade/dns" "github.com/openshift/origin/test/e2e/upgrade/manifestdelete" "github.com/openshift/origin/test/extended/prometheus" @@ -56,7 +55,6 @@ func AllTests() []upgrades.Test { return []upgrades.Test{ &adminack.UpgradeTest{}, &manifestdelete.UpgradeTest{}, - &alert.UpgradeTest{}, &node.SecretUpgradeTest{}, &apps.ReplicaSetUpgradeTest{}, diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go index 8ba84ca8f2c8..9600c3fd0a0d 100644 --- a/test/extended/prometheus/prometheus.go +++ b/test/extended/prometheus/prometheus.go @@ -16,7 +16,6 @@ import ( g "github.com/onsi/ginkgo/v2" o "github.com/onsi/gomega" - "github.com/openshift/origin/pkg/alerts" promv1 "github.com/prometheus/client_golang/api/prometheus/v1" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" @@ -252,12 +251,6 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() { } }) - g.It("shouldn't report any unexpected alerts in firing or pending state", func() { - // we only consider samples since the beginning of the test - testDuration := exutil.DurationSinceStartInSeconds() - alerts.CheckAlerts(alerts.AllowedAlertsDuringConformance, oc.NewPrometheusClient(context.TODO()), oc.AdminConfigClient(), testDuration, nil) - }) - g.It("shouldn't exceed the series limit of total series sent via telemetry from each cluster", func() { if enabledErr, err := telemetryIsEnabled(ctx, oc.AdminKubeClient()); err != nil { e2e.Failf("could not determine if Telemetry is enabled: %v", err) diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go index 336e4dd5c723..3f66e6facd0f 100644 --- a/test/extended/util/annotate/generated/zz_generated.annotations.go +++ b/test/extended/util/annotate/generated/zz_generated.annotations.go @@ -1197,8 +1197,6 @@ var Annotations = map[string]string{ "[sig-instrumentation][Late] Alerts shouldn't exceed the series limit of total series sent via telemetry from each cluster": " [Suite:openshift/conformance/parallel]", - "[sig-instrumentation][Late] Alerts shouldn't report any unexpected alerts in firing or pending state": " [Suite:openshift/conformance/parallel]", - "[sig-instrumentation][Late] OpenShift alerting rules [apigroup:image.openshift.io] should have a runbook_url annotation if the alert is critical": " [Skipped:Disconnected] [Suite:openshift/conformance/parallel]", "[sig-instrumentation][Late] OpenShift alerting rules [apigroup:image.openshift.io] should have a valid severity label": " [Skipped:Disconnected] [Suite:openshift/conformance/parallel]", From f990e703db0aef9adf182bcd6ffc0cafe9be70fa Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Tue, 17 Oct 2023 14:39:59 -0300 Subject: [PATCH 3/8] Calm down some logging --- pkg/monitortestlibrary/historicaldata/alert_types.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/monitortestlibrary/historicaldata/alert_types.go 
b/pkg/monitortestlibrary/historicaldata/alert_types.go index 22615bafebbd..aa8ce48ba4b1 100644 --- a/pkg/monitortestlibrary/historicaldata/alert_types.go +++ b/pkg/monitortestlibrary/historicaldata/alert_types.go @@ -80,8 +80,8 @@ func NewAlertMatcherWithHistoricalData(data map[AlertDataKey]AlertStatisticalDat func (b *AlertBestMatcher) bestMatch(key AlertDataKey) (AlertStatisticalData, string, error) { exactMatchKey := key - logrus.WithField("alertName", key.AlertName).Infof("searching for bestMatch for %+v", key.JobType) - logrus.Infof("historicalData has %d entries", len(b.HistoricalData)) + logrus.WithField("alertName", key.AlertName).WithField("entries", len(b.HistoricalData)). + Debugf("searching for best match for %+v", key.JobType) if percentiles, ok := b.HistoricalData[exactMatchKey]; ok { if percentiles.JobRuns > minJobRuns { From 51032976dce351776410d02c9f722b15897f2c23 Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Wed, 18 Oct 2023 08:47:25 -0300 Subject: [PATCH 4/8] Clarify that we're flaking on any occurrence of alert, not failing --- pkg/monitortestlibrary/allowedalerts/all.go | 8 ++++---- .../allowedalerts/basic_alert.go | 4 ++-- pkg/monitortestlibrary/allowedalerts/matches.go | 15 +++++++-------- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/pkg/monitortestlibrary/allowedalerts/all.go b/pkg/monitortestlibrary/allowedalerts/all.go index dbb483774977..a5361082f1bc 100644 --- a/pkg/monitortestlibrary/allowedalerts/all.go +++ b/pkg/monitortestlibrary/allowedalerts/all.go @@ -19,10 +19,10 @@ func AllAlertTests(jobType *platformidentification.JobType, etcdAllowance AlertT // In CI firing is rare, but does happen a few times a month. Pending however occurs all the time, which implies operators are routinely in // a state they're not supposed to be during upgrade. // https://github.com/openshift/enhancements/blob/cb81452fddf86c1099acd87610b88369cd6192db/dev-guide/cluster-version-operator/dev/clusteroperator.md#there-are-a-set-of-guarantees-components-are-expected-to-honor-in-return - ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).pending().alwaysFail().toTests()...) - ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).pending().alwaysFail().toTests()...) - ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).firing().alwaysFail().toTests()...) - ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).firing().alwaysFail().toTests()...) + ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).pending().alwaysFlake().toTests()...) + ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).pending().alwaysFlake().toTests()...) + ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).firing().alwaysFlake().toTests()...) + ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).firing().alwaysFlake().toTests()...) ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).pending().neverFail().toTests()...) ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).firing().toTests()...) 
diff --git a/pkg/monitortestlibrary/allowedalerts/basic_alert.go b/pkg/monitortestlibrary/allowedalerts/basic_alert.go index 84e676863f82..dd3cd000ae2f 100644 --- a/pkg/monitortestlibrary/allowedalerts/basic_alert.go +++ b/pkg/monitortestlibrary/allowedalerts/basic_alert.go @@ -111,8 +111,8 @@ func (a *alertBuilder) neverFail() *alertBuilder { return a } -func (a *alertBuilder) alwaysFail() *alertBuilder { - a.allowanceCalculator = alwaysFail() +func (a *alertBuilder) alwaysFlake() *alertBuilder { + a.allowanceCalculator = alwaysFlake() return a } diff --git a/pkg/monitortestlibrary/allowedalerts/matches.go b/pkg/monitortestlibrary/allowedalerts/matches.go index 93eebf66285d..b36e7833d538 100644 --- a/pkg/monitortestlibrary/allowedalerts/matches.go +++ b/pkg/monitortestlibrary/allowedalerts/matches.go @@ -55,20 +55,19 @@ func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicalda return getCurrentResults().BestMatchDuration(key) } -func alwaysFail() AlertTestAllowanceCalculator { - return &alwaysFailAllowance{} +func alwaysFlake() AlertTestAllowanceCalculator { + return &alwaysFlakeAllowance{} } -// alwaysFailAllowance is for alerts we want to fail a test if they occur at all. -type alwaysFailAllowance struct { +// alwaysFlakeAllowance is for alerts we want to flake a test if they occur at all. +type alwaysFlakeAllowance struct { } -func (d *alwaysFailAllowance) FailAfter(key historicaldata2.AlertDataKey) (time.Duration, error) { - // TODO: right now we're just flaking until we're certain this doesn't happen too often. Once we're sure, - // change to 1 second. +func (d *alwaysFlakeAllowance) FailAfter(key historicaldata2.AlertDataKey) (time.Duration, error) { + // make it effectively impossible for a test failure here, we only want flakes return 24 * time.Hour, nil } -func (d *alwaysFailAllowance) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration { +func (d *alwaysFlakeAllowance) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration { return 1 * time.Second } From 70e7a31e645ae87720481a1b39299a703ffd50e3 Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Wed, 18 Oct 2023 11:31:43 -0300 Subject: [PATCH 5/8] Drop the ClusterOperatorDown/Degraded alert tests These turn out to be a duplication of tests that already flake if an operator does Available=false or Degraded=true in a test run. (i.e. [bz-DNS] clusteroperator/dns should not change condition/Degraded) --- pkg/monitortestlibrary/allowedalerts/all.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pkg/monitortestlibrary/allowedalerts/all.go b/pkg/monitortestlibrary/allowedalerts/all.go index a5361082f1bc..9a4506b654f0 100644 --- a/pkg/monitortestlibrary/allowedalerts/all.go +++ b/pkg/monitortestlibrary/allowedalerts/all.go @@ -15,15 +15,6 @@ func AllAlertTests(jobType *platformidentification.JobType, etcdAllowance AlertT ret = append(ret, newNamespacedAlert("KubePodNotReady", jobType).pending().neverFail().toTests()...) ret = append(ret, newNamespacedAlert("KubePodNotReady", jobType).firing().toTests()...) - // ClusterOperatorDown and ClusterOperatorDegraded should not occur during a normal upgrade. - // In CI firing is rare, but does happen a few times a month. Pending however occurs all the time, which implies operators are routinely in - // a state they're not supposed to be during upgrade. 
- // https://github.com/openshift/enhancements/blob/cb81452fddf86c1099acd87610b88369cd6192db/dev-guide/cluster-version-operator/dev/clusteroperator.md#there-are-a-set-of-guarantees-components-are-expected-to-honor-in-return - ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).pending().alwaysFlake().toTests()...) - ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).pending().alwaysFlake().toTests()...) - ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).firing().alwaysFlake().toTests()...) - ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).firing().alwaysFlake().toTests()...) - ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).pending().neverFail().toTests()...) ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).firing().toTests()...) ret = append(ret, newAlert("etcd", "etcdGRPCRequestsSlow", jobType).pending().neverFail().toTests()...) From d334db8b0547c90904b2224c7b21713680d9e709 Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Thu, 19 Oct 2023 14:51:11 -0300 Subject: [PATCH 6/8] Add new test that will fail on new alerts firing Attempts to be forgiving, scan entire data file regardless of nurp looking to see if we've seen this alert name before anywhere. If not, or we did but first observed is less than two weeks ago, fail the test. This approach should help us survive the cutover to new releases. --- pkg/cmd/openshift-tests/dev/dev.go | 2 +- .../monitor/timeline/timeline_command.go | 2 +- pkg/disruption/backend/sampler/remote.go | 2 +- pkg/monitor/monitorapi/types.go | 2 + pkg/monitor/serialization/serialize.go | 29 +++-- .../allowedalerts/basic_alert.go | 1 + .../allowedalerts/matches.go | 2 +- .../allowedalerts/matches_test.go | 2 +- pkg/monitortestlibrary/allowedalerts/types.go | 13 +- .../historicaldata/alert_types.go | 26 ++-- .../historicaldata/disruption_types.go | 26 +++- .../node/nodestateanalyzer/node_test.go | 4 +- .../node/watchpods/compute_intervals_test.go | 4 +- .../testframework/alertanalyzer/alerts.go | 11 +- .../legacytestframeworkmonitortests/alerts.go | 118 +++++++++++++++-- .../alerts_test.go | 123 ++++++++++++++++++ .../timelineserializer/rendering.go | 2 +- .../timelineserializer/rendering_test.go | 2 +- test/extended/prometheus/prometheus.go | 11 +- test/extended/util/prometheus/helpers.go | 22 +--- 20 files changed, 324 insertions(+), 80 deletions(-) create mode 100644 pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts_test.go diff --git a/pkg/cmd/openshift-tests/dev/dev.go b/pkg/cmd/openshift-tests/dev/dev.go index ad711cbafd71..b7481f306a32 100644 --- a/pkg/cmd/openshift-tests/dev/dev.go +++ b/pkg/cmd/openshift-tests/dev/dev.go @@ -135,7 +135,7 @@ func readIntervalsFromFile(intervalsFile string) (monitorapi.Intervals, error) { return nil, err } - return monitorserialization.EventsFromJSON(jsonBytes) + return monitorserialization.IntervalsFromJSON(jsonBytes) } func newRunDisruptionInvariantsCommand() *cobra.Command { diff --git a/pkg/cmd/openshift-tests/monitor/timeline/timeline_command.go b/pkg/cmd/openshift-tests/monitor/timeline/timeline_command.go index 9aadc605b9d1..70b1b496e444 100644 --- a/pkg/cmd/openshift-tests/monitor/timeline/timeline_command.go +++ b/pkg/cmd/openshift-tests/monitor/timeline/timeline_command.go @@ -57,7 +57,7 @@ func NewTimelineOptions(ioStreams genericclioptions.IOStreams) *TimelineOptions IOStreams: ioStreams, KnownRenderers: 
map[string]RenderFunc{ - "json": monitorserialization.EventsToJSON, + "json": monitorserialization.IntervalsToJSON, "html": renderHTML, }, KnownTimelines: map[string]monitorapi.EventIntervalMatchesFunc{ diff --git a/pkg/disruption/backend/sampler/remote.go b/pkg/disruption/backend/sampler/remote.go index 77c2765560df..ded48fe88cc4 100644 --- a/pkg/disruption/backend/sampler/remote.go +++ b/pkg/disruption/backend/sampler/remote.go @@ -380,7 +380,7 @@ func fetchEventsFromFileOnNode(ctx context.Context, clientset *kubernetes.Client if err != nil { return filteredEvents, fmt.Errorf("failed to fetch file %s on node %s: %v", filePath, nodeName, err) } - allEvents, err := monitorserialization.EventsFromJSON(allBytes) + allEvents, err := monitorserialization.IntervalsFromJSON(allBytes) if err != nil { return nil, fmt.Errorf("failed to convert file %s from node %s to intervals: %v", filePath, nodeName, err) } diff --git a/pkg/monitor/monitorapi/types.go b/pkg/monitor/monitorapi/types.go index 7228fd82052e..1b6bc6574f6c 100644 --- a/pkg/monitor/monitorapi/types.go +++ b/pkg/monitor/monitorapi/types.go @@ -199,6 +199,8 @@ const ( type AnnotationKey string const ( + AnnotationAlertState AnnotationKey = "alertstate" + AnnotationSeverity AnnotationKey = "severity" AnnotationReason AnnotationKey = "reason" AnnotationContainerExitCode AnnotationKey = "code" AnnotationCause AnnotationKey = "cause" diff --git a/pkg/monitor/serialization/serialize.go b/pkg/monitor/serialization/serialize.go index a00c63d84757..cf96d249af0c 100644 --- a/pkg/monitor/serialization/serialize.go +++ b/pkg/monitor/serialization/serialize.go @@ -21,7 +21,8 @@ type EventInterval struct { // TODO: Remove the omitempty, just here to keep from having to repeatedly updated the json // files used in some new tests - Source string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive + Source string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive + Display bool `json:"display"` // TODO: we're hoping to move these to just locator/message when everything is ready. 
StructuredLocator monitorapi.Locator `json:"tempStructuredLocator"` @@ -37,7 +38,7 @@ type EventIntervalList struct { } func EventsToFile(filename string, events monitorapi.Intervals) error { - json, err := EventsToJSON(events) + json, err := IntervalsToJSON(events) if err != nil { return err } @@ -49,10 +50,10 @@ func EventsFromFile(filename string) (monitorapi.Intervals, error) { if err != nil { return nil, err } - return EventsFromJSON(data) + return IntervalsFromJSON(data) } -func EventsFromJSON(data []byte) (monitorapi.Intervals, error) { +func IntervalsFromJSON(data []byte) (monitorapi.Intervals, error) { var list EventIntervalList if err := json.Unmarshal(data, &list); err != nil { return nil, err @@ -64,12 +65,13 @@ func EventsFromJSON(data []byte) (monitorapi.Intervals, error) { return nil, err } events = append(events, monitorapi.Interval{ - Source: monitorapi.IntervalSource(interval.Source), + Source: monitorapi.IntervalSource(interval.Source), + Display: interval.Display, Condition: monitorapi.Condition{ Level: level, Locator: interval.Locator, - Message: interval.Message, StructuredLocator: interval.StructuredLocator, + Message: interval.Message, StructuredMessage: interval.StructuredMessage, }, @@ -91,7 +93,8 @@ func IntervalFromJSON(data []byte) (*monitorapi.Interval, error) { return nil, err } return &monitorapi.Interval{ - Source: monitorapi.IntervalSource(serializedInterval.Source), + Source: monitorapi.IntervalSource(serializedInterval.Source), + Display: serializedInterval.Display, Condition: monitorapi.Condition{ Level: level, Locator: serializedInterval.Locator, @@ -120,9 +123,9 @@ func IntervalToOneLineJSON(interval monitorapi.Interval) ([]byte, error) { return buf.Bytes(), nil } -func EventsToJSON(events monitorapi.Intervals) ([]byte, error) { +func IntervalsToJSON(intervals monitorapi.Intervals) ([]byte, error) { outputEvents := []EventInterval{} - for _, curr := range events { + for _, curr := range intervals { outputEvents = append(outputEvents, monitorEventIntervalToEventInterval(curr)) } @@ -131,14 +134,16 @@ func EventsToJSON(events monitorapi.Intervals) ([]byte, error) { return json.MarshalIndent(list, "", " ") } -func EventsIntervalsToFile(filename string, events monitorapi.Intervals) error { - json, err := EventsIntervalsToJSON(events) +func IntervalsToFile(filename string, intervals monitorapi.Intervals) error { + json, err := EventsIntervalsToJSON(intervals) if err != nil { return err } return ioutil.WriteFile(filename, json, 0644) } +// TODO: this is very similar but subtly different to the function above, what is the purpose of skipping those +// with from/to equal or empty to? func EventsIntervalsToJSON(events monitorapi.Intervals) ([]byte, error) { outputEvents := []EventInterval{} for _, curr := range events { @@ -161,11 +166,11 @@ func monitorEventIntervalToEventInterval(interval monitorapi.Interval) EventInte StructuredLocator: interval.StructuredLocator, StructuredMessage: interval.StructuredMessage, Source: string(interval.Source), + Display: interval.Display, From: metav1.Time{Time: interval.From}, To: metav1.Time{Time: interval.To}, } - return ret } diff --git a/pkg/monitortestlibrary/allowedalerts/basic_alert.go b/pkg/monitortestlibrary/allowedalerts/basic_alert.go index dd3cd000ae2f..16a8ab761bab 100644 --- a/pkg/monitortestlibrary/allowedalerts/basic_alert.go +++ b/pkg/monitortestlibrary/allowedalerts/basic_alert.go @@ -31,6 +31,7 @@ type AlertTest interface { // AlertState is the state of the alert. 
They are logically ordered, so if a test says it limits on "pending", then // any state above pending (like info or warning) will cause the test to fail. +// TODO this looks wrong, AlertState (pending|firing) and AlertLevel (info|warning|critical) are different things, but they seem lumped together here. type AlertState string const ( diff --git a/pkg/monitortestlibrary/allowedalerts/matches.go b/pkg/monitortestlibrary/allowedalerts/matches.go index b36e7833d538..d2de8094ceae 100644 --- a/pkg/monitortestlibrary/allowedalerts/matches.go +++ b/pkg/monitortestlibrary/allowedalerts/matches.go @@ -52,7 +52,7 @@ func (d *percentileAllowances) FlakeAfter(key historicaldata2.AlertDataKey) time // getClosestPercentilesValues uses the backend and information about the cluster to choose the best historical p99 to operate against. // We enforce "don't get worse" for disruption by watching the aggregate data in CI over many runs. func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicaldata2.StatisticalDuration, string, error) { - return getCurrentResults().BestMatchDuration(key) + return GetHistoricalData().BestMatchDuration(key) } func alwaysFlake() AlertTestAllowanceCalculator { diff --git a/pkg/monitortestlibrary/allowedalerts/matches_test.go b/pkg/monitortestlibrary/allowedalerts/matches_test.go index 0ae3e2dd5d54..bb1601d71d3a 100644 --- a/pkg/monitortestlibrary/allowedalerts/matches_test.go +++ b/pkg/monitortestlibrary/allowedalerts/matches_test.go @@ -171,7 +171,7 @@ func TestGetClosestP99Value(t *testing.T) { // from bigquery and commit into origin. Test ensures we can parse it and the data looks sane. func TestAlertDataFileParsing(t *testing.T) { - alertMatcher := getCurrentResults() + alertMatcher := GetHistoricalData() // The list of known alerts that goes into this file is composed of everything we've ever // seen fire in that release. As such it can change from one release to the next as alerts diff --git a/pkg/monitortestlibrary/allowedalerts/types.go b/pkg/monitortestlibrary/allowedalerts/types.go index 400055a822f2..38f79e5404e0 100644 --- a/pkg/monitortestlibrary/allowedalerts/types.go +++ b/pkg/monitortestlibrary/allowedalerts/types.go @@ -22,7 +22,7 @@ var ( historicalData *historicaldata.AlertBestMatcher ) -func getCurrentResults() *historicaldata.AlertBestMatcher { +func GetHistoricalData() *historicaldata.AlertBestMatcher { readResults.Do( func() { var err error @@ -34,3 +34,14 @@ func getCurrentResults() *historicaldata.AlertBestMatcher { return historicalData } + +// AllowedAlertNames is a list of alerts we do not test against. +var AllowedAlertNames = []string{ + "Watchdog", + "AlertmanagerReceiversNotConfigured", + "PrometheusRemoteWriteDesiredShards", + "KubeJobFailed", // this is a result of bug https://bugzilla.redhat.com/show_bug.cgi?id=2054426 . We should catch these in the prometheus tests. 
+ + // indicates a problem in the external Telemeter service, presently very common, does not impact our ability to e2e test: + "TelemeterClientFailures", +} diff --git a/pkg/monitortestlibrary/historicaldata/alert_types.go b/pkg/monitortestlibrary/historicaldata/alert_types.go index aa8ce48ba4b1..16ec2b1e6e93 100644 --- a/pkg/monitortestlibrary/historicaldata/alert_types.go +++ b/pkg/monitortestlibrary/historicaldata/alert_types.go @@ -12,11 +12,15 @@ import ( ) type AlertStatisticalData struct { - AlertDataKey `json:",inline"` - Name string - P95 float64 - P99 float64 - JobRuns int64 + AlertDataKey `json:",inline"` + Name string + P50 float64 + P75 float64 + P95 float64 + P99 float64 + FirstObserved time.Time + LastObserved time.Time + JobRuns int64 } type AlertDataKey struct { @@ -84,7 +88,7 @@ func (b *AlertBestMatcher) bestMatch(key AlertDataKey) (AlertStatisticalData, st Debugf("searching for best match for %+v", key.JobType) if percentiles, ok := b.HistoricalData[exactMatchKey]; ok { - if percentiles.JobRuns > minJobRuns { + if percentiles.JobRuns >= minJobRuns { logrus.Infof("found exact match: %+v", percentiles) return percentiles, "", nil } @@ -159,8 +163,12 @@ func (b *AlertBestMatcher) BestMatchP99(key AlertDataKey) (*time.Duration, strin func toAlertStatisticalDuration(in AlertStatisticalData) StatisticalDuration { return StatisticalDuration{ - JobType: in.AlertDataKey.JobType, - P95: DurationOrDie(in.P95), - P99: DurationOrDie(in.P99), + JobType: in.AlertDataKey.JobType, + P50: DurationOrDie(in.P50), + P75: DurationOrDie(in.P75), + P95: DurationOrDie(in.P95), + P99: DurationOrDie(in.P99), + FirstObserved: in.FirstObserved, + LastObserved: in.LastObserved, } } diff --git a/pkg/monitortestlibrary/historicaldata/disruption_types.go b/pkg/monitortestlibrary/historicaldata/disruption_types.go index d4c78c6f1ff0..d43df4d17a09 100644 --- a/pkg/monitortestlibrary/historicaldata/disruption_types.go +++ b/pkg/monitortestlibrary/historicaldata/disruption_types.go @@ -20,15 +20,23 @@ const minJobRuns = 100 type StatisticalDuration struct { platformidentification.JobType `json:",inline"` + P50 time.Duration + P75 time.Duration P95 time.Duration P99 time.Duration + FirstObserved time.Time + LastObserved time.Time } type DisruptionStatisticalData struct { - DataKey `json:",inline"` - P95 float64 - P99 float64 - JobRuns int64 + DataKey `json:",inline"` + P50 float64 + P75 float64 + P95 float64 + P99 float64 + FirstObserved time.Time + LastObserved time.Time + JobRuns int64 } type DataKey struct { @@ -172,9 +180,13 @@ func (b *DisruptionBestMatcher) BestMatchP99(name string, jobType platformidenti func toStatisticalDuration(in DisruptionStatisticalData) StatisticalDuration { return StatisticalDuration{ - JobType: in.DataKey.JobType, - P95: DurationOrDie(in.P95), - P99: DurationOrDie(in.P99), + JobType: in.DataKey.JobType, + P50: DurationOrDie(in.P50), + P75: DurationOrDie(in.P75), + P95: DurationOrDie(in.P95), + P99: DurationOrDie(in.P99), + FirstObserved: in.FirstObserved, + LastObserved: in.LastObserved, } } diff --git a/pkg/monitortests/node/nodestateanalyzer/node_test.go b/pkg/monitortests/node/nodestateanalyzer/node_test.go index 9821a9e3343d..e32a555dc2d4 100644 --- a/pkg/monitortests/node/nodestateanalyzer/node_test.go +++ b/pkg/monitortests/node/nodestateanalyzer/node_test.go @@ -75,7 +75,7 @@ type nodeIntervalTest struct { } func (p nodeIntervalTest) test(t *testing.T) { - inputIntervals, err := monitorserialization.EventsFromJSON(p.events) + inputIntervals, err := 
monitorserialization.IntervalsFromJSON(p.events) if err != nil { t.Fatal(err) } @@ -89,7 +89,7 @@ func (p nodeIntervalTest) test(t *testing.T) { } result := intervalsFromEvents_NodeChanges(inputIntervals, nil, startTime, endTime) - resultBytes, err := monitorserialization.EventsToJSON(result) + resultBytes, err := monitorserialization.IntervalsToJSON(result) if err != nil { t.Fatal(err) } diff --git a/pkg/monitortests/node/watchpods/compute_intervals_test.go b/pkg/monitortests/node/watchpods/compute_intervals_test.go index 88722c1a696b..660f7d7b9683 100644 --- a/pkg/monitortests/node/watchpods/compute_intervals_test.go +++ b/pkg/monitortests/node/watchpods/compute_intervals_test.go @@ -84,7 +84,7 @@ func (p podIntervalTest) test(t *testing.T) { resourceMap["pods"] = podMap } - inputIntervals, err := monitorserialization.EventsFromJSON(p.events) + inputIntervals, err := monitorserialization.IntervalsFromJSON(p.events) if err != nil { t.Fatal(err) } @@ -98,7 +98,7 @@ func (p podIntervalTest) test(t *testing.T) { } result := createPodIntervalsFromInstants(inputIntervals, resourceMap, startTime, endTime) - resultBytes, err := monitorserialization.EventsToJSON(result) + resultBytes, err := monitorserialization.IntervalsToJSON(result) if err != nil { t.Fatal(err) } diff --git a/pkg/monitortests/testframework/alertanalyzer/alerts.go b/pkg/monitortests/testframework/alertanalyzer/alerts.go index 35633ae636e5..fa942735c955 100644 --- a/pkg/monitortests/testframework/alertanalyzer/alerts.go +++ b/pkg/monitortests/testframework/alertanalyzer/alerts.go @@ -312,15 +312,22 @@ func createEventIntervalsForAlerts(ctx context.Context, alerts prometheustypes.V level = monitorapi.Warning case alert.Metric["severity"] == "critical": level = monitorapi.Error - case alert.Metric["severity"] == "info": + case alert.Metric["severity"] == "info": // this case may not exist level = monitorapi.Info default: level = monitorapi.Error } + msg := monitorapi.NewMessage().HumanMessage(alert.Metric.String()) + if len(string(alert.Metric["alertstate"])) > 0 { + msg = msg.WithAnnotation(monitorapi.AnnotationAlertState, string(alert.Metric["alertstate"])) + } + if len(string(alert.Metric["severity"])) > 0 { + msg = msg.WithAnnotation(monitorapi.AnnotationSeverity, string(alert.Metric["severity"])) + } alertIntervalTemplate := monitorapi.NewInterval(monitorapi.SourceAlert, level). Locator(lb). 
- Message(monitorapi.NewMessage().HumanMessage(alert.Metric.String())) + Message(msg) var alertStartTime *time.Time var lastTime *time.Time diff --git a/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go index b04fd727ebfe..4eb788e6f1bc 100644 --- a/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go +++ b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go @@ -6,7 +6,8 @@ import ( "strings" "time" - allowedalerts2 "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts" + "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts" + "github.com/openshift/origin/pkg/monitortestlibrary/historicaldata" "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification" configv1 "github.com/openshift/api/config/v1" @@ -47,8 +48,8 @@ func testAlerts(events monitorapi.Intervals, featureSet = featureGate.Spec.FeatureSet } - var etcdAllowance allowedalerts2.AlertTestAllowanceCalculator - etcdAllowance = allowedalerts2.DefaultAllowances + var etcdAllowance allowedalerts.AlertTestAllowanceCalculator + etcdAllowance = allowedalerts.DefaultAllowances // if we have a restConfig, use it. var kubeClient *kubernetes.Clientset if restConfig != nil { @@ -56,7 +57,7 @@ func testAlerts(events monitorapi.Intervals, if err != nil { panic(err) } - etcdAllowance, err = allowedalerts2.NewAllowedWhenEtcdRevisionChange(context.TODO(), + etcdAllowance, err = allowedalerts.NewAllowedWhenEtcdRevisionChange(context.TODO(), kubeClient, duration) if err != nil { panic(err) @@ -74,15 +75,17 @@ func testAlerts(events monitorapi.Intervals, return ret } +// RunAlertTests is a key entry point for running all per-Alert tests we've defined in all.go AllAlertTests, +// as well as backstop tests on things we observe outside those specific tests. func RunAlertTests(jobType *platformidentification.JobType, allowancesFunc AllowedAlertsFunc, featureSet configv1.FeatureSet, - etcdAllowance allowedalerts2.AlertTestAllowanceCalculator, + etcdAllowance allowedalerts.AlertTestAllowanceCalculator, events monitorapi.Intervals, recordedResource monitorapi.ResourcesMap) []*junitapi.JUnitTestCase { ret := []*junitapi.JUnitTestCase{} - alertTests := allowedalerts2.AllAlertTests(jobType, etcdAllowance) + alertTests := allowedalerts.AllAlertTests(jobType, etcdAllowance) // Run the per-alert tests we've hardcoded: for i := range alertTests { @@ -101,8 +104,14 @@ func RunAlertTests(jobType *platformidentification.JobType, ret = append(ret, junit...) } + pendingIntervals := events.Filter(monitorapi.AlertPending()) + firingIntervals := events.Filter(monitorapi.AlertFiring()) + // Run the backstop catch all for all other alerts: - ret = append(ret, runBackstopTest(allowancesFunc, featureSet, events, alertTests)...) + ret = append(ret, runBackstopTest(allowancesFunc, featureSet, pendingIntervals, firingIntervals, alertTests)...) + + // TODO: Run a test to ensure no new alerts fired: + ret = append(ret, runNoNewAlertsFiringTest(allowedalerts.GetHistoricalData(), firingIntervals)...) 
return ret } @@ -112,14 +121,13 @@ func RunAlertTests(jobType *platformidentification.JobType, func runBackstopTest( allowancesFunc AllowedAlertsFunc, featureSet configv1.FeatureSet, - alertIntervals monitorapi.Intervals, - alertTests []allowedalerts2.AlertTest) []*junitapi.JUnitTestCase { + pendingIntervals monitorapi.Intervals, + firingIntervals monitorapi.Intervals, + alertTests []allowedalerts.AlertTest) []*junitapi.JUnitTestCase { firingAlertsWithBugs, allowedFiringAlerts, pendingAlertsWithBugs, allowedPendingAlerts := allowancesFunc(featureSet) - pendingIntervals := alertIntervals.Filter(monitorapi.AlertPending()) - firingIntervals := alertIntervals.Filter(monitorapi.AlertFiring()) logrus.Infof("filtered down to %d pending intervals", len(pendingIntervals)) logrus.Infof("filtered down to %d firing intervals", len(firingIntervals)) @@ -129,7 +137,7 @@ func runBackstopTest( for _, alertTest := range alertTests { switch alertTest.AlertState() { - case allowedalerts2.AlertPending: + case allowedalerts.AlertPending: // a pending test covers pending and everything above (firing) allowedPendingAlerts = append(allowedPendingAlerts, helper.MetricCondition{ @@ -143,7 +151,7 @@ func runBackstopTest( Text: "has a separate e2e test", }, ) - case allowedalerts2.AlertInfo: + case allowedalerts.AlertInfo: // an info test covers all firing allowedFiringAlerts = append(allowedFiringAlerts, helper.MetricCondition{ @@ -162,6 +170,9 @@ func runBackstopTest( // New version for alert testing against intervals instead of directly from prometheus: for _, firing := range firingIntervals { fan := monitorapi.AlertFromLocator(firing.Locator) + if isSkippedAlert(fan) { + continue + } seconds := firing.To.Sub(firing.From) violation := fmt.Sprintf("V2 alert %s fired for %s seconds with labels: %s", fan, seconds, firing.Message) if cause := allowedFiringAlerts.MatchesInterval(firing); cause != nil { @@ -178,6 +189,9 @@ func runBackstopTest( // New version for alert testing against intervals instead of directly from prometheus: for _, pending := range pendingIntervals { fan := monitorapi.AlertFromLocator(pending.Locator) + if isSkippedAlert(fan) { + continue + } seconds := pending.To.Sub(pending.From) violation := fmt.Sprintf("V2 alert %s pending for %s seconds with labels: %s", fan, seconds, pending.Message) if cause := allowedPendingAlerts.MatchesInterval(pending); cause != nil { @@ -219,3 +233,81 @@ func runBackstopTest( } return ret } + +func isSkippedAlert(alertName string) bool { + // Some alerts we always skip over in CI: + for _, a := range allowedalerts.AllowedAlertNames { + if a == alertName { + return true + } + } + return false +} + +// runNoNewAlertsFiringTest checks all alerts to see if we: +// 1. have no historical data for that alert in that namespace for this release +// 2. have historical data but it was first observed less than 2 weeks ago +// +// If either is true, this test will fail. We do not want new product alerts being added to the product that +// will trigger routinely and affect the fleet when they ship. +// The two week limit is our window to address these kinds of problems, after that the failure will stop. +func runNoNewAlertsFiringTest(historicalData *historicaldata.AlertBestMatcher, + firingIntervals monitorapi.Intervals) []*junitapi.JUnitTestCase { + testName := "[sig-trt][invariant] No new alerts should be firing" + ret := []*junitapi.JUnitTestCase{ + { + // Success test to force a flake until we're ready to let things fail here. 
+ Name: testName, + }, + } + + // accumulate all alerts firing that we have no historical data for this release, or we know it only + // recently appeared. (less than two weeks ago) Any alerts in either category will fail this test. + newAlertsFiring := []string{} + + for _, interval := range firingIntervals { + alertName := interval.StructuredLocator.Keys[monitorapi.LocatorAlertKey] + + if isSkippedAlert(alertName) { + continue + } + + // Scan historical data to see if this alert appears new. Ignore release, namespace, level, everything but + // AlertName, just see if it's in the data file and when we first observed it in either of the two releases we expect in there. + var firstObserved *time.Time + var sawAlertName bool + for _, hd := range historicalData.HistoricalData { + if hd.AlertName == alertName { + sawAlertName = true + if firstObserved == nil || (!hd.FirstObserved.IsZero() && hd.FirstObserved.Before(*firstObserved)) { + firstObserved = &hd.FirstObserved + } + } + } + + if !sawAlertName { + violation := fmt.Sprintf("%s has no test data, this alert appears new and should not be firing", alertName) + logrus.Warn(violation) + newAlertsFiring = append(newAlertsFiring, violation) + } else if firstObserved != nil && time.Now().Sub(*firstObserved) <= 14*24*time.Hour { + // if we did get data, check how old the first observed timestamp was, too new and we're going to fire as well. + violation := fmt.Sprintf("%s was first observed on %s, this alert appears new and should not be firing", alertName, firstObserved.UTC().Format(time.RFC3339)) + logrus.Warn(violation) + newAlertsFiring = append(newAlertsFiring, violation) + } + // if firstObserved was still nil, we can't do the two week check, but we saw the alert, so assume a test pass, the alert must be known + } + + if len(newAlertsFiring) > 0 { + output := fmt.Sprintf("Found alerts firing which are new or less than two weeks old, which should not be firing: \n\n%s", + strings.Join(newAlertsFiring, "\n")) + ret = append(ret, &junitapi.JUnitTestCase{ + Name: testName, + FailureOutput: &junitapi.FailureOutput{ + Output: output, + }, + SystemOut: output, + }) + } + return ret +} diff --git a/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts_test.go b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts_test.go new file mode 100644 index 000000000000..301771c57161 --- /dev/null +++ b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts_test.go @@ -0,0 +1,123 @@ +package legacytestframeworkmonitortests + +import ( + "testing" + "time" + + "github.com/openshift/origin/pkg/monitor/monitorapi" + "github.com/openshift/origin/pkg/monitortestlibrary/historicaldata" + "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNoNewAlertsFiringBackstop(t *testing.T) { + + awsJob := platformidentification.JobType{ + Release: "4.15", + FromRelease: "4.14", + Platform: "aws", + Architecture: "amd64", + Network: "ovn", + Topology: "ha", + } + fakeAlertKey := historicaldata.AlertDataKey{ + AlertName: "FakeAlert", + AlertNamespace: "fakens", + AlertLevel: "Warning", + JobType: awsJob, + } + firstObvRecent := time.Now().Add(-4 * 24 * time.Hour) // 4 days ago + firstObvAncient := time.Now().Add(-60 * 24 * time.Hour) // 60 days ago + + interval := monitorapi.Interval{ + Condition: monitorapi.Condition{ + Level: monitorapi.Warning, + StructuredLocator: monitorapi.Locator{ + Type: 
monitorapi.LocatorTypeAlert, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorAlertKey: "FakeAlert", + monitorapi.LocatorNamespaceKey: "fakens", + }, + }, + StructuredMessage: monitorapi.Message{ + HumanMessage: "jibberish", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationAlertState: "firing", + monitorapi.AnnotationSeverity: "warning", + }, + }, + }, + Source: monitorapi.SourceAlert, + Display: false, + From: time.Now().Add(-5 * time.Hour), + To: time.Now().Add(-6 * time.Hour), + } + + tests := []struct { + name string + historicalData *historicaldata.AlertBestMatcher + firingIntervals monitorapi.Intervals + expectedStatus []string // "pass", "fail", in the order we expect them to appear, one of each for flakes + }{ + { + name: "firing alert first observed recently in few jobs", + historicalData: historicaldata.NewAlertMatcherWithHistoricalData( + map[historicaldata.AlertDataKey]historicaldata.AlertStatisticalData{ + fakeAlertKey: { + AlertDataKey: fakeAlertKey, + Name: "FakeAlert", + FirstObserved: firstObvRecent, + LastObserved: time.Now(), + JobRuns: 5, // less than 100 + }, + }), + firingIntervals: monitorapi.Intervals{interval}, + expectedStatus: []string{"pass", "fail"}, + }, + { + name: "firing alert never seen before", + historicalData: historicaldata.NewAlertMatcherWithHistoricalData( + map[historicaldata.AlertDataKey]historicaldata.AlertStatisticalData{}), + firingIntervals: monitorapi.Intervals{interval}, + expectedStatus: []string{"pass", "fail"}, + }, + { + name: "firing alert observed more than two weeks ago", + historicalData: historicaldata.NewAlertMatcherWithHistoricalData( + map[historicaldata.AlertDataKey]historicaldata.AlertStatisticalData{ + fakeAlertKey: { + AlertDataKey: fakeAlertKey, + Name: "FakeAlert", + FirstObserved: firstObvAncient, + LastObserved: time.Now(), + JobRuns: 5, // less than 100 + }, + }), + firingIntervals: monitorapi.Intervals{interval}, + expectedStatus: []string{"pass"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + results := runNoNewAlertsFiringTest(tt.historicalData, tt.firingIntervals) + for _, r := range results { + t.Logf("%s failure output was: %s", r.Name, r.FailureOutput) + } + require.Equal(t, len(tt.expectedStatus), len(results)) + for i := range tt.expectedStatus { + switch tt.expectedStatus[i] { + case "pass": + assert.Nil(t, results[i].FailureOutput) + case "fail": + assert.NotNil(t, results[i].FailureOutput) + default: + t.Logf("invalid test input, should be 'pass' or 'fail': %s", tt.expectedStatus[i]) + t.Fail() + } + } + }) + } + +} diff --git a/pkg/monitortests/testframework/timelineserializer/rendering.go b/pkg/monitortests/testframework/timelineserializer/rendering.go index b103281e87b3..8fded66677db 100644 --- a/pkg/monitortests/testframework/timelineserializer/rendering.go +++ b/pkg/monitortests/testframework/timelineserializer/rendering.go @@ -53,7 +53,7 @@ func (r eventIntervalRenderer) writeEventData(artifactDir, filenameBase string, errs := []error{} interestingEvents := events.Filter(r.filter) - if err := monitorserialization.EventsIntervalsToFile(filepath.Join(artifactDir, fmt.Sprintf("%s.json", filenameBase)), interestingEvents); err != nil { + if err := monitorserialization.IntervalsToFile(filepath.Join(artifactDir, fmt.Sprintf("%s.json", filenameBase)), interestingEvents); err != nil { errs = append(errs, err) } diff --git a/pkg/monitortests/testframework/timelineserializer/rendering_test.go 
b/pkg/monitortests/testframework/timelineserializer/rendering_test.go index 0ad29ddc4b9c..3f55f1b8b4ff 100644 --- a/pkg/monitortests/testframework/timelineserializer/rendering_test.go +++ b/pkg/monitortests/testframework/timelineserializer/rendering_test.go @@ -11,7 +11,7 @@ import ( var skipE2e []byte func TestBelongsInKubeAPIServer(t *testing.T) { - inputIntervals, err := monitorserialization.EventsFromJSON(skipE2e) + inputIntervals, err := monitorserialization.IntervalsFromJSON(skipE2e) if err != nil { t.Fatal(err) } diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go index 9600c3fd0a0d..5d253837abf0 100644 --- a/test/extended/prometheus/prometheus.go +++ b/test/extended/prometheus/prometheus.go @@ -543,14 +543,11 @@ var _ = g.Describe("[sig-instrumentation] Prometheus [apigroup:image.openshift.i }) g.It("shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early][apigroup:config.openshift.io]", func() { - // Checking Watchdog alert state is done in "should have a Watchdog alert in firing state". - allowedAlertNames := []string{ - "Watchdog", - "AlertmanagerReceiversNotConfigured", - "PrometheusRemoteWriteDesiredShards", - "KubeJobFailed", // this is a result of bug https://bugzilla.redhat.com/show_bug.cgi?id=2054426 . We should catch these in the late test above. - } + // Copy so we can expand: + allowedAlertNames := make([]string, len(allowedalerts2.AllowedAlertNames)) + copy(allowedAlertNames, allowedalerts2.AllowedAlertNames) + // Checking Watchdog alert state is done in "should have a Watchdog alert in firing state". // we exclude alerts that have their own separate tests. for _, alertTest := range allowedalerts2.AllAlertTests(&platformidentification.JobType{}, allowedalerts2.DefaultAllowances) { allowedAlertNames = append(allowedAlertNames, alertTest.AlertName()) diff --git a/test/extended/util/prometheus/helpers.go b/test/extended/util/prometheus/helpers.go index 1bfa2fac6b46..6b7058a63d7a 100644 --- a/test/extended/util/prometheus/helpers.go +++ b/test/extended/util/prometheus/helpers.go @@ -161,9 +161,10 @@ func (c MetricConditions) Matches(sample *model.Sample) *MetricCondition { func (c MetricConditions) MatchesInterval(alertInterval monitorapi.Interval) *MetricCondition { - // Parse out the alertInterval: - checkAlertName := monitorapi.AlertFromLocator(alertInterval.Locator) - checkAlertNamespace := monitorapi.NamespaceFromLocator(alertInterval.Locator) + // TODO: Source check for SourceAlert would be a good idea here. 
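+	// Note (descriptive comment, not in the original patch): the name and namespace are read straight from
+	// the structured locator keys instead of re-parsing the legacy locator string; a missing key yields an
+	// empty string, so intervals without those keys simply match no condition and fall through to nil below.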
+ + checkAlertName := alertInterval.StructuredLocator.Keys[monitorapi.LocatorAlertKey] + checkAlertNamespace := alertInterval.StructuredLocator.Keys[monitorapi.LocatorNamespaceKey] for _, condition := range c { if checkAlertName == condition.AlertName && checkAlertNamespace == condition.AlertNamespace { @@ -173,21 +174,6 @@ func (c MetricConditions) MatchesInterval(alertInterval monitorapi.Interval) *Me return nil } -func LabelsAsSelector(l model.LabelSet) string { - return l.String() -} - -func StripLabels(m model.Metric, names ...string) model.LabelSet { - labels := make(model.LabelSet) - for k := range m { - labels[k] = m[k] - } - for _, name := range names { - delete(labels, model.LabelName(name)) - } - return labels -} - func RunQuery(ctx context.Context, prometheusClient prometheusv1.API, query string) (*PrometheusResponse, error) { return RunQueryAtTime(ctx, prometheusClient, query, time.Now()) } From 747fd093d51f8095ab3a7f18e980e337029a69cc Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Fri, 20 Oct 2023 08:07:33 -0300 Subject: [PATCH 7/8] Don't fire on alerts at sev info --- .../legacytestframeworkmonitortests/alerts.go | 14 +++++++-- .../alerts_test.go | 29 +++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go index 4eb788e6f1bc..cb6946b6874f 100644 --- a/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go +++ b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts.go @@ -244,9 +244,10 @@ func isSkippedAlert(alertName string) bool { return false } -// runNoNewAlertsFiringTest checks all alerts to see if we: -// 1. have no historical data for that alert in that namespace for this release -// 2. have historical data but it was first observed less than 2 weeks ago +// runNoNewAlertsFiringTest checks all firing non-info alerts to see if we: +// +// - have no historical data for that alert in that namespace for this release, or +// - have historical data but it was first observed less than 2 weeks ago // // If either is true, this test will fail. We do not want new product alerts being added to the product that // will trigger routinely and affect the fleet when they ship. @@ -272,6 +273,13 @@ func runNoNewAlertsFiringTest(historicalData *historicaldata.AlertBestMatcher, continue } + // Skip alerts with severity info, I don't totally understand the semantics here but it appears some components + // use this for informational alerts. (saw two examples from Insights) + // We're only interested in warning+critical for the purposes of this test. + if interval.StructuredMessage.Annotations[monitorapi.AnnotationSeverity] == "info" { + continue + } + // Scan historical data to see if this alert appears new. Ignore release, namespace, level, everything but // AlertName, just see if it's in the data file and when we first observed it in either of the two releases we expect in there. 
var firstObserved *time.Time diff --git a/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts_test.go b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts_test.go index 301771c57161..124375f4904e 100644 --- a/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts_test.go +++ b/pkg/monitortests/testframework/legacytestframeworkmonitortests/alerts_test.go @@ -82,6 +82,35 @@ func TestNoNewAlertsFiringBackstop(t *testing.T) { firingIntervals: monitorapi.Intervals{interval}, expectedStatus: []string{"pass", "fail"}, }, + { + name: "firing severity info alert never seen before", + historicalData: historicaldata.NewAlertMatcherWithHistoricalData( + map[historicaldata.AlertDataKey]historicaldata.AlertStatisticalData{}), + firingIntervals: monitorapi.Intervals{monitorapi.Interval{ + Condition: monitorapi.Condition{ + Level: monitorapi.Warning, + StructuredLocator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeAlert, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorAlertKey: "FakeAlert", + monitorapi.LocatorNamespaceKey: "fakens", + }, + }, + StructuredMessage: monitorapi.Message{ + HumanMessage: "jibberish", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationAlertState: "firing", + monitorapi.AnnotationSeverity: "info", + }, + }, + }, + Source: monitorapi.SourceAlert, + Display: false, + From: time.Now().Add(-5 * time.Hour), + To: time.Now().Add(-6 * time.Hour), + }}, + expectedStatus: []string{"pass"}, // info severity alerts should not fail this test + }, { name: "firing alert observed more than two weeks ago", historicalData: historicaldata.NewAlertMatcherWithHistoricalData( From dc3c6fc745c684bed3cc93eb29281c5df591a1b7 Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Fri, 20 Oct 2023 08:11:32 -0300 Subject: [PATCH 8/8] Remove the display bool for now, will re-add in another PR as it fails tests --- pkg/monitor/serialization/serialize.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pkg/monitor/serialization/serialize.go b/pkg/monitor/serialization/serialize.go index cf96d249af0c..0c47779c4d53 100644 --- a/pkg/monitor/serialization/serialize.go +++ b/pkg/monitor/serialization/serialize.go @@ -21,8 +21,7 @@ type EventInterval struct { // TODO: Remove the omitempty, just here to keep from having to repeatedly updated the json // files used in some new tests - Source string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive - Display bool `json:"display"` + Source string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive // TODO: we're hoping to move these to just locator/message when everything is ready. 
StructuredLocator monitorapi.Locator `json:"tempStructuredLocator"` @@ -65,8 +64,7 @@ func IntervalsFromJSON(data []byte) (monitorapi.Intervals, error) { return nil, err } events = append(events, monitorapi.Interval{ - Source: monitorapi.IntervalSource(interval.Source), - Display: interval.Display, + Source: monitorapi.IntervalSource(interval.Source), Condition: monitorapi.Condition{ Level: level, Locator: interval.Locator, @@ -93,8 +91,7 @@ func IntervalFromJSON(data []byte) (*monitorapi.Interval, error) { return nil, err } return &monitorapi.Interval{ - Source: monitorapi.IntervalSource(serializedInterval.Source), - Display: serializedInterval.Display, + Source: monitorapi.IntervalSource(serializedInterval.Source), Condition: monitorapi.Condition{ Level: level, Locator: serializedInterval.Locator, @@ -166,7 +163,6 @@ func monitorEventIntervalToEventInterval(interval monitorapi.Interval) EventInte StructuredLocator: interval.StructuredLocator, StructuredMessage: interval.StructuredMessage, Source: string(interval.Source), - Display: interval.Display, From: metav1.Time{Time: interval.From}, To: metav1.Time{Time: interval.To},