Skip to content

Commit 097e7a6

Browse files
simonpasquier and bison
authored and committed
test/extended/prometheus: Validate alerting rules
The [Alerting Consistency][1] enhancement, and the proposed updates to it in [openshift/enhancements #897][2], define a style-guide for the alerts shipped as part of OpenShift. This adds a test validating some of the guidelines considered required. [1]: https://github.com/openshift/enhancements/blob/master/enhancements/monitoring/alerting-consistency.md [2]: openshift/enhancements#897
1 parent a061ca7 commit 097e7a6

File tree

4 files changed

+280
-0
lines changed

4 files changed

+280
-0
lines changed

test/extended/prometheus/prometheus.go

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313

1414
g "github.com/onsi/ginkgo"
1515
o "github.com/onsi/gomega"
16+
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
1617
dto "github.com/prometheus/client_model/go"
1718
"github.com/prometheus/common/expfmt"
1819
"github.com/prometheus/common/model"
@@ -42,6 +43,171 @@ import (
4243
helper "github.com/openshift/origin/test/extended/util/prometheus"
4344
)
4445

46+
var _ = g.Describe("[sig-instrumentation][Late] OpenShift alerting rules", func() {
47+
defer g.GinkgoRecover()
48+
49+
// These alerts are known to be missing the summary and/or description
50+
// labels. Bugzillas are being filed, and will be linked here. These
51+
// should be fixed one-by-one and removed from this list.
52+
descriptionExceptions := sets.NewString(
53+
"APIRemovedInNextEUSReleaseInUse",
54+
"APIRemovedInNextReleaseInUse",
55+
"CertifiedOperatorsCatalogError",
56+
"CloudCredentialOperatorDeprovisioningFailed",
57+
"CloudCredentialOperatorInsufficientCloudCreds",
58+
"CloudCredentialOperatorProvisioningFailed",
59+
"CloudCredentialOperatorTargetNamespaceMissing",
60+
"ClusterProxyApplySlow",
61+
"CommunityOperatorsCatalogError",
62+
"CsvAbnormalFailedOver2Min",
63+
"CsvAbnormalOver30Min",
64+
"ExtremelyHighIndividualControlPlaneCPU",
65+
"HAProxyDown",
66+
"HAProxyReloadFail",
67+
"HighOverallControlPlaneCPU",
68+
"ImageRegistryStorageReconfigured",
69+
"IngressControllerDegraded",
70+
"IngressControllerUnavailable",
71+
"InstallPlanStepAppliedWithWarnings",
72+
"KubeControllerManagerDown",
73+
"KubeSchedulerDown",
74+
"KubeletHealthState",
75+
"MCDDrainError",
76+
"MCDPivotError",
77+
"MCDRebootError",
78+
"MachineAPIOperatorMetricsCollectionFailing",
79+
"MachineApproverMaxPendingCSRsReached",
80+
"MachineHealthCheckUnterminatedShortCircuit",
81+
"MachineNotYetDeleted",
82+
"MachineWithNoRunningPhase",
83+
"MachineWithoutValidNode",
84+
"MasterNodesHighMemoryUsage",
85+
"NodeProxyApplySlow",
86+
"NodeProxyApplyStale",
87+
"NodeWithoutSDNPod",
88+
"PodDisruptionBudgetAtLimit",
89+
"PodDisruptionBudgetLimit",
90+
"RedhatMarketplaceCatalogError",
91+
"RedhatOperatorsCatalogError",
92+
"SDNPodNotReady",
93+
"SamplesDegraded",
94+
"SamplesImagestreamImportFailing",
95+
"SamplesInvalidConfig",
96+
"SamplesMissingSecret",
97+
"SamplesMissingTBRCredential",
98+
"SamplesRetriesMissingOnImagestreamImportFailing",
99+
"SamplesTBRInaccessibleOnBoot",
100+
"SchedulerLegacyPolicySet",
101+
"SystemMemoryExceedsReservation",
102+
"TechPreviewNoUpgrade",
103+
"etcdBackendQuotaLowSpace",
104+
"etcdExcessiveDatabaseGrowth",
105+
"etcdHighFsyncDurations",
106+
)
107+
108+
var alertingRules map[string][]promv1.AlertingRule
109+
oc := exutil.NewCLIWithoutNamespace("prometheus")
110+
111+
g.BeforeEach(func() {
112+
url, _, bearerToken, ok := helper.LocatePrometheus(oc)
113+
if !ok {
114+
e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
115+
}
116+
117+
if alertingRules == nil {
118+
var err error
119+
120+
alertingRules, err = helper.FetchAlertingRules(oc, url, bearerToken)
121+
if err != nil {
122+
e2e.Failf("Failed to fetch alerting rules: %v", err)
123+
}
124+
}
125+
})
126+
127+
g.It("should have a valid severity label", func() {
128+
helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String {
129+
severityRe := regexp.MustCompile("^critical|warning|info$")
130+
131+
severity, found := alert.Labels["severity"]
132+
if !found {
133+
return sets.NewString("has no 'severity' label")
134+
}
135+
136+
if !severityRe.MatchString(string(severity)) {
137+
return sets.NewString(
138+
fmt.Sprintf("has a 'severity' label value of %q which doesn't match %q",
139+
severity, severityRe.String(),
140+
),
141+
)
142+
}
143+
144+
return nil
145+
})
146+
})
147+
148+
g.It("should have description and summary annotations", func() {
149+
helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String {
150+
if descriptionExceptions.Has(alert.Name) {
151+
framework.Logf("Alerting rule %q is known to have missing annotations.")
152+
return nil
153+
}
154+
155+
violations := sets.NewString()
156+
157+
if _, found := alert.Annotations["description"]; !found {
158+
// If there's no 'description' annotation, but there is a
159+
// 'message' annotation, suggest renaming it.
160+
if _, found := alert.Annotations["message"]; found {
161+
violations.Insert("has no 'description' annotation, but has a 'message' annotation." +
162+
" OpenShift alerts must use 'description' -- consider renaming the annotation")
163+
} else {
164+
violations.Insert("has no 'description' annotation")
165+
}
166+
}
167+
168+
if _, found := alert.Annotations["summary"]; !found {
169+
violations.Insert("has no 'summary' annotation")
170+
}
171+
172+
return violations
173+
})
174+
})
175+
176+
g.It("should have a runbook_url annotation if the alert is critical", func() {
177+
violations := sets.NewString()
178+
179+
helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String {
180+
severity := string(alert.Labels["severity"])
181+
runbook := string(alert.Annotations["runbook_url"])
182+
183+
if severity == "critical" && runbook == "" {
184+
violations.Insert(
185+
fmt.Sprintf("WARNING: Alert %q is critical and has no 'runbook_url' annotation", alert.Name),
186+
)
187+
} else if runbook != "" {
188+
// If there's a 'runbook_url' annotation, make sure it's a
189+
// valid URL and that we can fetch the contents.
190+
if err := helper.ValidateURL(runbook, 10*time.Second); err != nil {
191+
violations.Insert(
192+
fmt.Sprintf("WARNING: Alert %q has an invalid 'runbook_url' annotation: %v",
193+
alert.Name, err),
194+
)
195+
}
196+
}
197+
198+
return nil // Always return nil, because these aren't required yet.
199+
})
200+
201+
// These aren't enforced yet. They are just reporting the errors by
202+
// calling the g.Skip() method.
203+
if len(violations) > 0 {
204+
g.Skip(
205+
fmt.Sprintf("Incompliant rules detected:\n\n%s", strings.Join(violations.List(), "\n")),
206+
)
207+
}
208+
})
209+
})
210+
45211
var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
46212
defer g.GinkgoRecover()
47213
var (

test/extended/util/annotate/generated/zz_generated.annotations.go

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/extended/util/annotate/rules.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,9 @@ var (
208208
`\[sig-instrumentation\] Prometheus when installed on the cluster shouldn't have failing rules evaluation`,
209209
`\[sig-instrumentation\] Prometheus when installed on the cluster shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured \[Early\]`,
210210
`\[sig-instrumentation\] Prometheus when installed on the cluster when using openshift-sdn should be able to get the sdn ovs flows`,
211+
`\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have a valid severity label`,
212+
`\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have description and summary annotations`,
213+
`\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have a runbook_url annotation if the alert is critical`,
211214
`\[sig-instrumentation\]\[Late\] Alerts should have a Watchdog alert in firing state the entire cluster run`,
212215
`\[sig-instrumentation\]\[Late\] Alerts shouldn't exceed the 500 series limit of total series sent via telemetry from each cluster`,
213216
`\[sig-instrumentation\]\[Late\] Alerts shouldn't report any alerts in firing or pending state apart from Watchdog and AlertmanagerReceiversNotConfigured and have no gaps in Watchdog firing`,

test/extended/util/prometheus/helpers.go

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7+
"net/http"
78
"net/url"
89
"strconv"
910
"strings"
@@ -12,8 +13,11 @@ import (
1213
g "github.com/onsi/ginkgo"
1314
o "github.com/onsi/gomega"
1415
"k8s.io/apimachinery/pkg/util/errors"
16+
"k8s.io/apimachinery/pkg/util/sets"
17+
"k8s.io/apimachinery/pkg/util/wait"
1518

1619
exutil "github.com/openshift/origin/test/extended/util"
20+
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
1721
"github.com/prometheus/common/model"
1822

1923
v1 "k8s.io/api/core/v1"
@@ -257,3 +261,104 @@ func ExpectPrometheusEndpoint(namespace, podName, url string) {
257261
}
258262
o.Expect(err).NotTo(o.HaveOccurred())
259263
}
264+
265+
// FetchAlertingRules fetchs all alerting rules from the Prometheus at
266+
// the given URL using the given bearer token. The results are returned
267+
// as a map of group names to lists of alerting rules.
268+
func FetchAlertingRules(oc *exutil.CLI, promURL, bearerToken string) (map[string][]promv1.AlertingRule, error) {
269+
ns := oc.SetupNamespace()
270+
execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod")
271+
defer func() {
272+
oc.AdminKubeClient().CoreV1().Pods(ns).Delete(context.Background(), execPod.Name, *metav1.NewDeleteOptions(1))
273+
}()
274+
275+
url := fmt.Sprintf("%s/api/v1/rules", promURL)
276+
contents, err := GetBearerTokenURLViaPod(ns, execPod.Name, url, bearerToken)
277+
if err != nil {
278+
return nil, fmt.Errorf("unable to query %s: %v", url, err)
279+
}
280+
281+
var result struct {
282+
Status string `json:"status"`
283+
Data promv1.RulesResult `json:"data"`
284+
}
285+
if err := json.Unmarshal([]byte(contents), &result); err != nil {
286+
return nil, fmt.Errorf("unable to parse response %q from %s: %v", contents, url, err)
287+
}
288+
289+
alertingRules := make(map[string][]promv1.AlertingRule)
290+
291+
for _, rg := range result.Data.Groups {
292+
for _, r := range rg.Rules {
293+
switch v := r.(type) {
294+
case promv1.RecordingRule:
295+
continue
296+
case promv1.AlertingRule:
297+
alertingRules[rg.Name] = append(alertingRules[rg.Name], v)
298+
default:
299+
return nil, fmt.Errorf("unexpected rule of type %T", r)
300+
}
301+
}
302+
}
303+
304+
return alertingRules, nil
305+
}
306+
307+
// ValidateURL takes a URL as a string and a timeout. The URL is
308+
// parsed, then fetched until a 200 response is received or the timeout
309+
// is reached.
310+
func ValidateURL(rawURL string, timeout time.Duration) error {
311+
if _, err := url.Parse(rawURL); err != nil {
312+
return err
313+
}
314+
315+
return wait.PollImmediate(1*time.Second, timeout, func() (done bool, err error) {
316+
resp, err := http.Get(rawURL)
317+
if err != nil {
318+
framework.Logf("validateURL(%q) error: %v", err)
319+
return false, nil
320+
}
321+
322+
defer resp.Body.Close()
323+
324+
if resp.StatusCode == 200 {
325+
return true, nil
326+
}
327+
328+
framework.Logf("validateURL(%q) got non-200 response: %d", rawURL, resp.StatusCode)
329+
return false, nil
330+
})
331+
}
332+
333+
// ForEachAlertingRule takes a map of rule group names to a list of
334+
// alerting rules, and for each rule in each group runs the given
335+
// function. The function takes the alerting rule, and returns a set of
336+
// violations, which maye be empty or nil. If after all rules are
337+
// checked, there are any violations, the calling test is failed, and
338+
// all violations are reported.
339+
func ForEachAlertingRule(rules map[string][]promv1.AlertingRule, f func(a promv1.AlertingRule) sets.String) {
340+
allViolations := sets.NewString()
341+
342+
for group, alerts := range rules {
343+
for _, alert := range alerts {
344+
// The Watchdog alert is special because it is only there to
345+
// test the end-to-end alert flow and it isn't meant to be
346+
// routed to cluster admins.
347+
if alert.Name == "Watchdog" {
348+
continue
349+
}
350+
351+
if violations := f(alert); violations != nil {
352+
for _, v := range violations.List() {
353+
allViolations.Insert(
354+
fmt.Sprintf("Alerting rule %q (group: %s) %s", alert.Name, group, v),
355+
)
356+
}
357+
}
358+
}
359+
}
360+
361+
if len(allViolations) > 0 {
362+
framework.Failf("Incompliant rules detected:\n\n%s", strings.Join(allViolations.List(), "\n"))
363+
}
364+
}

0 commit comments

Comments
 (0)