package alert

import (
	"context"
	"fmt"
	"time"

	g "github.com/onsi/ginkgo"
	o "github.com/onsi/gomega"

	exutil "github.com/openshift/origin/test/extended/util"
	helper "github.com/openshift/origin/test/extended/util/prometheus"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/test/e2e/framework"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	"k8s.io/kubernetes/test/e2e/upgrades"
)

const (
	// Delay after the upgrade completes before checking for critical alerts.
	alertCheckSleepMinutes = 5
	alertCheckSleep        = alertCheckSleepMinutes * time.Minute

	// Length, in minutes, of the preceding period in which to check for critical alerts.
	alertPeriodCheckMinutes = 1
)

// UpgradeTest runs after an upgrade completes and, following an alertCheckSleep
// delay, fails if any critical alerts are firing.
type UpgradeTest struct {
	url         string
	bearerToken string
	oc          *exutil.CLI
}

func (UpgradeTest) Name() string { return "check-for-critical-alerts" }

func (UpgradeTest) DisplayName() string {
	return "[sig-arch] Check if critical alerts are firing after upgrade success"
}

// Setup locates the in-cluster Prometheus instance and records the parameters
// needed to query it.
func (t *UpgradeTest) Setup(f *framework.Framework) {
	g.By("Setting up post-upgrade alert test")

	url, bearerToken, oc, ok := helper.ExpectPrometheus(f)
	if !ok {
		framework.Failf("Prometheus could not be located on this cluster, failing test %s", t.Name())
	}
	t.url = url
	t.bearerToken = bearerToken
	t.oc = oc

	framework.Logf("Post-upgrade alert test setup complete")
}

// Test blocks until the upgrade finishes, waits an additional alertCheckSleep,
// and then fails if any critical alerts are firing.
func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade upgrades.UpgradeType) {
	g.By("Checking for critical alerts")

	// Recover the current test if it fails so the test suite can complete.
	defer g.GinkgoRecover()

	// Block until the upgrade is done.
	g.By("Waiting for upgrade to finish before checking for critical alerts")
	<-done

	// The context must outlive the deferred pod deletion below, so cancel it
	// only when Test returns rather than immediately after the sleep.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Additional delay after upgrade completion.
	g.By("Waiting before checking for critical alerts")
	time.Sleep(alertCheckSleep)

	if exutil.TolerateVersionSkewInTests() {
		e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions, and skewed versions trigger multiple other alerts")
	}

	t.oc.SetupProject()
	ns := t.oc.Namespace()
	execPod := exutil.CreateExecPodOrFail(t.oc.AdminKubeClient(), ns, "execpod")
	defer func() {
		t.oc.AdminKubeClient().CoreV1().Pods(ns).Delete(ctx, execPod.Name, *metav1.NewDeleteOptions(1))
	}()
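
	// The queries below are executed from inside this exec pod so they can
	// reach the in-cluster Prometheus endpoint, authenticating with the
	// bearer token gathered during Setup.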

	// Query to check if Prometheus has been up and running for the entire
	// post-upgrade period by verifying the Watchdog alert has been in the
	// firing state.
	watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)
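
	// Watchdog fires continuously by design, so a non-empty result over the
	// full window indicates Prometheus was up and collecting data the whole
	// time.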

	// Query to check for any critical-severity alerts that have fired within
	// the last alertPeriodCheckMinutes.
	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)
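
	// Watchdog is excluded because it fires by design, and
	// AlertmanagerReceiversNotConfigured is expected on clusters with no
	// alert receivers configured; KubeAPILatencyHigh is presumably excluded
	// as known-noisy around upgrades.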

	tests := map[string]bool{
		watchdogQuery:      true,
		criticalAlertQuery: false,
	}
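
	// The boolean records whether each query is expected to return results:
	// the watchdog query must match (Prometheus was up), while the critical
	// alert query must not (no critical alerts fired).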
	err := helper.RunQueries(tests, t.oc, ns, execPod.Name, t.url, t.bearerToken)
	o.Expect(err).NotTo(o.HaveOccurred())

	framework.Logf("No critical alerts firing post-upgrade")
}

// Teardown cleans up any remaining resources.
func (t *UpgradeTest) Teardown(f *framework.Framework) {
	// rely on the namespace deletion to clean up everything
}
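
// A minimal usage sketch (not part of this file): an upgrade suite would
// typically register this test alongside others, e.g.
//
//	var upgradeTests = []upgrades.Test{
//		&alert.UpgradeTest{},
//	}
//
// This assumes UpgradeTest satisfies the upgrades.Test interface from the
// vendored k8s.io/kubernetes/test/e2e/upgrades package; the exact interface
// (for example, whether a Skip method is also required) depends on the
// vendored version.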