Skip to content

Commit

Permalink
Revert "TRT-1625: Revert #28741 "OCPBUGS-32923: Fix allowed firing al…
Browse files Browse the repository at this point in the history
…erts not triggering""
  • Loading branch information
dgoodwin committed Apr 26, 2024
1 parent b04fe1a commit db91c45
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 107 deletions.
54 changes: 28 additions & 26 deletions pkg/alerts/allowed_conformance.go
Expand Up @@ -2,61 +2,63 @@ package alerts

import (
configv1 "github.com/openshift/api/config/v1"
helper "github.com/openshift/origin/test/extended/util/prometheus"
)

// AllowedAlertsDuringConformance lists all alerts that are allowed to be pending or firing during
// conformance testing.
// WARNING: there is a parallel list for allowed alerts during upgrade in allowed_upgrade.go,
// ensure that alerts we want to allow in both are added to both.
func AllowedAlertsDuringConformance(featureSet configv1.FeatureSet) (allowedFiringWithBugs, allowedFiring, allowedPendingWithBugs, allowedPending helper.MetricConditions) {
func AllowedAlertsDuringConformance(featureSet configv1.FeatureSet) (allowedFiringWithBugs, allowedFiring, allowedPendingWithBugs, allowedPending MetricConditions) {

firingAlertsWithBugs := helper.MetricConditions{}
allowedFiringAlerts := helper.MetricConditions{
firingAlertsWithBugs := MetricConditions{}
allowedFiringAlerts := MetricConditions{
{
Selector: map[string]string{"alertname": "TargetDown", "namespace": "openshift-e2e-loki"},
Text: "Loki is nice to have, but we can allow it to be down",
AlertName: "TargetDown",
AlertNamespace: "openshift-e2e-loki",
Text: "Loki is nice to have, but we can allow it to be down",
},
{
Selector: map[string]string{"alertname": "KubePodNotReady", "namespace": "openshift-e2e-loki"},
Text: "Loki is nice to have, but we can allow it to be down",
AlertName: "KubePodNotReady",
AlertNamespace: "openshift-e2e-loki",
Text: "Loki is nice to have, but we can allow it to be down",
},
{
Selector: map[string]string{"alertname": "KubeDeploymentReplicasMismatch", "namespace": "openshift-e2e-loki"},
Text: "Loki is nice to have, but we can allow it to be down",
AlertName: "KubeDeploymentReplicasMismatch",
AlertNamespace: "openshift-e2e-loki",
Text: "Loki is nice to have, but we can allow it to be down",
},
{
Selector: map[string]string{"alertname": "HighOverallControlPlaneCPU"},
Text: "high CPU utilization during e2e runs is normal",
AlertName: "HighOverallControlPlaneCPU",
Text: "high CPU utilization during e2e runs is normal",
},
{
Selector: map[string]string{"alertname": "ExtremelyHighIndividualControlPlaneCPU"},
Text: "high CPU utilization during e2e runs is normal",
AlertName: "ExtremelyHighIndividualControlPlaneCPU",
Text: "high CPU utilization during e2e runs is normal",
},
}
pendingAlertsWithBugs := helper.MetricConditions{}
allowedPendingAlerts := helper.MetricConditions{
pendingAlertsWithBugs := MetricConditions{}
allowedPendingAlerts := MetricConditions{
{
Selector: map[string]string{"alertname": "HighOverallControlPlaneCPU"},
Text: "high CPU utilization during e2e runs is normal",
AlertName: "HighOverallControlPlaneCPU",
Text: "high CPU utilization during e2e runs is normal",
},
{
Selector: map[string]string{"alertname": "ExtremelyHighIndividualControlPlaneCPU"},
Text: "high CPU utilization during e2e runs is normal",
AlertName: "ExtremelyHighIndividualControlPlaneCPU",
Text: "high CPU utilization during e2e runs is normal",
},
}

switch featureSet {
case configv1.TechPreviewNoUpgrade:
allowedFiringAlerts = append(
allowedFiringAlerts,
helper.MetricCondition{
Selector: map[string]string{"alertname": "TechPreviewNoUpgrade"},
Text: "Allow testing of TechPreviewNoUpgrade clusters, this will only fire when a FeatureGate has been enabled",
MetricCondition{
AlertName: "TechPreviewNoUpgrade",
Text: "Allow testing of TechPreviewNoUpgrade clusters, this will only fire when a FeatureGate has been enabled",
},
helper.MetricCondition{
Selector: map[string]string{"alertname": "ClusterNotUpgradeable"},
Text: "Allow testing of ClusterNotUpgradeable clusters, this will only fire when a FeatureGate has been enabled",
MetricCondition{
AlertName: "ClusterNotUpgradeable",
Text: "Allow testing of ClusterNotUpgradeable clusters, this will only fire when a FeatureGate has been enabled",
})
}

Expand Down
42 changes: 22 additions & 20 deletions pkg/alerts/allowed_upgrade.go
Expand Up @@ -2,50 +2,52 @@ package alerts

import (
configv1 "github.com/openshift/api/config/v1"
helper "github.com/openshift/origin/test/extended/util/prometheus"
)

// AllowedAlertsDuringUpgrade lists all alerts that are allowed to be pending or firing during
// upgrade.
// WARNING: there is a parallel list for allowed alerts during conformance in allowed_conformance.go,
// ensure that alerts we want to allow in both are added to both.
func AllowedAlertsDuringUpgrade(featureSet configv1.FeatureSet) (allowedFiringWithBugs, allowedFiring, allowedPendingWithBugs, allowedPending helper.MetricConditions) {
func AllowedAlertsDuringUpgrade(featureSet configv1.FeatureSet) (allowedFiringWithBugs, allowedFiring, allowedPendingWithBugs, allowedPending MetricConditions) {

firingAlertsWithBugs := helper.MetricConditions{}
firingAlertsWithBugs := MetricConditions{}

allowedFiringAlerts := helper.MetricConditions{
allowedFiringAlerts := MetricConditions{
{
Selector: map[string]string{"alertname": "TargetDown", "namespace": "openshift-e2e-loki"},
Text: "Loki is nice to have, but we can allow it to be down",
AlertName: "TargetDown",
AlertNamespace: "openshift-e2e-loki",
Text: "Loki is nice to have, but we can allow it to be down",
},
{
Selector: map[string]string{"alertname": "KubePodNotReady", "namespace": "openshift-e2e-loki"},
Text: "Loki is nice to have, but we can allow it to be down",
AlertName: "KubePodNotReady",
AlertNamespace: "openshift-e2e-loki",
Text: "Loki is nice to have, but we can allow it to be down",
},
{
Selector: map[string]string{"alertname": "KubeDeploymentReplicasMismatch", "namespace": "openshift-e2e-loki"},
Text: "Loki is nice to have, but we can allow it to be down",
AlertName: "KubeDeploymentReplicasMismatch",
AlertNamespace: "openshift-e2e-loki",
Text: "Loki is nice to have, but we can allow it to be down",
},
}
pendingAlertsWithBugs := helper.MetricConditions{}
allowedPendingAlerts := helper.MetricConditions{
pendingAlertsWithBugs := MetricConditions{}
allowedPendingAlerts := MetricConditions{
{
Selector: map[string]string{"alertname": "etcdMemberCommunicationSlow"},
Text: "Excluded because it triggers during upgrade (detects ~5m of high latency immediately preceeding the end of the test), and we don't want to change the alert because it is correct",
AlertName: "etcdMemberCommunicationSlow",
Text: "Excluded because it triggers during upgrade (detects ~5m of high latency immediately preceeding the end of the test), and we don't want to change the alert because it is correct",
},
}

switch featureSet {
case configv1.TechPreviewNoUpgrade:
allowedFiringAlerts = append(
allowedFiringAlerts,
helper.MetricCondition{
Selector: map[string]string{"alertname": "TechPreviewNoUpgrade"},
Text: "Allow testing of TechPreviewNoUpgrade clusters, this will only fire when a FeatureGate has been enabled",
MetricCondition{
AlertName: "TechPreviewNoUpgrade",
Text: "Allow testing of TechPreviewNoUpgrade clusters, this will only fire when a FeatureGate has been enabled",
},
helper.MetricCondition{
Selector: map[string]string{"alertname": "ClusterNotUpgradeable"},
Text: "Allow testing of ClusterNotUpgradeable clusters, this will only fire when a FeatureGate has been enabled",
MetricCondition{
AlertName: "ClusterNotUpgradeable",
Text: "Allow testing of ClusterNotUpgradeable clusters, this will only fire when a FeatureGate has been enabled",
})
}

Expand Down
45 changes: 45 additions & 0 deletions pkg/alerts/types.go
@@ -0,0 +1,45 @@
package alerts

import (
"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/prometheus/common/model"
)

type MetricCondition struct {
AlertName string
AlertNamespace string
AlertLevel string

// Text is the description of why this alert condition matched.
Text string

Matches func(sample *model.Sample) bool
}

type MetricConditions []MetricCondition

func (c MetricConditions) MatchesInterval(alertInterval monitorapi.Interval) *MetricCondition {

if alertInterval.Source != monitorapi.SourceAlert {
return nil
}

checkAlertName := alertInterval.StructuredLocator.Keys[monitorapi.LocatorAlertKey]
checkAlertNamespace := alertInterval.StructuredLocator.Keys[monitorapi.LocatorNamespaceKey]

for _, condition := range c {
matches := true
// We can assume AlertName is set:
if condition.AlertName != checkAlertName {
matches = false
// But Namespace may not be:
} else if condition.AlertNamespace != "" && condition.AlertNamespace != checkAlertNamespace {
matches = false
}

if matches {
return &condition
}
}
return nil
}
Expand Up @@ -6,6 +6,7 @@ import (
"strings"
"time"

"github.com/openshift/origin/pkg/alerts"
"github.com/openshift/origin/pkg/monitortestframework"
"github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
"github.com/openshift/origin/pkg/monitortestlibrary/historicaldata"
Expand All @@ -14,11 +15,9 @@ import (
configv1 "github.com/openshift/api/config/v1"
"github.com/sirupsen/logrus"

configv1client "github.com/openshift/client-go/config/clientset/versioned"
"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
helper "github.com/openshift/origin/test/extended/util/prometheus"

configv1client "github.com/openshift/client-go/config/clientset/versioned"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
Expand All @@ -27,7 +26,7 @@ import (
"k8s.io/kubernetes/test/e2e/framework"
)

type AllowedAlertsFunc func(featureSet configv1.FeatureSet) (allowedFiringWithBugs, allowedFiring, allowedPendingWithBugs, allowedPending helper.MetricConditions)
type AllowedAlertsFunc func(featureSet configv1.FeatureSet) (allowedFiringWithBugs, allowedFiring, allowedPendingWithBugs, allowedPending alerts.MetricConditions)

func testAlerts(events monitorapi.Intervals,
allowancesFunc AllowedAlertsFunc,
Expand Down Expand Up @@ -143,23 +142,23 @@ func runBackstopTest(
case allowedalerts.AlertPending:
// a pending test covers pending and everything above (firing)
allowedPendingAlerts = append(allowedPendingAlerts,
helper.MetricCondition{
Selector: map[string]string{"alertname": alertTest.AlertName()},
Text: "has a separate e2e test",
alerts.MetricCondition{
AlertName: alertTest.AlertName(),
Text: "has a separate e2e test",
},
)
allowedFiringAlerts = append(allowedFiringAlerts,
helper.MetricCondition{
Selector: map[string]string{"alertname": alertTest.AlertName()},
Text: "has a separate e2e test",
alerts.MetricCondition{
AlertName: alertTest.AlertName(),
Text: "has a separate e2e test",
},
)
case allowedalerts.AlertInfo:
// an info test covers all firing
allowedFiringAlerts = append(allowedFiringAlerts,
helper.MetricCondition{
Selector: map[string]string{"alertname": alertTest.AlertName()},
Text: "has a separate e2e test",
alerts.MetricCondition{
AlertName: alertTest.AlertName(),
Text: "has a separate e2e test",
},
)
}
Expand Down
48 changes: 0 additions & 48 deletions test/extended/util/prometheus/helpers.go
Expand Up @@ -15,7 +15,6 @@ import (

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
"github.com/openshift/origin/pkg/monitor/monitorapi"
exutil "github.com/openshift/origin/test/extended/util"
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
Expand Down Expand Up @@ -151,53 +150,6 @@ func RequestPrometheusServiceAccountAPIToken(ctx context.Context, oc *exutil.CLI
return req.Status.Token, nil
}

type MetricCondition struct {
// TODO: Remove in favor of explicit fields
Selector map[string]string

AlertName string
AlertNamespace string
AlertLevel string

// Text is the description of why this alert condition matched.
Text string

Matches func(sample *model.Sample) bool
}

type MetricConditions []MetricCondition

func (c MetricConditions) Matches(sample *model.Sample) *MetricCondition {
for i, condition := range c {
matches := true
for name, value := range condition.Selector {
if sample.Metric[model.LabelName(name)] != model.LabelValue(value) {
matches = false
break
}
}
if matches && (condition.Matches == nil || condition.Matches(sample)) {
return &c[i]
}
}
return nil
}

func (c MetricConditions) MatchesInterval(alertInterval monitorapi.Interval) *MetricCondition {

// TODO: Source check for SourceAlert would be a good idea here.

checkAlertName := alertInterval.StructuredLocator.Keys[monitorapi.LocatorAlertKey]
checkAlertNamespace := alertInterval.StructuredLocator.Keys[monitorapi.LocatorNamespaceKey]

for _, condition := range c {
if checkAlertName == condition.AlertName && checkAlertNamespace == condition.AlertNamespace {
return &condition
}
}
return nil
}

func RunQuery(ctx context.Context, prometheusClient prometheusv1.API, query string) (*PrometheusResponse, error) {
return RunQueryAtTime(ctx, prometheusClient, query, time.Now())
}
Expand Down

0 comments on commit db91c45

Please sign in to comment.