Merge pull request #27710 from dgoodwin/alert-revamp
Alert Testing on new Namespace and Level
openshift-merge-robot committed Feb 9, 2023
2 parents 47e582a + b5db4bb commit c2edff5
Showing 10 changed files with 46,263 additions and 1,444 deletions.
11 changes: 5 additions & 6 deletions pkg/synthetictests/allowedalerts/allow_on_revision_change.go
@@ -8,8 +8,7 @@ import (
"time"

operatorv1client "github.com/openshift/client-go/operator/clientset/versioned/typed/operator/v1"
"github.com/openshift/origin/pkg/synthetictests/platformidentification"

"github.com/openshift/origin/pkg/synthetictests/historicaldata"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)
@@ -32,7 +31,7 @@ func NewAllowedWhenEtcdRevisionChange(ctx context.Context, kubeClient kubernetes
}, nil
}

func (d *etcdRevisionChangeAllowance) FailAfter(alertName string, jobType platformidentification.JobType) (time.Duration, error) {
func (d *etcdRevisionChangeAllowance) FailAfter(key historicaldata.AlertDataKey) (time.Duration, error) {
// if the number of revisions is different compared to what we have collected at the beginning of the test suite
// increase allowed time for the alert
// the rationale is that some tests might roll out a new version of etcd; during each rollout we allow at most 3 elections per revision (we assume there are at most 3 master machines)
@@ -42,12 +41,12 @@ func (d *etcdRevisionChangeAllowance) FailAfter(alertName string, jobType platfo
return time.Duration(d.numberOfRevisionDuringTest) * 15 * time.Minute, nil

}
allowed, _, _ := getClosestPercentilesValues(alertName, jobType)
allowed, _, _ := getClosestPercentilesValues(key)
return allowed.P99, nil
}

func (d *etcdRevisionChangeAllowance) FlakeAfter(alertName string, jobType platformidentification.JobType) time.Duration {
allowed, _, _ := getClosestPercentilesValues(alertName, jobType)
func (d *etcdRevisionChangeAllowance) FlakeAfter(key historicaldata.AlertDataKey) time.Duration {
allowed, _, _ := getClosestPercentilesValues(key)
return allowed.P95
}

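The hunk above shows only the comment and the scaled return; the surrounding guard is collapsed in this view. As a rough sketch of the allowance logic it describes, with a simplified guard and a hypothetical historicalP99 helper standing in for getClosestPercentilesValues:

```go
package main

import (
	"fmt"
	"time"
)

// historicalP99 is a hypothetical stand-in for the P99 value that
// getClosestPercentilesValues(key) would return from the historical data.
func historicalP99() time.Duration { return 8 * time.Minute }

// failAfter sketches the shape of etcdRevisionChangeAllowance.FailAfter: when
// etcd rolled out new revisions during the run, allow 15 minutes per revision
// (up to 3 elections per rollout on a 3-master cluster); otherwise fall back
// to the historical P99.
func failAfter(revisionsDuringTest int) time.Duration {
	if revisionsDuringTest > 0 { // simplified guard; the real check compares against the pre-suite revision count
		return time.Duration(revisionsDuringTest) * 15 * time.Minute
	}
	return historicalP99()
}

func main() {
	fmt.Println(failAfter(3)) // 45m0s: three revisions rolled out during the test
	fmt.Println(failAfter(0)) // 8m0s: no revision churn, use the historical P99
}
```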
12 changes: 10 additions & 2 deletions pkg/synthetictests/allowedalerts/basic_alert.go
@@ -7,6 +7,7 @@ import (
"strings"
"time"

"github.com/openshift/origin/pkg/synthetictests/historicaldata"
corev1 "k8s.io/api/core/v1"

"github.com/openshift/origin/pkg/monitor/monitorapi"
@@ -202,11 +203,18 @@ func (a *basicAlertTest) failOrFlake(ctx context.Context, restConfig *rest.Confi
// TODO for namespaced alerts, we need to query the data on a per-namespace basis.
// For the ones we're starting with, they tend to fail one at a time, so this will hopefully not be an awful starting point until we get there.

failAfter, err := a.allowanceCalculator.FailAfter(a.alertName, *jobType)
dataKey := historicaldata.AlertDataKey{
AlertName: a.alertName,
AlertLevel: string(a.alertState),
AlertNamespace: a.namespace,
JobType: *jobType,
}

failAfter, err := a.allowanceCalculator.FailAfter(dataKey)
if err != nil {
return fail, fmt.Sprintf("unable to calculate allowance for %s which was at %s, err %v\n\n%s", a.AlertName(), a.AlertState(), err, strings.Join(describe, "\n"))
}
flakeAfter := a.allowanceCalculator.FlakeAfter(a.alertName, *jobType)
flakeAfter := a.allowanceCalculator.FlakeAfter(dataKey)

switch {
case durationAtOrAboveLevel > failAfter:
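The tail of failOrFlake is collapsed above, but the switch that follows compares durationAtOrAboveLevel against the two allowances. A minimal sketch of that decision with local stand-in types (the real code returns monitorapi-level test results):

```go
package main

import (
	"fmt"
	"time"
)

type outcome string

const (
	pass  outcome = "pass"
	flake outcome = "flake"
	fail  outcome = "fail"
)

// classify mirrors the switch at the end of failOrFlake: exceeding the
// FailAfter allowance fails the alert test, exceeding only FlakeAfter flakes
// it, and anything shorter passes.
func classify(durationAtOrAboveLevel, failAfter, flakeAfter time.Duration) outcome {
	switch {
	case durationAtOrAboveLevel > failAfter:
		return fail
	case durationAtOrAboveLevel > flakeAfter:
		return flake
	default:
		return pass
	}
}

func main() {
	// Matches the example in the AlertTestAllowanceCalculator comment below:
	// pending for 4s with FailAfter=6s and FlakeAfter=2s is a flake.
	fmt.Println(classify(4*time.Second, 6*time.Second, 2*time.Second))
}
```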
24 changes: 11 additions & 13 deletions pkg/synthetictests/allowedalerts/matches.go
@@ -4,8 +4,6 @@ import (
"time"

"github.com/openshift/origin/pkg/synthetictests/historicaldata"

"github.com/openshift/origin/pkg/synthetictests/platformidentification"
)

type neverFailAllowance struct {
@@ -18,41 +16,41 @@ func neverFail(flakeDelegate AlertTestAllowanceCalculator) AlertTestAllowanceCal
}
}

func (d *neverFailAllowance) FailAfter(alertName string, jobType platformidentification.JobType) (time.Duration, error) {
func (d *neverFailAllowance) FailAfter(key historicaldata.AlertDataKey) (time.Duration, error) {
return 24 * time.Hour, nil
}

func (d *neverFailAllowance) FlakeAfter(alertName string, jobType platformidentification.JobType) time.Duration {
return d.flakeDelegate.FlakeAfter(alertName, jobType)
func (d *neverFailAllowance) FlakeAfter(key historicaldata.AlertDataKey) time.Duration {
return d.flakeDelegate.FlakeAfter(key)
}

// AlertTestAllowanceCalculator provides the duration after which an alert test should flake and fail.
// For instance, if the alert test is checking pending, and the alert is pending for 4s and the FailAfter
// returns 6s and the FlakeAfter returns 2s, then the test will flake.
type AlertTestAllowanceCalculator interface {
// FailAfter returns a duration an alert can be at or above the required state before failing.
FailAfter(alertName string, jobType platformidentification.JobType) (time.Duration, error)
FailAfter(key historicaldata.AlertDataKey) (time.Duration, error)
// FlakeAfter returns a duration an alert can be at or above the required state before flaking.
FlakeAfter(alertName string, jobType platformidentification.JobType) time.Duration
FlakeAfter(key historicaldata.AlertDataKey) time.Duration
}

type percentileAllowances struct {
}

var defaultAllowances = &percentileAllowances{}

func (d *percentileAllowances) FailAfter(alertName string, jobType platformidentification.JobType) (time.Duration, error) {
allowed, _, _ := getClosestPercentilesValues(alertName, jobType)
func (d *percentileAllowances) FailAfter(key historicaldata.AlertDataKey) (time.Duration, error) {
allowed, _, _ := getClosestPercentilesValues(key)
return allowed.P99, nil
}

func (d *percentileAllowances) FlakeAfter(alertName string, jobType platformidentification.JobType) time.Duration {
allowed, _, _ := getClosestPercentilesValues(alertName, jobType)
func (d *percentileAllowances) FlakeAfter(key historicaldata.AlertDataKey) time.Duration {
allowed, _, _ := getClosestPercentilesValues(key)
return allowed.P95
}

// getClosestPercentilesValues uses the backend and information about the cluster to choose the best historical p99 to operate against.
// We enforce "don't get worse" for disruption by watching the aggregate data in CI over many runs.
func getClosestPercentilesValues(alertName string, jobType platformidentification.JobType) (historicaldata.StatisticalDuration, string, error) {
return getCurrentResults().BestMatchDuration(alertName, jobType)
func getClosestPercentilesValues(key historicaldata.AlertDataKey) (historicaldata.StatisticalDuration, string, error) {
return getCurrentResults().BestMatchDuration(key)
}
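neverFail wraps another calculator so an alert can only ever flake: FailAfter is pinned to 24 hours while FlakeAfter defers to the delegate's percentile data. A small sketch of that wrapper pattern using local stand-ins keyed by alert name (the real interface takes a historicaldata.AlertDataKey):

```go
package main

import (
	"fmt"
	"time"
)

// calculator is a local stand-in for AlertTestAllowanceCalculator, keyed by
// alert name only to keep the sketch self-contained.
type calculator interface {
	FailAfter(alertName string) (time.Duration, error)
	FlakeAfter(alertName string) time.Duration
}

// percentiles pretends to serve P99/P95 durations from historical data,
// using the gcp values from the test data above.
type percentiles struct{}

func (percentiles) FailAfter(string) (time.Duration, error) { return 7900 * time.Millisecond, nil }
func (percentiles) FlakeAfter(string) time.Duration         { return 4800 * time.Millisecond }

// neverFail keeps the flake threshold from its delegate but makes failure
// practically unreachable, mirroring the neverFailAllowance in the diff.
type neverFail struct{ delegate calculator }

func (n neverFail) FailAfter(string) (time.Duration, error) { return 24 * time.Hour, nil }
func (n neverFail) FlakeAfter(alert string) time.Duration   { return n.delegate.FlakeAfter(alert) }

func main() {
	var c calculator = neverFail{delegate: percentiles{}}
	failAfter, _ := c.FailAfter("etcdGRPCRequestsSlow")
	fmt.Println(failAfter, c.FlakeAfter("etcdGRPCRequestsSlow")) // 24h0m0s 4.8s
}
```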
183 changes: 141 additions & 42 deletions pkg/synthetictests/allowedalerts/matches_test.go
@@ -10,7 +10,7 @@ import (
"github.com/openshift/origin/pkg/synthetictests/platformidentification"
)

func TestGetClosestP95Value(t *testing.T) {
func TestGetClosestP99Value(t *testing.T) {

mustDuration := func(durationString string) *time.Duration {
ret, err := time.ParseDuration(durationString)
@@ -20,10 +20,12 @@ func TestGetClosestP95Value(t *testing.T) {
return &ret
}

historicalData := []historicaldata.StatisticalData{
historicalData := []historicaldata.AlertStatisticalData{
{
DataKey: historicaldata.DataKey{
Name: "etcdGRPCRequestsSlow",
AlertDataKey: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
@@ -33,12 +35,15 @@ func TestGetClosestP95Value(t *testing.T) {
Topology: "ha",
},
},
P95: 4.8,
P99: 7.9,
P95: 4.8,
P99: 7.9,
JobRuns: 1000,
},
{
DataKey: historicaldata.DataKey{
Name: "etcdGRPCRequestsSlow",
AlertDataKey: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
@@ -48,69 +53,163 @@ func TestGetClosestP95Value(t *testing.T) {
Topology: "ha",
},
},
P95: 50.827,
P99: 120.458,
P95: 50.827,
P99: 120.458,
JobRuns: 1000,
},
{
AlertDataKey: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.13",
FromRelease: "4.12",
Platform: "azure",
Architecture: "amd64",
Network: "sdn",
Topology: "ha",
},
},
P95: 20.100,
P99: 24.938,
JobRuns: 10, // should get ignored, not min 100 results
},
}

// Convert our slice of statistical data to a map keyed on the data key to match what the matcher needs.
// This allows us to define our test data without duplicating the DataKey struct.
historicalDataMap := map[historicaldata.DataKey]historicaldata.StatisticalData{}
historicalDataMap := map[historicaldata.AlertDataKey]historicaldata.AlertStatisticalData{}
for _, hd := range historicalData {
historicalDataMap[hd.DataKey] = hd
historicalDataMap[hd.AlertDataKey] = hd
}

tests := []struct {
name string
alertName string
jobType platformidentification.JobType
key historicaldata.AlertDataKey
expectedDuration *time.Duration
}{
{
name: "direct match",
alertName: "etcdGRPCRequestsSlow",
jobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
Platform: "gcp",
Architecture: "amd64",
Network: "sdn",
Topology: "ha",
name: "direct match",
key: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
Platform: "gcp",
Architecture: "amd64",
Network: "sdn",
Topology: "ha",
},
},
expectedDuration: mustDuration("7.9s"),
},
{
name: "choose different arch",
alertName: "etcdGRPCRequestsSlow",
jobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
Platform: "aws",
Architecture: "not-real",
Network: "sdn",
Topology: "ha",
name: "choose different arch",
key: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
Platform: "aws",
Architecture: "not-real",
Network: "sdn",
Topology: "ha",
},
},
expectedDuration: mustDuration("120.458s"),
},
{
name: "missing",
alertName: "notARealAlert",
jobType: platformidentification.JobType{
Release: "4.10",
FromRelease: "4.10",
Platform: "azure",
Architecture: "amd64",
Topology: "missing",
name: "missing",
key: historicaldata.AlertDataKey{
AlertName: "notARealAlert",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.10",
FromRelease: "4.10",
Platform: "azure",
Architecture: "amd64",
Topology: "missing",
},
},
expectedDuration: nil,
},
{
name: "skip if insufficient data",
key: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.13",
FromRelease: "4.12",
Platform: "azure",
Network: "sdn",
Architecture: "amd64",
Topology: "ha",
},
},
expectedDuration: nil,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
matcher := historicaldata.NewMatcherWithHistoricalData(historicalDataMap, 3.141)
actualDuration, _, actualErr := matcher.BestMatchP99(tt.alertName, tt.jobType)
matcher := historicaldata.NewAlertMatcherWithHistoricalData(historicalDataMap)
actualDuration, _, actualErr := matcher.BestMatchP99(tt.key)
assert.Nil(t, actualErr)
assert.EqualValues(t, tt.expectedDuration, actualDuration, "unexpected duration")
})
}
}

// TestAlertDataFileParsing uses the actual query_results.json data file we populate weekly
// from bigquery and commit into origin. Test ensures we can parse it and the data looks sane.
func TestAlertDataFileParsing(t *testing.T) {

alertMatcher := getCurrentResults()

// The list of known alerts that goes into this file is composed of everything we've ever
// seen fire in that release. As such it can change from one release to the next as alerts
// are added or removed, or just don't happen to fire.
//
// To test here we're really just looking for *something* to indicate we have valid data.

var dataOver100Runs int
var foundAWSOVN bool
var foundAzureOVN bool
var foundGCPOVN bool
var foundMetalOVN bool

for _, v := range alertMatcher.HistoricalData {
if v.JobRuns > 100 {
dataOver100Runs++
}

if v.Platform == "aws" && v.Network == "ovn" && v.Architecture == "amd64" {
foundAWSOVN = true
}
if v.Platform == "azure" && v.Network == "ovn" && v.Architecture == "amd64" {
foundAzureOVN = true
}
if v.Platform == "gcp" && v.Network == "ovn" && v.Architecture == "amd64" {
foundGCPOVN = true
}
if v.Platform == "metal" && v.Network == "ovn" && v.Architecture == "amd64" {
foundMetalOVN = true
}

}

assert.Greater(t, dataOver100Runs, 5,
"expected at least 5 entries in query_results.json to have over 100 runs")
assert.True(t, foundAWSOVN, "no aws ovn job data in query_results.json")
assert.True(t, foundGCPOVN, "no gcp ovn job data in query_results.json")
assert.True(t, foundAzureOVN, "no azure ovn job data in query_results.json")
assert.True(t, foundMetalOVN, "no metal ovn job data in query_results.json")

}
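The new test data carries a JobRuns count, and the 10-run azure entry above is expected to be ignored. The exact threshold lives in the matcher rather than in this diff, but assuming a minimum of 100 runs (as the test comments suggest), filtering would look roughly like:

```go
package main

import "fmt"

// entry is a local stand-in for historicaldata.AlertStatisticalData, reduced
// to the fields this sketch needs.
type entry struct {
	AlertName string
	Platform  string
	JobRuns   int
	P95, P99  float64
}

// usable drops entries backed by too few job runs; the 100-run minimum is an
// assumption taken from the test comments, not a value shown in this diff.
func usable(data []entry, minRuns int) []entry {
	var out []entry
	for _, e := range data {
		if e.JobRuns >= minRuns {
			out = append(out, e)
		}
	}
	return out
}

func main() {
	data := []entry{
		{AlertName: "etcdGRPCRequestsSlow", Platform: "gcp", JobRuns: 1000, P95: 4.8, P99: 7.9},
		{AlertName: "etcdGRPCRequestsSlow", Platform: "azure", JobRuns: 10, P95: 20.1, P99: 24.938},
	}
	fmt.Println(len(usable(data, 100))) // 1: the 10-run azure entry is skipped
}
```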
