Merge pull request #27710 from dgoodwin/alert-revamp
Alert Testing on new Namespace and Level
openshift-merge-robot committed Feb 9, 2023
2 parents 47e582a + b5db4bb commit c2edff5
Showing 10 changed files with 46,263 additions and 1,444 deletions.
11 changes: 5 additions & 6 deletions pkg/synthetictests/allowedalerts/allow_on_revision_change.go
@@ -8,8 +8,7 @@ import (
"time"

operatorv1client "github.com/openshift/client-go/operator/clientset/versioned/typed/operator/v1"
"github.com/openshift/origin/pkg/synthetictests/platformidentification"

"github.com/openshift/origin/pkg/synthetictests/historicaldata"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)
@@ -32,7 +31,7 @@ func NewAllowedWhenEtcdRevisionChange(ctx context.Context, kubeClient kubernetes
}, nil
}

func (d *etcdRevisionChangeAllowance) FailAfter(alertName string, jobType platformidentification.JobType) (time.Duration, error) {
func (d *etcdRevisionChangeAllowance) FailAfter(key historicaldata.AlertDataKey) (time.Duration, error) {
// if the number of revisions is different compared to what we have collected at the beginning of the test suite
// increase allowed time for the alert
// the rationale is that some tests might roll out a new version of etcd; during each rollout we allow at most 3 elections per revision (we assume there are at most 3 master machines)
@@ -42,12 +41,12 @@ func (d *etcdRevisionChangeAllowance) FailAfter(alertName string, jobType platfo
return time.Duration(d.numberOfRevisionDuringTest) * 15 * time.Minute, nil

}
allowed, _, _ := getClosestPercentilesValues(alertName, jobType)
allowed, _, _ := getClosestPercentilesValues(key)
return allowed.P99, nil
}

func (d *etcdRevisionChangeAllowance) FlakeAfter(alertName string, jobType platformidentification.JobType) time.Duration {
allowed, _, _ := getClosestPercentilesValues(alertName, jobType)
func (d *etcdRevisionChangeAllowance) FlakeAfter(key historicaldata.AlertDataKey) time.Duration {
allowed, _, _ := getClosestPercentilesValues(key)
return allowed.P95
}

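The hunk above shows only the comment and the scaled return; the surrounding guard is collapsed in this view. As a rough sketch of the allowance logic it describes, with a simplified guard and a hypothetical historicalP99 helper standing in for getClosestPercentilesValues:

```go
package main

import (
	"fmt"
	"time"
)

// historicalP99 is a hypothetical stand-in for the P99 value that
// getClosestPercentilesValues(key) would return from the historical data.
func historicalP99() time.Duration { return 8 * time.Minute }

// failAfter sketches the shape of etcdRevisionChangeAllowance.FailAfter: when
// etcd rolled out new revisions during the run, allow 15 minutes per revision
// (up to 3 elections per rollout on a 3-master cluster); otherwise fall back
// to the historical P99.
func failAfter(revisionsDuringTest int) time.Duration {
	if revisionsDuringTest > 0 { // simplified guard; the real check compares against the pre-suite revision count
		return time.Duration(revisionsDuringTest) * 15 * time.Minute
	}
	return historicalP99()
}

func main() {
	fmt.Println(failAfter(3)) // 45m0s: three revisions rolled out during the test
	fmt.Println(failAfter(0)) // 8m0s: no revision churn, use the historical P99
}
```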
12 changes: 10 additions & 2 deletions pkg/synthetictests/allowedalerts/basic_alert.go
@@ -7,6 +7,7 @@ import (
"strings"
"time"

"github.com/openshift/origin/pkg/synthetictests/historicaldata"
corev1 "k8s.io/api/core/v1"

"github.com/openshift/origin/pkg/monitor/monitorapi"
@@ -202,11 +203,18 @@ func (a *basicAlertTest) failOrFlake(ctx context.Context, restConfig *rest.Confi
// TODO for namespaced alerts, we need to query the data on a per-namespace basis.
// For the ones we're starting with, they tend to fail one at a time, so this will hopefully not be an awful starting point until we get there.

failAfter, err := a.allowanceCalculator.FailAfter(a.alertName, *jobType)
dataKey := historicaldata.AlertDataKey{
AlertName: a.alertName,
AlertLevel: string(a.alertState),
AlertNamespace: a.namespace,
JobType: *jobType,
}

failAfter, err := a.allowanceCalculator.FailAfter(dataKey)
if err != nil {
return fail, fmt.Sprintf("unable to calculate allowance for %s which was at %s, err %v\n\n%s", a.AlertName(), a.AlertState(), err, strings.Join(describe, "\n"))
}
flakeAfter := a.allowanceCalculator.FlakeAfter(a.alertName, *jobType)
flakeAfter := a.allowanceCalculator.FlakeAfter(dataKey)

switch {
case durationAtOrAboveLevel > failAfter:
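The tail of failOrFlake is collapsed above, but the switch that follows compares durationAtOrAboveLevel against the two allowances. A minimal sketch of that decision with local stand-in types (the real code returns monitorapi-level test results):

```go
package main

import (
	"fmt"
	"time"
)

type outcome string

const (
	pass  outcome = "pass"
	flake outcome = "flake"
	fail  outcome = "fail"
)

// classify mirrors the switch at the end of failOrFlake: exceeding the
// FailAfter allowance fails the alert test, exceeding only FlakeAfter flakes
// it, and anything shorter passes.
func classify(durationAtOrAboveLevel, failAfter, flakeAfter time.Duration) outcome {
	switch {
	case durationAtOrAboveLevel > failAfter:
		return fail
	case durationAtOrAboveLevel > flakeAfter:
		return flake
	default:
		return pass
	}
}

func main() {
	// Matches the example in the AlertTestAllowanceCalculator comment below:
	// pending for 4s with FailAfter=6s and FlakeAfter=2s is a flake.
	fmt.Println(classify(4*time.Second, 6*time.Second, 2*time.Second))
}
```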
24 changes: 11 additions & 13 deletions pkg/synthetictests/allowedalerts/matches.go
@@ -4,8 +4,6 @@ import (
"time"

"github.com/openshift/origin/pkg/synthetictests/historicaldata"

"github.com/openshift/origin/pkg/synthetictests/platformidentification"
)

type neverFailAllowance struct {
@@ -18,41 +16,41 @@ func neverFail(flakeDelegate AlertTestAllowanceCalculator) AlertTestAllowanceCal
}
}

func (d *neverFailAllowance) FailAfter(alertName string, jobType platformidentification.JobType) (time.Duration, error) {
func (d *neverFailAllowance) FailAfter(key historicaldata.AlertDataKey) (time.Duration, error) {
return 24 * time.Hour, nil
}

func (d *neverFailAllowance) FlakeAfter(alertName string, jobType platformidentification.JobType) time.Duration {
return d.flakeDelegate.FlakeAfter(alertName, jobType)
func (d *neverFailAllowance) FlakeAfter(key historicaldata.AlertDataKey) time.Duration {
return d.flakeDelegate.FlakeAfter(key)
}

// AlertTestAllowanceCalculator provides the duration after which an alert test should flake and fail.
// For instance, if the alert test is checking pending, and the alert is pending for 4s and the FailAfter
// returns 6s and the FlakeAfter returns 2s, then the test will flake.
type AlertTestAllowanceCalculator interface {
// FailAfter returns a duration an alert can be at or above the required state before failing.
FailAfter(alertName string, jobType platformidentification.JobType) (time.Duration, error)
FailAfter(key historicaldata.AlertDataKey) (time.Duration, error)
// FlakeAfter returns a duration an alert can be at or above the required state before flaking.
FlakeAfter(alertName string, jobType platformidentification.JobType) time.Duration
FlakeAfter(key historicaldata.AlertDataKey) time.Duration
}

type percentileAllowances struct {
}

var defaultAllowances = &percentileAllowances{}

func (d *percentileAllowances) FailAfter(alertName string, jobType platformidentification.JobType) (time.Duration, error) {
allowed, _, _ := getClosestPercentilesValues(alertName, jobType)
func (d *percentileAllowances) FailAfter(key historicaldata.AlertDataKey) (time.Duration, error) {
allowed, _, _ := getClosestPercentilesValues(key)
return allowed.P99, nil
}

func (d *percentileAllowances) FlakeAfter(alertName string, jobType platformidentification.JobType) time.Duration {
allowed, _, _ := getClosestPercentilesValues(alertName, jobType)
func (d *percentileAllowances) FlakeAfter(key historicaldata.AlertDataKey) time.Duration {
allowed, _, _ := getClosestPercentilesValues(key)
return allowed.P95
}

// getClosestPercentilesValues uses the backend and information about the cluster to choose the best historical p99 to operate against.
// We enforce "don't get worse" for disruption by watching the aggregate data in CI over many runs.
func getClosestPercentilesValues(alertName string, jobType platformidentification.JobType) (historicaldata.StatisticalDuration, string, error) {
return getCurrentResults().BestMatchDuration(alertName, jobType)
func getClosestPercentilesValues(key historicaldata.AlertDataKey) (historicaldata.StatisticalDuration, string, error) {
return getCurrentResults().BestMatchDuration(key)
}
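neverFail wraps another calculator so an alert can only ever flake: FailAfter is pinned to 24 hours while FlakeAfter defers to the delegate's percentile data. A small sketch of that wrapper pattern using local stand-ins keyed by alert name (the real interface takes a historicaldata.AlertDataKey):

```go
package main

import (
	"fmt"
	"time"
)

// calculator is a local stand-in for AlertTestAllowanceCalculator, keyed by
// alert name only to keep the sketch self-contained.
type calculator interface {
	FailAfter(alertName string) (time.Duration, error)
	FlakeAfter(alertName string) time.Duration
}

// percentiles pretends to serve P99/P95 durations from historical data,
// using the gcp values from the test data above.
type percentiles struct{}

func (percentiles) FailAfter(string) (time.Duration, error) { return 7900 * time.Millisecond, nil }
func (percentiles) FlakeAfter(string) time.Duration         { return 4800 * time.Millisecond }

// neverFail keeps the flake threshold from its delegate but makes failure
// practically unreachable, mirroring the neverFailAllowance in the diff.
type neverFail struct{ delegate calculator }

func (n neverFail) FailAfter(string) (time.Duration, error) { return 24 * time.Hour, nil }
func (n neverFail) FlakeAfter(alert string) time.Duration   { return n.delegate.FlakeAfter(alert) }

func main() {
	var c calculator = neverFail{delegate: percentiles{}}
	failAfter, _ := c.FailAfter("etcdGRPCRequestsSlow")
	fmt.Println(failAfter, c.FlakeAfter("etcdGRPCRequestsSlow")) // 24h0m0s 4.8s
}
```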
183 changes: 141 additions & 42 deletions pkg/synthetictests/allowedalerts/matches_test.go
@@ -10,7 +10,7 @@ import (
"github.com/openshift/origin/pkg/synthetictests/platformidentification"
)

func TestGetClosestP95Value(t *testing.T) {
func TestGetClosestP99Value(t *testing.T) {

mustDuration := func(durationString string) *time.Duration {
ret, err := time.ParseDuration(durationString)
@@ -20,10 +20,12 @@ func TestGetClosestP95Value(t *testing.T) {
return &ret
}

historicalData := []historicaldata.StatisticalData{
historicalData := []historicaldata.AlertStatisticalData{
{
DataKey: historicaldata.DataKey{
Name: "etcdGRPCRequestsSlow",
AlertDataKey: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
@@ -33,12 +35,15 @@ func TestGetClosestP95Value(t *testing.T) {
Topology: "ha",
},
},
P95: 4.8,
P99: 7.9,
P95: 4.8,
P99: 7.9,
JobRuns: 1000,
},
{
DataKey: historicaldata.DataKey{
Name: "etcdGRPCRequestsSlow",
AlertDataKey: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
@@ -48,69 +53,163 @@ func TestGetClosestP95Value(t *testing.T) {
Topology: "ha",
},
},
P95: 50.827,
P99: 120.458,
P95: 50.827,
P99: 120.458,
JobRuns: 1000,
},
{
AlertDataKey: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.13",
FromRelease: "4.12",
Platform: "azure",
Architecture: "amd64",
Network: "sdn",
Topology: "ha",
},
},
P95: 20.100,
P99: 24.938,
JobRuns: 10, // should get ignored, not min 100 results
},
}

// Convert our slice of statistical data to a map keyed on the data key to match what the matcher needs.
// This allows us to define our test data without duplicating the DataKey struct.
historicalDataMap := map[historicaldata.DataKey]historicaldata.StatisticalData{}
historicalDataMap := map[historicaldata.AlertDataKey]historicaldata.AlertStatisticalData{}
for _, hd := range historicalData {
historicalDataMap[hd.DataKey] = hd
historicalDataMap[hd.AlertDataKey] = hd
}

tests := []struct {
name string
alertName string
jobType platformidentification.JobType
key historicaldata.AlertDataKey
expectedDuration *time.Duration
}{
{
name: "direct match",
alertName: "etcdGRPCRequestsSlow",
jobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
Platform: "gcp",
Architecture: "amd64",
Network: "sdn",
Topology: "ha",
name: "direct match",
key: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
Platform: "gcp",
Architecture: "amd64",
Network: "sdn",
Topology: "ha",
},
},
expectedDuration: mustDuration("7.9s"),
},
{
name: "choose different arch",
alertName: "etcdGRPCRequestsSlow",
jobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
Platform: "aws",
Architecture: "not-real",
Network: "sdn",
Topology: "ha",
name: "choose different arch",
key: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.12",
FromRelease: "4.12",
Platform: "aws",
Architecture: "not-real",
Network: "sdn",
Topology: "ha",
},
},
expectedDuration: mustDuration("120.458s"),
},
{
name: "missing",
alertName: "notARealAlert",
jobType: platformidentification.JobType{
Release: "4.10",
FromRelease: "4.10",
Platform: "azure",
Architecture: "amd64",
Topology: "missing",
name: "missing",
key: historicaldata.AlertDataKey{
AlertName: "notARealAlert",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.10",
FromRelease: "4.10",
Platform: "azure",
Architecture: "amd64",
Topology: "missing",
},
},
expectedDuration: nil,
},
{
name: "skip if insufficient data",
key: historicaldata.AlertDataKey{
AlertName: "etcdGRPCRequestsSlow",
AlertNamespace: "",
AlertLevel: "warning",
JobType: platformidentification.JobType{
Release: "4.13",
FromRelease: "4.12",
Platform: "azure",
Network: "sdn",
Architecture: "amd64",
Topology: "ha",
},
},
expectedDuration: nil,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
matcher := historicaldata.NewMatcherWithHistoricalData(historicalDataMap, 3.141)
actualDuration, _, actualErr := matcher.BestMatchP99(tt.alertName, tt.jobType)
matcher := historicaldata.NewAlertMatcherWithHistoricalData(historicalDataMap)
actualDuration, _, actualErr := matcher.BestMatchP99(tt.key)
assert.Nil(t, actualErr)
assert.EqualValues(t, tt.expectedDuration, actualDuration, "unexpected duration")
})
}
}

// TestAlertDataFileParsing uses the actual query_results.json data file we populate weekly
// from bigquery and commit into origin. Test ensures we can parse it and the data looks sane.
func TestAlertDataFileParsing(t *testing.T) {

alertMatcher := getCurrentResults()

// The list of known alerts that goes into this file is composed of everything we've ever
// seen fire in that release. As such it can change from one release to the next as alerts
// are added or removed, or just don't happen to fire.
//
// To test here we're really just looking for *something* to indicate we have valid data.

var dataOver100Runs int
var foundAWSOVN bool
var foundAzureOVN bool
var foundGCPOVN bool
var foundMetalOVN bool

for _, v := range alertMatcher.HistoricalData {
if v.JobRuns > 100 {
dataOver100Runs++
}

if v.Platform == "aws" && v.Network == "ovn" && v.Architecture == "amd64" {
foundAWSOVN = true
}
if v.Platform == "azure" && v.Network == "ovn" && v.Architecture == "amd64" {
foundAzureOVN = true
}
if v.Platform == "gcp" && v.Network == "ovn" && v.Architecture == "amd64" {
foundGCPOVN = true
}
if v.Platform == "metal" && v.Network == "ovn" && v.Architecture == "amd64" {
foundMetalOVN = true
}

}

assert.Greater(t, dataOver100Runs, 5,
"expected at least 5 entries in query_results.json to have over 100 runs")
assert.True(t, foundAWSOVN, "no aws ovn job data in query_results.json")
assert.True(t, foundGCPOVN, "no gcp ovn job data in query_results.json")
assert.True(t, foundAzureOVN, "no azure ovn job data in query_results.json")
assert.True(t, foundMetalOVN, "no metal ovn job data in query_results.json")

}
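The new test data carries a JobRuns count, and the 10-run azure entry above is expected to be ignored. The exact threshold lives in the matcher rather than in this diff, but assuming a minimum of 100 runs (as the test comments suggest), filtering would look roughly like:

```go
package main

import "fmt"

// entry is a local stand-in for historicaldata.AlertStatisticalData, reduced
// to the fields this sketch needs.
type entry struct {
	AlertName string
	Platform  string
	JobRuns   int
	P95, P99  float64
}

// usable drops entries backed by too few job runs; the 100-run minimum is an
// assumption taken from the test comments, not a value shown in this diff.
func usable(data []entry, minRuns int) []entry {
	var out []entry
	for _, e := range data {
		if e.JobRuns >= minRuns {
			out = append(out, e)
		}
	}
	return out
}

func main() {
	data := []entry{
		{AlertName: "etcdGRPCRequestsSlow", Platform: "gcp", JobRuns: 1000, P95: 4.8, P99: 7.9},
		{AlertName: "etcdGRPCRequestsSlow", Platform: "azure", JobRuns: 10, P95: 20.1, P99: 24.938},
	}
	fmt.Println(len(usable(data, 100))) // 1: the 10-run azure entry is skipped
}
```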
