Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DO NOT MERGE: PRPQR test for excepted_failures #28809

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
cf4261a
add exception for vsphere due to single replica
DennisPeriquet Apr 22, 2024
6f2b9ef
See how many jobs fail with Degraded=True and Available=False
DennisPeriquet Apr 23, 2024
200a800
If Available=False outside of upgrade window, we fail
DennisPeriquet Apr 25, 2024
d16be54
Allow transitions during upgrade for cluster-storage-operator for vsp…
DennisPeriquet Apr 26, 2024
6e95796
we will work on TRT-1575 separately
DennisPeriquet Apr 26, 2024
cba84df
handle single replica operators during upgrade window
DennisPeriquet Apr 26, 2024
b158c9e
for cases besides single replica, attempt to match exceptions
DennisPeriquet Apr 29, 2024
beeb5d6
Rearrange since all exceptions refer to Available=False
DennisPeriquet Apr 30, 2024
efde445
Honor Available=False exceptions only if not in upgrade window
DennisPeriquet May 1, 2024
89a3143
make unit tests for the isInUpgradeWindow fix bugs found
DennisPeriquet May 2, 2024
b8aec3c
refactor to make unit test smaller, readable, extendable
DennisPeriquet May 3, 2024
676bb08
gofmt
DennisPeriquet May 5, 2024
cc302fd
review comments 1
DennisPeriquet May 8, 2024
a15769e
inline the testcases with the test
DennisPeriquet May 8, 2024
5ff612f
fixup to make more readable
DennisPeriquet May 8, 2024
dcaf8e1
add Jira for image-registry single replica exception
DennisPeriquet May 8, 2024
fe7305c
Revise upgrade window logic to catch all cases; update unit tests
DennisPeriquet May 8, 2024
013df0b
add exception for single node with Available=False
DennisPeriquet May 14, 2024
9fd1417
add SNO exception in both stable and upgrade modes
DennisPeriquet May 8, 2024
084c946
bring in latest from excepted_failures1
DennisPeriquet May 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions pkg/monitor/monitorapi/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,12 @@ const (
FailedToDeleteCGroupsPath IntervalReason = "FailedToDeleteCGroupsPath"
FailedToAuthenticateWithOpenShiftUser IntervalReason = "FailedToAuthenticateWithOpenShiftUser"
FailedContactingAPIReason IntervalReason = "FailedContactingAPI"

UpgradeStartedReason IntervalReason = "UpgradeStarted"
UpgradeVersionReason IntervalReason = "UpgradeVersion"
UpgradeRollbackReason IntervalReason = "UpgradeRollback"
UpgradeFailedReason IntervalReason = "UpgradeFailed"
UpgradeCompleteReason IntervalReason = "UpgradeComplete"
)

type AnnotationKey string
Expand Down Expand Up @@ -302,6 +308,10 @@ type Interval struct {
To time.Time
}

// String returns the reason as a plain string, which lets an IntervalReason
// be used wherever a fmt.Stringer is expected.
func (r IntervalReason) String() string {
	text := string(r)
	return text
}

func (i Interval) String() string {
if i.From.Equal(i.To) {
return fmt.Sprintf("%s.%03d %s %s %s",
Expand Down
4 changes: 2 additions & 2 deletions pkg/monitortestlibrary/platformidentification/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ func DidUpgradeHappenDuringCollection(intervals monitorapi.Intervals, beginning,
if event.Source != monitorapi.SourceKubeEvent || event.Locator.Keys[monitorapi.LocatorClusterVersionKey] != "cluster" {
continue
}
reason := string(event.Message.Reason)
if reason == "UpgradeStarted" || reason == "UpgradeRollback" {
reason := event.Message.Reason
if reason == monitorapi.UpgradeStartedReason || reason == monitorapi.UpgradeRollbackReason {
return true
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
if isUpgrade {
junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
} else {
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals)...)
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
}

return junits, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,24 @@ import (
"k8s.io/client-go/kubernetes"

"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
"github.com/sirupsen/logrus"

configv1 "github.com/openshift/api/config/v1"
clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned"
"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
platformidentification2 "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
exutil "github.com/openshift/origin/test/extended/util"
"k8s.io/client-go/rest"
)

// exceptionCallback consumes a suspicious condition and returns an
// exception string if it does not think the condition should be fatal.
type exceptionCallback func(operator string, condition *configv1.ClusterOperatorStatusCondition, clientConfig *rest.Config) (string, error)
type exceptionCallback func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) (string, error)

func testStableSystemOperatorStateTransitions(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
except := func(_ string, condition *configv1.ClusterOperatorStatusCondition, _ *rest.Config) (string, error) {
func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
except := func(_ string, condition *configv1.ClusterOperatorStatusCondition, _ monitorapi.Interval, clientConfig *rest.Config) (string, error) {
if condition.Status == configv1.ConditionTrue {
if condition.Type == configv1.OperatorAvailable {
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
Expand All @@ -35,14 +38,118 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals) []*ju
}
}

return "We are not worried about Available=False or Degraded=True blips for stable-system tests yet.", nil
isSingleNode, err := isSingleNodeCheck(clientConfig)
if err != nil {
logrus.Warnf("Error checking for Single Node configuration on stable system (unable to make exception): %v", err)
isSingleNode = false
}

// For the non-upgrade case, if any operator has Available=False, fail the test.
if condition.Type == configv1.OperatorAvailable {

// We'll add an exception for single node for now.
if condition.Status == configv1.ConditionFalse && !isSingleNode {
return "", nil
}
}
return "We are not worried about Degraded=True blips for stable-system tests yet.", nil
}

return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig)
}

// isSingleNodeCheck builds an OpenShift config client from clientConfig and
// asks exutil.IsSingleNode whether the cluster is a single-node deployment.
//
// It returns (false, err) when clientConfig is nil or the config client cannot
// be created; otherwise it returns whatever exutil.IsSingleNode reports.
// Errors are returned with context instead of being logged here, because the
// callers already log the failure and fall back to "not single node".
func isSingleNodeCheck(clientConfig *rest.Config) (bool, error) {
	// Guard against a missing config so it surfaces as an ordinary error
	// rather than being handed to client construction.
	if clientConfig == nil {
		return false, fmt.Errorf("checking for single node configuration: client config is nil")
	}

	configClient, err := clientconfigv1.NewForConfig(clientConfig)
	if err != nil {
		return false, fmt.Errorf("creating config client to check for single node configuration: %w", err)
	}

	return exutil.IsSingleNode(context.Background(), configClient)
}

// isInUpgradeWindow determines if the given eventInterval falls within an upgrade window.
//
// Only kube events whose locator identifies the "cluster" ClusterVersion are
// considered when building the windows:
//   - UpgradeStarted and UpgradeRollback events start a new upgrade window and
//     also end an already started window that is still open.
//   - UpgradeComplete and UpgradeFailed events end the open upgrade window; if
//     no window was ever started, the event is logged and ignored.
//   - If no ending point is found for a window, the window is treated as
//     lasting until the end of the test.
//
// eventInterval is inside a window when it begins strictly after the window's
// starting event began and ends strictly before the window's ending event
// ended (or the window has no ending event).
func isInUpgradeWindow(eventList monitorapi.Intervals, eventInterval monitorapi.Interval) bool {
	type upgradeWindowHolder struct {
		startInterval *monitorapi.Interval
		endInterval   *monitorapi.Interval
	}

	// boundaryFrom keeps only the reason and timestamps of an upgrade event,
	// which is all that is needed to mark the edge of a window.
	boundaryFrom := func(event monitorapi.Interval) *monitorapi.Interval {
		return &monitorapi.Interval{
			Condition: monitorapi.Condition{
				Message: monitorapi.Message{
					Reason: event.Message.Reason,
				},
			},
			From: event.From,
			To:   event.To,
		}
	}

	var upgradeWindows []*upgradeWindowHolder
	var currentWindow *upgradeWindowHolder

	for _, event := range eventList {
		if event.Source != monitorapi.SourceKubeEvent || event.Locator.Keys[monitorapi.LocatorClusterVersionKey] != "cluster" {
			continue
		}

		switch event.Message.Reason {
		case monitorapi.UpgradeStartedReason, monitorapi.UpgradeRollbackReason:
			// Close the current window only if it is still open. A window that
			// was already ended by UpgradeComplete/UpgradeFailed must keep its
			// ending point; otherwise the gap between two upgrades would be
			// counted as part of the first upgrade.
			if currentWindow != nil && currentWindow.endInterval == nil {
				currentWindow.endInterval = boundaryFrom(event)
			}

			// Start new window
			currentWindow = &upgradeWindowHolder{
				startInterval: boundaryFrom(event),
			}
			upgradeWindows = append(upgradeWindows, currentWindow)
		case monitorapi.UpgradeCompleteReason, monitorapi.UpgradeFailedReason:
			if currentWindow == nil {
				// We have no current window which means that the events indicate we completed
				// or failed an upgrade without starting one. This is a strange situation that
				// we should not see; in this case, there is no upgrade window to check against.
				logrus.Warnf("Found upgrade completion or failed event without a start or rollback event: %v", event)
				continue
			}
			if currentWindow.endInterval == nil {
				// End current window
				currentWindow.endInterval = boundaryFrom(event)
			}
		}
	}

	for _, upgradeWindow := range upgradeWindows {
		if !eventInterval.From.After(upgradeWindow.startInterval.From) {
			continue
		}
		if upgradeWindow.endInterval == nil || eventInterval.To.Before(upgradeWindow.endInterval.To) {
			return true
		}
	}

	return false
}

func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, clientConfig *rest.Config) (string, error) {
except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) (string, error) {
if condition.Status == configv1.ConditionTrue {
if condition.Type == configv1.OperatorAvailable {
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
Expand All @@ -57,6 +164,22 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
return "We are not worried about Degraded=True blips for update tests yet.", nil
}

var availableEqualsFalseAllowed bool
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
availableEqualsFalseAllowed = isInUpgradeWindow(events, eventInterval) && eventInterval.To.Sub(eventInterval.From) < 10*time.Minute
}

isSingleNode, err := isSingleNodeCheck(clientConfig)
if err != nil {
logrus.Warnf("Error checking for Single Node configuration on upgrade (unable to make exception): %v", err)
isSingleNode = false
}

// We'll add an exception for single node for now.
if !availableEqualsFalseAllowed && !isSingleNode {
return "", nil
}

switch operator {
case "authentication":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && (condition.Reason == "APIServices_Error" || condition.Reason == "APIServerDeployment_NoDeployment" || condition.Reason == "APIServerDeployment_NoPod" || condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" || condition.Reason == "APIServices_PreconditionNotReady" || condition.Reason == "OAuthServerDeployment_NoDeployment" || condition.Reason == "OAuthServerRouteEndpointAccessibleController_EndpointUnavailable" || condition.Reason == "OAuthServerServiceEndpointAccessibleController_EndpointUnavailable" || condition.Reason == "WellKnown_NotReady") {
Expand Down Expand Up @@ -99,7 +222,7 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
}
case "image-registry":
if replicaCount, _ := checkReplicas("openshift-image-registry", operator, clientConfig); replicaCount == 1 {
return "image-registry has only single replica", nil
return "https://issues.redhat.com/browse/OCPBUGS-22382", nil
}
}

Expand Down Expand Up @@ -128,7 +251,7 @@ func checkReplicas(namespace string, operator string, clientConfig *rest.Config)
return 0, fmt.Errorf("Error fetching replicas")
}

func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig ...*rest.Config) []*junitapi.JUnitTestCase {
func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
ret := []*junitapi.JUnitTestCase{}

var start, stop time.Time
Expand Down Expand Up @@ -195,11 +318,7 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
if len(concurrentE2E) > 0 {
failure = fmt.Sprintf("%s\n%d tests failed during this blip (%v to %v): %v", failure, len(concurrentE2E), eventInterval.From, eventInterval.From, strings.Join(concurrentE2E, "\n"))
}
var Config *rest.Config
if len(clientConfig) > 0 {
Config = clientConfig[0]
}
exception, err := except(operatorName, condition, Config)
exception, err := except(operatorName, condition, eventInterval, clientConfig)
if err != nil || exception == "" {
fatal = append(fatal, failure)
} else {
Expand Down
Loading