Skip to content

Commit

Permalink
Merge pull request #28421 from eggfoobar/sno-serial-resiliency-ready
Browse files Browse the repository at this point in the history
OCPBUGS-22438: feat: add extra check for ready state on resiliency test
  • Loading branch information
openshift-merge-bot[bot] committed Dec 20, 2023
2 parents 30a1d82 + f56c6e6 commit 36eceee
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,9 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
topologyAwareMatcher := newTopologyAwareHintsDisabledDuringTaintTestsPathologicalEventMatcher(finalIntervals)
registry.AddPathologicalEventMatcherOrDie(topologyAwareMatcher)

singleNodeConnectionRefusedMatcher := newSingleNodeConnectionRefusedEventMatcher(finalIntervals)
registry.AddPathologicalEventMatcherOrDie(singleNodeConnectionRefusedMatcher)

return registry
}

Expand Down Expand Up @@ -755,6 +758,7 @@ func newDuplicatedEventsAllowedWhenEtcdRevisionChange(ctx context.Context, clien
}
currentRevision, err := getBiggestRevisionForEtcdOperator(ctx, operatorClient)
if err != nil {
return nil, err
}
repeatThresholdOverride := currentRevision * (60 / 5)
logrus.WithFields(logrus.Fields{
Expand Down Expand Up @@ -828,7 +832,7 @@ func (ade *OverlapOtherIntervalsPathologicalEventMatcher) Allows(i monitorapi.In
// Match the pathological event if it falls strictly within any of the given intervals (the interval starts before the event and ends after it).
for _, nui := range ade.allowIfWithinIntervals {
if nui.From.Before(i.From) && nui.To.After(i.To) {
logrus.Infof("%s was found to overlap with %s, ignoring pathological event as we expect these during master updates", i, nui)
logrus.Infof("%s was found to overlap with %s, ignoring pathological event as they fall within range of specified intervals", i, nui)
return true
}
}
Expand Down Expand Up @@ -909,3 +913,29 @@ func newTopologyAwareHintsDisabledDuringTaintTestsPathologicalEventMatcher(final
allowIfWithinIntervals: adjustedTaintTestIntervals,
}
}

// newSingleNodeConnectionRefusedEventMatcher builds a matcher for
// "connection refused" pathological events on single-replica topology.
//
// It collects the TargetDown alert intervals reported for the OpenShift API
// server and OAuth API server namespaces from finalIntervals and hands them to
// an OverlapOtherIntervalsPathologicalEventMatcher as the intervals within
// which a matching event is allowed.
func newSingleNodeConnectionRefusedEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher {
	const (
		ocpAPINamespace      = "openshift-apiserver"
		ocpOAuthAPINamespace = "openshift-oauth-apiserver"
	)

	// isAPIServerTargetDown reports whether an interval is a TargetDown alert
	// raised in one of the two API server namespaces.
	isAPIServerTargetDown := func(candidate monitorapi.Interval) bool {
		if candidate.Source != monitorapi.SourceAlert {
			return false
		}
		if candidate.StructuredLocator.Keys[monitorapi.LocatorAlertKey] != "TargetDown" {
			return false
		}
		switch candidate.StructuredLocator.Keys[monitorapi.LocatorNamespaceKey] {
		case ocpAPINamespace, ocpOAuthAPINamespace:
			return true
		default:
			return false
		}
	}

	targetDownIntervals := finalIntervals.Filter(isAPIServerTargetDown)
	if count := len(targetDownIntervals); count > 0 {
		logrus.Infof("found %d OCP APIServer TargetDown intervals", count)
	}

	// The delegate keeps a pointer to the topology, so it needs an addressable copy.
	singleReplica := v1.SingleReplicaTopologyMode
	connectionRefused := &SimplePathologicalEventMatcher{
		name:              "ConnectionErrorDuringSingleNodeAPIServerTargetDown",
		messageHumanRegex: regexp.MustCompile(`dial tcp .* connect: connection refused`),
		topology:          &singleReplica,
	}

	return &OverlapOtherIntervalsPathologicalEventMatcher{
		delegate:               connectionRefused,
		allowIfWithinIntervals: targetDownIntervals,
	}
}
12 changes: 8 additions & 4 deletions test/extended/apiserver/resiliency.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ var _ = ginkgo.Describe("[Conformance][sig-sno][Serial] Cluster", func() {
restClient, err := rest.RESTClientFor(config)
framework.ExpectNoError(err)

req, err := http.NewRequest(http.MethodGet, config.Host+"/readyz", nil)
framework.ExpectNoError(err)
req.Header.Set("X-OpenShift-Internal-If-Not-Ready", "reject")

httpClient := restClient.Client

ginkgo.By("Making sure no previous rollout is in progress")
Expand All @@ -69,7 +73,7 @@ var _ = ginkgo.Describe("[Conformance][sig-sno][Serial] Cluster", func() {
// We are taking the API down. This can often take more than a minute, so we allow a reasonably generous timeout.
ginkgo.By("Expecting API to become unavailable")
err = wait.PollImmediate(time.Second, 5*time.Minute, func() (bool, error) {
ready := isApiReady(config, httpClient)
ready := isApiReady(config, httpClient, req)
return !ready, nil
})

Expand All @@ -79,7 +83,7 @@ var _ = ginkgo.Describe("[Conformance][sig-sno][Serial] Cluster", func() {

ginkgo.By("Expecting API to become ready")
err = wait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
ready := isApiReady(config, httpClient)
ready := isApiReady(config, httpClient, req)
return ready, nil
})

Expand Down Expand Up @@ -134,8 +138,8 @@ func forceApiRollout(oc *exutil.CLI) {
gomega.Expect(err).NotTo(gomega.HaveOccurred())
}

func isApiReady(clusterConfig *rest.Config, httpClient *http.Client) (ready bool) {
resp, err := httpClient.Get(clusterConfig.Host + "/readyz")
func isApiReady(clusterConfig *rest.Config, httpClient *http.Client, req *http.Request) (ready bool) {
resp, err := httpClient.Do(req)
if resp != nil && resp.Body != nil {
defer resp.Body.Close()
}
Expand Down
14 changes: 13 additions & 1 deletion test/extended/util/oc_copy.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ import (
"context"
"net/url"
"strings"
"time"

userv1 "github.com/openshift/api/user/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/third_party/forked/golang/netutil"
restclient "k8s.io/client-go/rest"
clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
Expand Down Expand Up @@ -48,7 +51,16 @@ func getUserPartOfNickname(clientCfg *restclient.Config) (string, error) {
if err != nil {
return "", err
}
userInfo, err := userClient.Users().Get(context.Background(), "~", metav1.GetOptions{})

var userInfo *userv1.User
err = wait.PollUntilContextTimeout(context.Background(), time.Second, time.Minute, true, func(ctx context.Context) (done bool, err error) {
userInfo, err = userClient.Users().Get(ctx, "~", metav1.GetOptions{})
if err != nil && strings.Contains(err.Error(), "connect: connection refused") {
return false, nil
}
return true, err
})

if kerrors.IsNotFound(err) || kerrors.IsForbidden(err) {
// if we're talking to kube (or likely talking to kube), take a best guess consistent with login
switch {
Expand Down

0 comments on commit 36eceee

Please sign in to comment.