Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1985997: Enable static pod fallback logic for SNO, with disruptive e2e test #1198

Merged
merged 4 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ require (
github.com/openshift/api v0.0.0-20210730095913-85e1d547cdee
github.com/openshift/build-machinery-go v0.0.0-20210712174854-1bb7fd1518d3
github.com/openshift/client-go v0.0.0-20210730113412-1811c1b3fc0e
github.com/openshift/library-go v0.0.0-20210803154958-0e70d0844e00
github.com/openshift/library-go v0.0.0-20210804150119-965974e0af3f
github.com/pkg/profile v1.5.0 // indirect
github.com/prometheus-operator/prometheus-operator/pkg/client v0.45.0
github.com/prometheus/client_golang v1.11.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -477,8 +477,8 @@ github.com/openshift/build-machinery-go v0.0.0-20210712174854-1bb7fd1518d3 h1:hY
github.com/openshift/build-machinery-go v0.0.0-20210712174854-1bb7fd1518d3/go.mod h1:b1BuldmJlbA/xYtdZvKi+7j5YGB44qJUJDZ9zwiNCfE=
github.com/openshift/client-go v0.0.0-20210730113412-1811c1b3fc0e h1:vhwzeXUxLd6JZlWZ+miBzTEpmVctHyHNq9z43ScYxWI=
github.com/openshift/client-go v0.0.0-20210730113412-1811c1b3fc0e/go.mod h1:P1pjphFOgm/nYjmtouHGaSLGtdP25dQICJnYtcYhfEs=
github.com/openshift/library-go v0.0.0-20210803154958-0e70d0844e00 h1:3Izqgf3q6goB/rg8T+J4g2plzAy7No7KN3h4en7V330=
github.com/openshift/library-go v0.0.0-20210803154958-0e70d0844e00/go.mod h1:3GagmGg6gikg+hAqma7E7axBzs2pjx4+GrAbdl4OYdY=
github.com/openshift/library-go v0.0.0-20210804150119-965974e0af3f h1:naAxBRhCtOnTKdW3Cb/XRE7hdC7gPTrZlJvBVqMGXJs=
github.com/openshift/library-go v0.0.0-20210804150119-965974e0af3f/go.mod h1:3GagmGg6gikg+hAqma7E7axBzs2pjx4+GrAbdl4OYdY=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=
Expand Down
32 changes: 14 additions & 18 deletions pkg/operator/startupmonitorreadiness/enablement.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@ import (
"bytes"
"encoding/json"

configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
"github.com/openshift/library-go/pkg/operator/v1helpers"

apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"

configv1 "github.com/openshift/api/config/v1"
configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
"github.com/openshift/library-go/pkg/operator/v1helpers"
)

// IsStartupMonitorEnabledFunction returns a function that determines if the startup monitor should be enabled on a cluster
Expand All @@ -20,16 +21,11 @@ func IsStartupMonitorEnabledFunction(infrastructureLister configlistersv1.Infras
return false, err
}

// TODO: uncomment before releasing 4.9
/*
if infra.Status.ControlPlaneTopology != configv1.SingleReplicaTopologyMode {
return false, nil
}
*/
if infra.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode {
return true, nil
}

// TODO: remove before releasing 4.9
_ = infra
startupMonitorExplicitlyEnabled := false
// for development and debugging
operatorSpec, _, _, err := operatorClient.GetOperatorState()
if err != nil {
return false, err
Expand All @@ -39,12 +35,12 @@ func IsStartupMonitorEnabledFunction(infrastructureLister configlistersv1.Infras
if err := json.NewDecoder(bytes.NewBuffer(operatorSpec.UnsupportedConfigOverrides.Raw)).Decode(&observedUnsupportedConfig); err != nil {
return false, err
}
startupMonitorExplicitlyEnabled, _, _ = unstructured.NestedBool(observedUnsupportedConfig, "startupMonitor")
}
if !startupMonitorExplicitlyEnabled {
return false, nil
enabled, found, err := unstructured.NestedBool(observedUnsupportedConfig, "startupMonitor")
if err == nil && found {
return enabled, nil
}
}
// End of TODO
return true, nil

return false, nil
}
}
36 changes: 36 additions & 0 deletions test/e2e-sno-disruptive/helpers.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package e2e_sno_disruptive

import (
"testing"

"github.com/stretchr/testify/require"

configv1client "github.com/openshift/client-go/config/clientset/versioned"
configv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
operatorv1client "github.com/openshift/client-go/operator/clientset/versioned/typed/operator/v1"
libgotest "github.com/openshift/library-go/test/library"

"k8s.io/client-go/kubernetes"
)

// clientSet bundles the typed API clients the disruptive SNO tests need.
type clientSet struct {
	// Infra reads the cluster Infrastructure resource (used to detect the control-plane topology).
	Infra configv1.InfrastructureInterface
	// Operator reads/updates the kube-apiserver operator resource ("cluster").
	Operator operatorv1client.KubeAPIServerInterface
	// Kube is a plain Kubernetes client, used to list the kube-apiserver static pods.
	Kube kubernetes.Interface
}

// getClients constructs the clientSet used by the disruptive tests from the
// test kubeconfig, failing the test immediately if any client cannot be built.
func getClients(t testing.TB) clientSet {
	t.Helper()

	restConfig, err := libgotest.NewClientConfigForTest()
	require.NoError(t, err)

	operatorClient, err := operatorv1client.NewForConfig(restConfig)
	require.NoError(t, err)

	configClient, err := configv1client.NewForConfig(restConfig)
	require.NoError(t, err)

	return clientSet{
		Infra:    configClient.ConfigV1().Infrastructures(),
		Operator: operatorClient.KubeAPIServers(),
		Kube:     kubernetes.NewForConfigOrDie(restConfig),
	}
}
239 changes: 236 additions & 3 deletions test/e2e-sno-disruptive/sno_disruptive_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,240 @@
package e2e_sno_disruptive

import "testing"
import (
"context"
"encoding/json"
"strconv"
"testing"
"time"

func TestFallback(t *testing.T) {
t.Log("implement me")
"github.com/stretchr/testify/require"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/library-go/pkg/operator/staticpod/startupmonitor/annotations"
"github.com/openshift/library-go/pkg/operator/v1helpers"
commontesthelpers "github.com/openshift/library-go/test/library/encryption"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
)

func TestFallback(tt *testing.T) {
t := commontesthelpers.NewE(tt)
cs := getClients(t)

t.Log("Starting the fallback test")
clusterStateWaitPollTimeout, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs)

// before starting a new test make sure the current state of the cluster is good
ensureClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)

// cause a disruption
cfg := getDefaultUnsupportedConfigForCurrentPlatform(t, cs)
cfg["apiServerArguments"] = map[string][]string{"non-existing-flag": {"true"}}
setUnsupportedConfig(t, cs, cfg)

// validate if the fallback condition is reported and the cluster is stable
waitForFallbackDegradedCondition(t, cs, waitForFallbackDegradedConditionTimeout)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can't we derive waitForFallbackDegradedConditionTimeout from some per-platform number? E.g. oneNodeRolloutTimeout

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added TODO

nodeName, failedRevision := assertFallbackOnNodeStatus(t, cs)
assertKasPodAnnotatedOnNode(t, cs, failedRevision, nodeName)

// clean up
setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
err := waitForClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
require.NoError(t, err)
}

// ensureClusterInGoodState makes sure the cluster is not progressing for the mustBeReadyFor period.
// In addition, in an HA environment it applies getDefaultUnsupportedConfigForCurrentPlatform so that
// the feature is enabled before the test starts.
func ensureClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
	require.NoError(t, waitForClusterInGoodState(t, cs, waitPollTimeout, mustBeReadyFor))
}

// waitForClusterInGoodState polls the kube-apiserver operator resource until every node
// has stopped progressing for at least mustBeReadyFor, or waitPollTimeout elapses.
func waitForClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
	t.Helper()

	start := time.Now()
	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 10s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)

	return wait.Poll(10*time.Second, waitPollTimeout, func() (bool, error) {
		kas, getErr := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
		if getErr != nil {
			t.Log(getErr)
			return false, nil // retry
		}

		for _, ns := range kas.Status.NodeStatuses {
			// a node is progressing when its current revision lags the latest or a rollout is targeted
			nodeProgressing := kas.Status.LatestAvailableRevision != ns.CurrentRevision || ns.TargetRevision > 0
			if nodeProgressing {
				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v", ns.NodeName, kas.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
				return false, nil // retry
			}
		}

		if time.Since(start) <= mustBeReadyFor {
			return false, nil // stable so far, wait a bit more
		}
		t.Logf("The cluster has been in good condition for %s", mustBeReadyFor.String())
		return true, nil // done
	})
}

// setUnsupportedConfig sets the operator's UnsupportedConfigOverrides to the provided cfg,
// retrying on any error (including update conflicts) with the default backoff.
func setUnsupportedConfig(t testing.TB, cs clientSet, cfg map[string]interface{}) {
	t.Helper()

	t.Logf("Setting UnsupportedConfigOverrides to %v", cfg)
	raw, err := json.Marshal(cfg)
	require.NoError(t, err)

	// retry every error: Get/Update failures here are transient by assumption
	alwaysRetry := func(error) bool { return true }
	updateErr := retry.OnError(retry.DefaultRetry, alwaysRetry, func() error {
		kas, getErr := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
		if getErr != nil {
			t.Log(getErr)
			return getErr
		}
		kas.Spec.UnsupportedConfigOverrides.Raw = raw
		if _, upErr := cs.Operator.Update(context.TODO(), kas, metav1.UpdateOptions{}); upErr != nil {
			t.Log(upErr)
			return upErr
		}
		return nil
	})
	require.NoError(t, updateErr)
}

// waitForFallbackDegradedCondition blocks until the StaticPodFallbackRevisionDegraded
// operator condition becomes true, failing the test after waitPollTimeout.
func waitForFallbackDegradedCondition(t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
	t.Helper()

	t.Logf("Waiting for StaticPodFallbackRevisionDegraded condition, interval = 20s, timeout = %v", waitPollTimeout)
	pollErr := wait.Poll(20*time.Second, waitPollTimeout, func() (bool, error) {
		kas, getErr := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
		if getErr != nil {
			t.Logf("unable to get kube-apiserver-operator resource: %v", getErr)
			return false, nil // retry
		}

		if !v1helpers.IsOperatorConditionTrue(kas.Status.Conditions, "StaticPodFallbackRevisionDegraded") {
			t.Logf("StaticPodFallbackRevisionDegraded condition hasn't been set yet")
			return false, nil // retry
		}
		return true, nil // done
	})
	require.NoError(t, pollErr)
}

// assertFallbackOnNodeStatus checks that exactly one NodeStatus reports the fallback
// (LastFallbackCount != 0) and returns that node's name and last failed revision.
// It fails the test when zero or more than one node reports the fallback.
func assertFallbackOnNodeStatus(t testing.TB, cs clientSet) (string, int32) {
	t.Helper()

	t.Log("Checking if a NodeStatus has been updated to report the fallback condition")

	ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
	require.NoError(t, err)

	var nodeName string
	var failedRevision int32
	for _, ns := range ckaso.Status.NodeStatuses {
		if ns.LastFallbackCount != 0 && len(nodeName) > 0 {
			t.Fatalf("multiple node statuses report the fallback, previously on node %v, revision %v, currently on node %v, revision %v", nodeName, failedRevision, ns.NodeName, ns.LastFailedRevision)
		}
		if ns.LastFallbackCount != 0 {
			nodeName = ns.NodeName
			failedRevision = ns.LastFailedRevision
		}
	}

	// previously an empty nodeName was silently returned, deferring the failure to a
	// confusing downstream assertion — fail here with a clear message instead
	if len(nodeName) == 0 {
		t.Fatal("no node status reported the fallback condition")
	}

	t.Logf("The fallback has been reported on node %v, failed revision is %v", nodeName, failedRevision)
	return nodeName, failedRevision
}

// assertKasPodAnnotatedOnNode verifies that the single kube-apiserver static pod on
// nodeName carries the fallback-for-revision annotation with a non-empty value that
// parses to a non-negative integer.
// NOTE(review): expectedFailedRevision is only logged — the parsed annotation value is
// never compared against it; confirm whether an equality check was intended.
func assertKasPodAnnotatedOnNode(t testing.TB, cs clientSet, expectedFailedRevision int32, nodeName string) {
	t.Helper()
	t.Logf("Verifying if a kube-apiserver pod has been annotated with revision: %v on node: %v", expectedFailedRevision, nodeName)

	// the kube-apiserver static pods carry the apiserver=true label in this namespace
	kasPods, err := cs.Kube.CoreV1().Pods("openshift-kube-apiserver").List(context.TODO(), metav1.ListOptions{LabelSelector: "apiserver=true"})
	require.NoError(t, err)

	var kasPod corev1.Pod
	filteredKasPods := filterByNodeName(kasPods.Items, nodeName)
	switch len(filteredKasPods) {
	case 0:
		t.Fatalf("expected to find the kube-apiserver static pod on node %s but haven't found any", nodeName)
	case 1:
		kasPod = filteredKasPods[0]
	default:
		// this should never happen for static pod as they are uniquely named for each node
		podsOnCurrentNode := []string{}
		for _, filteredKasPod := range filteredKasPods {
			podsOnCurrentNode = append(podsOnCurrentNode, filteredKasPod.Name)
		}
		t.Fatalf("multiple kube-apiserver static pods for node %s found: %v", nodeName, podsOnCurrentNode)
	}

	if fallbackFor, ok := kasPod.Annotations[annotations.FallbackForRevision]; ok {
		if len(fallbackFor) == 0 {
			t.Fatalf("empty fallback revision label: %v on %s pod", annotations.FallbackForRevision, kasPod.Name)
		}
		// the annotation value must be a non-negative integer revision
		revision, err := strconv.Atoi(fallbackFor)
		if err != nil || revision < 0 {
			t.Fatalf("invalid fallback revision: %v on pod: %s", fallbackFor, kasPod.Name)
		}
		return
	}

	t.Fatalf("kube-apiserver %v on node %v hasn't been annotated with %v", kasPod.Name, nodeName, annotations.FallbackForRevision)
}

// filterByNodeName returns the subset of kasPods scheduled on currentNodeName.
func filterByNodeName(kasPods []corev1.Pod, currentNodeName string) []corev1.Pod {
	matched := []corev1.Pod{}
	for i := range kasPods {
		if kasPods[i].Spec.NodeName == currentNodeName {
			matched = append(matched, kasPods[i])
		}
	}
	return matched
}

// getDefaultUnsupportedConfigForCurrentPlatform returns a predefined config specific to the
// current platform that can be extended by the tests. It facilitates testing on an HA cluster
// by setting "startupMonitor: true", which enables the feature there.
func getDefaultUnsupportedConfigForCurrentPlatform(t testing.TB, cs clientSet) map[string]interface{} {
	t.Helper()

	infraConfiguration, err := cs.Infra.Get(context.TODO(), "cluster", metav1.GetOptions{})
	require.NoError(t, err)

	if infraConfiguration.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode {
		// on SNO no override is needed for the tests
		return map[string]interface{}{}
	}
	// on non-SNO topologies the feature must be explicitly enabled
	return map[string]interface{}{"startupMonitor": true}
}

// fallbackTimeoutsForCurrentPlatform provides various timeouts that are tailored for the current platform.
// TODO: add timeouts for AWS and GCP
// TODO: we should be able to return only a single per-platform specific timeout and derive the rest e.g. oneNodeRolloutTimeout
func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (clusterStateWaitPollInterval, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout time.Duration) {
	// Default timeouts that apply when the test is run on an SNO cluster:
	//
	// clusterStateWaitPollInterval: the max time after which the cluster is considered not ready;
	//   it should match waitForFallbackDegradedConditionTimeout because we don't know when the previous test finished.
	//
	// clusterMustBeReadyFor: the time the cluster must stay stable.
	//
	// waitForFallbackDegradedConditionTimeout: set to 10 min; it could be much lower —
	//   the static pod monitor needs 5 min to fall back to the previous revision, but we
	//   don't know yet how much time it takes to start a new API server, including the time
	//   the server needs to become ready and be noticed by a load balancer. The longer
	//   duration allows us to collect logs and the must-gather.
	return 10 * time.Minute, 1 * time.Minute, 10 * time.Minute
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.