-
Notifications
You must be signed in to change notification settings - Fork 157
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bug 1985997: Enable static pod fallback logic for SNO, with disruptive e2e test #1198
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package e2e_sno_disruptive | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/stretchr/testify/require" | ||
|
||
configv1client "github.com/openshift/client-go/config/clientset/versioned" | ||
configv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" | ||
operatorv1client "github.com/openshift/client-go/operator/clientset/versioned/typed/operator/v1" | ||
libgotest "github.com/openshift/library-go/test/library" | ||
|
||
"k8s.io/client-go/kubernetes" | ||
) | ||
|
||
type clientSet struct { | ||
Infra configv1.InfrastructureInterface | ||
Operator operatorv1client.KubeAPIServerInterface | ||
Kube kubernetes.Interface | ||
} | ||
|
||
func getClients(t testing.TB) clientSet { | ||
t.Helper() | ||
|
||
kubeConfig, err := libgotest.NewClientConfigForTest() | ||
require.NoError(t, err) | ||
kubeClient := kubernetes.NewForConfigOrDie(kubeConfig) | ||
|
||
operatorClient, err := operatorv1client.NewForConfig(kubeConfig) | ||
require.NoError(t, err) | ||
|
||
configClient, err := configv1client.NewForConfig(kubeConfig) | ||
require.NoError(t, err) | ||
|
||
return clientSet{Infra: configClient.ConfigV1().Infrastructures(), Operator: operatorClient.KubeAPIServers(), Kube: kubeClient} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,240 @@ | ||
package e2e_sno_disruptive | ||
|
||
import "testing" | ||
import ( | ||
"context" | ||
"encoding/json" | ||
"strconv" | ||
"testing" | ||
"time" | ||
|
||
func TestFallback(t *testing.T) { | ||
t.Log("implement me") | ||
"github.com/stretchr/testify/require" | ||
|
||
configv1 "github.com/openshift/api/config/v1" | ||
"github.com/openshift/library-go/pkg/operator/staticpod/startupmonitor/annotations" | ||
"github.com/openshift/library-go/pkg/operator/v1helpers" | ||
commontesthelpers "github.com/openshift/library-go/test/library/encryption" | ||
|
||
corev1 "k8s.io/api/core/v1" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/apimachinery/pkg/util/wait" | ||
"k8s.io/client-go/util/retry" | ||
) | ||
|
||
func TestFallback(tt *testing.T) { | ||
t := commontesthelpers.NewE(tt) | ||
cs := getClients(t) | ||
|
||
t.Log("Starting the fallback test") | ||
clusterStateWaitPollTimeout, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs) | ||
|
||
// before starting a new test make sure the current state of the cluster is good | ||
ensureClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor) | ||
|
||
// cause a disruption | ||
cfg := getDefaultUnsupportedConfigForCurrentPlatform(t, cs) | ||
cfg["apiServerArguments"] = map[string][]string{"non-existing-flag": {"true"}} | ||
setUnsupportedConfig(t, cs, cfg) | ||
|
||
// validate if the fallback condition is reported and the cluster is stable | ||
waitForFallbackDegradedCondition(t, cs, waitForFallbackDegradedConditionTimeout) | ||
nodeName, failedRevision := assertFallbackOnNodeStatus(t, cs) | ||
assertKasPodAnnotatedOnNode(t, cs, failedRevision, nodeName) | ||
|
||
// clean up | ||
setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs)) | ||
err := waitForClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor) | ||
require.NoError(t, err) | ||
} | ||
|
||
// ensureClusterInGoodState makes sure the cluster is not progressing for mustBeReadyFor period | ||
// in addition in an HA env it applies getDefaultUnsupportedConfigForCurrentPlatform so that the feature is enabled before the tests starts | ||
func ensureClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) { | ||
setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs)) | ||
err := waitForClusterInGoodState(t, cs, waitPollTimeout, mustBeReadyFor) | ||
require.NoError(t, err) | ||
} | ||
|
||
// waitForClusterInGoodState checks if the cluster is not progressing | ||
func waitForClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error { | ||
t.Helper() | ||
|
||
startTs := time.Now() | ||
t.Logf("Waiting %s for the cluster to be in a good condition, interval = 10s, timeout %v", mustBeReadyFor.String(), waitPollTimeout) | ||
|
||
return wait.Poll(10*time.Second, waitPollTimeout, func() (bool, error) { | ||
ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{}) | ||
if err != nil { | ||
t.Log(err) | ||
return false, nil /*retry*/ | ||
} | ||
|
||
for _, ns := range ckaso.Status.NodeStatuses { | ||
if ckaso.Status.LatestAvailableRevision != ns.CurrentRevision || ns.TargetRevision > 0 { | ||
t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v", ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision) | ||
return false, nil /*retry*/ | ||
} | ||
} | ||
|
||
if time.Since(startTs) > mustBeReadyFor { | ||
t.Logf("The cluster has been in good condition for %s", mustBeReadyFor.String()) | ||
return true, nil /*done*/ | ||
} | ||
return false, nil /*wait a bit more*/ | ||
}) | ||
} | ||
|
||
// setUnsupportedConfig simply sets UnsupportedConfigOverrides config to the provided cfg | ||
func setUnsupportedConfig(t testing.TB, cs clientSet, cfg map[string]interface{}) { | ||
t.Helper() | ||
|
||
t.Logf("Setting UnsupportedConfigOverrides to %v", cfg) | ||
raw, err := json.Marshal(cfg) | ||
require.NoError(t, err) | ||
|
||
err = retry.OnError(retry.DefaultRetry, func(error) bool { return true }, func() error { | ||
ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{}) | ||
if err != nil { | ||
t.Log(err) | ||
return err | ||
} | ||
ckaso.Spec.UnsupportedConfigOverrides.Raw = raw | ||
_, err = cs.Operator.Update(context.TODO(), ckaso, metav1.UpdateOptions{}) | ||
if err != nil { | ||
t.Log(err) | ||
} | ||
return err | ||
}) | ||
require.NoError(t, err) | ||
} | ||
|
||
// waitForFallbackDegradedCondition waits until StaticPodFallbackRevisionDegraded condition is set to true | ||
func waitForFallbackDegradedCondition(t testing.TB, cs clientSet, waitPollTimeout time.Duration) { | ||
t.Helper() | ||
|
||
t.Logf("Waiting for StaticPodFallbackRevisionDegraded condition, interval = 20s, timeout = %v", waitPollTimeout) | ||
err := wait.Poll(20*time.Second, waitPollTimeout, func() (bool, error) { | ||
ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{}) | ||
if err != nil { | ||
t.Logf("unable to get kube-apiserver-operator resource: %v", err) | ||
return false, nil /*retry*/ | ||
} | ||
|
||
if v1helpers.IsOperatorConditionTrue(ckaso.Status.Conditions, "StaticPodFallbackRevisionDegraded") { | ||
return true, nil /*done*/ | ||
} | ||
|
||
t.Logf("StaticPodFallbackRevisionDegraded condition hasn't been set yet") | ||
return false, nil /*retry*/ | ||
}) | ||
require.NoError(t, err) | ||
} | ||
|
||
func assertFallbackOnNodeStatus(t testing.TB, cs clientSet) (string, int32) { | ||
t.Helper() | ||
|
||
t.Log("Checking if a NodeStatus has been updated to report the fallback condition") | ||
|
||
ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{}) | ||
require.NoError(t, err) | ||
|
||
var nodeName string | ||
var failedRevision int32 | ||
for _, ns := range ckaso.Status.NodeStatuses { | ||
if ns.LastFallbackCount != 0 && len(nodeName) > 0 { | ||
t.Fatalf("multiple node statuses report the fallback, previously on node %v, revision %v, currently on node %v, revision %v", nodeName, failedRevision, ns.NodeName, ns.LastFailedRevision) | ||
} | ||
if ns.LastFallbackCount != 0 { | ||
nodeName = ns.NodeName | ||
failedRevision = ns.LastFailedRevision | ||
} | ||
} | ||
|
||
t.Logf("The fallback has been reported on node %v, failed revision is %v", nodeName, failedRevision) | ||
return nodeName, failedRevision | ||
} | ||
|
||
func assertKasPodAnnotatedOnNode(t testing.TB, cs clientSet, expectedFailedRevision int32, nodeName string) { | ||
t.Helper() | ||
t.Logf("Verifying if a kube-apiserver pod has been annotated with revision: %v on node: %v", expectedFailedRevision, nodeName) | ||
|
||
kasPods, err := cs.Kube.CoreV1().Pods("openshift-kube-apiserver").List(context.TODO(), metav1.ListOptions{LabelSelector: "apiserver=true"}) | ||
require.NoError(t, err) | ||
|
||
var kasPod corev1.Pod | ||
filteredKasPods := filterByNodeName(kasPods.Items, nodeName) | ||
switch len(filteredKasPods) { | ||
case 0: | ||
t.Fatalf("expected to find the kube-apiserver static pod on node %s but haven't found any", nodeName) | ||
case 1: | ||
kasPod = filteredKasPods[0] | ||
default: | ||
// this should never happen for static pod as they are uniquely named for each node | ||
podsOnCurrentNode := []string{} | ||
for _, filteredKasPod := range filteredKasPods { | ||
podsOnCurrentNode = append(podsOnCurrentNode, filteredKasPod.Name) | ||
} | ||
t.Fatalf("multiple kube-apiserver static pods for node %s found: %v", nodeName, podsOnCurrentNode) | ||
} | ||
|
||
if fallbackFor, ok := kasPod.Annotations[annotations.FallbackForRevision]; ok { | ||
if len(fallbackFor) == 0 { | ||
t.Fatalf("empty fallback revision label: %v on %s pod", annotations.FallbackForRevision, kasPod.Name) | ||
} | ||
revision, err := strconv.Atoi(fallbackFor) | ||
if err != nil || revision < 0 { | ||
t.Fatalf("invalid fallback revision: %v on pod: %s", fallbackFor, kasPod.Name) | ||
} | ||
return | ||
} | ||
|
||
t.Fatalf("kube-apiserver %v on node %v hasn't been annotated with %v", kasPod.Name, nodeName, annotations.FallbackForRevision) | ||
} | ||
|
||
func filterByNodeName(kasPods []corev1.Pod, currentNodeName string) []corev1.Pod { | ||
filteredKasPods := []corev1.Pod{} | ||
|
||
for _, potentialKasPods := range kasPods { | ||
if potentialKasPods.Spec.NodeName == currentNodeName { | ||
filteredKasPods = append(filteredKasPods, potentialKasPods) | ||
} | ||
} | ||
|
||
return filteredKasPods | ||
} | ||
|
||
// getDefaultUnsupportedConfigForCurrentPlatform returns a predefined config specific to the current platform that can be extended by the tests | ||
// it facilitates testing on an HA cluster by setting "startupMonitor:true" which enables the feature | ||
func getDefaultUnsupportedConfigForCurrentPlatform(t testing.TB, cs clientSet) map[string]interface{} { | ||
t.Helper() | ||
|
||
infraConfiguration, err := cs.Infra.Get(context.TODO(), "cluster", metav1.GetOptions{}) | ||
require.NoError(t, err) | ||
|
||
if infraConfiguration.Status.ControlPlaneTopology != configv1.SingleReplicaTopologyMode { | ||
return map[string]interface{}{"startupMonitor": true} | ||
} | ||
|
||
return map[string]interface{}{} | ||
} | ||
|
||
// fallbackTimeoutsForCurrentPlatform provides various timeouts that are tailored for the current platform | ||
// TODO: add timeouts for AWS and GCP | ||
// TODO: we should be able to return only a single per-platform specific timeout and derive the rest e.g. oneNodeRolloutTimeout | ||
func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Duration, time.Duration, time.Duration) { | ||
/* | ||
default timeouts that apply when the test is run on an SNO cluster | ||
|
||
clusterStateWaitPollInterval: is the max time after the cluster is considered not ready | ||
it should match waitForFallbackDegradedConditionTimeout | ||
because we don't know when the previous test finished | ||
|
||
clusterMustBeReadyFor: the time the cluster must stay stable | ||
|
||
waitForFallbackDegradedConditionTimeout: set to 10 min, it should be much lower | ||
the static pod monitor needs 5 min to fallback to the previous revision | ||
but we don't know yet how much time it takes to start a new api server | ||
including the time the server needs to become ready and be noticed by a Load Balancer | ||
longer duration allows as to collect logs and the must-gather | ||
*/ | ||
return 10 * time.Minute /*clusterStateWaitPollInterval*/, 1 * time.Minute /*clusterMustBeReadyFor*/, 10 * time.Minute /*waitForFallbackDegradedConditionTimeout*/ | ||
} |
2 changes: 1 addition & 1 deletion
2
.../openshift/library-go/pkg/operator/staticpod/controller/installer/installer_controller.go
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can't we derive waitForFallbackDegradedConditionTimeout from some per-platform number, e.g. oneNodeRolloutTimeout?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added
TODO