Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1985997: Enable static pod fallback logic for SNO, with disruptive e2e test #1198

Merged
merged 4 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ require (
github.com/openshift/api v0.0.0-20210730095913-85e1d547cdee
github.com/openshift/build-machinery-go v0.0.0-20210712174854-1bb7fd1518d3
github.com/openshift/client-go v0.0.0-20210730113412-1811c1b3fc0e
github.com/openshift/library-go v0.0.0-20210803154958-0e70d0844e00
github.com/openshift/library-go v0.0.0-20210804150119-965974e0af3f
github.com/pkg/profile v1.5.0 // indirect
github.com/prometheus-operator/prometheus-operator/pkg/client v0.45.0
github.com/prometheus/client_golang v1.11.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -477,8 +477,8 @@ github.com/openshift/build-machinery-go v0.0.0-20210712174854-1bb7fd1518d3 h1:hY
github.com/openshift/build-machinery-go v0.0.0-20210712174854-1bb7fd1518d3/go.mod h1:b1BuldmJlbA/xYtdZvKi+7j5YGB44qJUJDZ9zwiNCfE=
github.com/openshift/client-go v0.0.0-20210730113412-1811c1b3fc0e h1:vhwzeXUxLd6JZlWZ+miBzTEpmVctHyHNq9z43ScYxWI=
github.com/openshift/client-go v0.0.0-20210730113412-1811c1b3fc0e/go.mod h1:P1pjphFOgm/nYjmtouHGaSLGtdP25dQICJnYtcYhfEs=
github.com/openshift/library-go v0.0.0-20210803154958-0e70d0844e00 h1:3Izqgf3q6goB/rg8T+J4g2plzAy7No7KN3h4en7V330=
github.com/openshift/library-go v0.0.0-20210803154958-0e70d0844e00/go.mod h1:3GagmGg6gikg+hAqma7E7axBzs2pjx4+GrAbdl4OYdY=
github.com/openshift/library-go v0.0.0-20210804150119-965974e0af3f h1:naAxBRhCtOnTKdW3Cb/XRE7hdC7gPTrZlJvBVqMGXJs=
github.com/openshift/library-go v0.0.0-20210804150119-965974e0af3f/go.mod h1:3GagmGg6gikg+hAqma7E7axBzs2pjx4+GrAbdl4OYdY=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=
Expand Down
32 changes: 14 additions & 18 deletions pkg/operator/startupmonitorreadiness/enablement.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@ import (
"bytes"
"encoding/json"

configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
"github.com/openshift/library-go/pkg/operator/v1helpers"

apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"

configv1 "github.com/openshift/api/config/v1"
configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
"github.com/openshift/library-go/pkg/operator/v1helpers"
)

// IsStartupMonitorEnabledFunction returns a function that determines if the startup monitor should be enabled on a cluster
Expand All @@ -20,16 +21,11 @@ func IsStartupMonitorEnabledFunction(infrastructureLister configlistersv1.Infras
return false, err
}

// TODO: uncomment before releasing 4.9
/*
if infra.Status.ControlPlaneTopology != configv1.SingleReplicaTopologyMode {
return false, nil
}
*/
if infra.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode {
return true, nil
}

// TODO: remove before releasing 4.9
_ = infra
startupMonitorExplicitlyEnabled := false
// for development and debugging
operatorSpec, _, _, err := operatorClient.GetOperatorState()
if err != nil {
return false, err
Expand All @@ -39,12 +35,12 @@ func IsStartupMonitorEnabledFunction(infrastructureLister configlistersv1.Infras
if err := json.NewDecoder(bytes.NewBuffer(operatorSpec.UnsupportedConfigOverrides.Raw)).Decode(&observedUnsupportedConfig); err != nil {
return false, err
}
startupMonitorExplicitlyEnabled, _, _ = unstructured.NestedBool(observedUnsupportedConfig, "startupMonitor")
}
if !startupMonitorExplicitlyEnabled {
return false, nil
enabled, found, err := unstructured.NestedBool(observedUnsupportedConfig, "startupMonitor")
if err == nil && found {
return enabled, nil
}
}
// End of TODO
return true, nil

return false, nil
}
}
36 changes: 36 additions & 0 deletions test/e2e-sno-disruptive/helpers.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package e2e_sno_disruptive

import (
"testing"

"github.com/stretchr/testify/require"

configv1client "github.com/openshift/client-go/config/clientset/versioned"
configv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
operatorv1client "github.com/openshift/client-go/operator/clientset/versioned/typed/operator/v1"
libgotest "github.com/openshift/library-go/test/library"

"k8s.io/client-go/kubernetes"
)

// clientSet bundles the typed API clients the disruptive SNO tests need.
type clientSet struct {
	// Infra reads the cluster Infrastructure resource (used to detect the control-plane topology).
	Infra configv1.InfrastructureInterface
	// Operator reads/updates the kube-apiserver operator resource ("cluster").
	Operator operatorv1client.KubeAPIServerInterface
	// Kube is a plain Kubernetes client, used to list the kube-apiserver static pods.
	Kube kubernetes.Interface
}

// getClients constructs the clientSet used by the disruptive tests from the
// test kubeconfig, failing the test immediately if any client cannot be built.
func getClients(t testing.TB) clientSet {
	t.Helper()

	restConfig, err := libgotest.NewClientConfigForTest()
	require.NoError(t, err)

	operatorClient, err := operatorv1client.NewForConfig(restConfig)
	require.NoError(t, err)

	configClient, err := configv1client.NewForConfig(restConfig)
	require.NoError(t, err)

	return clientSet{
		Infra:    configClient.ConfigV1().Infrastructures(),
		Operator: operatorClient.KubeAPIServers(),
		Kube:     kubernetes.NewForConfigOrDie(restConfig),
	}
}
239 changes: 236 additions & 3 deletions test/e2e-sno-disruptive/sno_disruptive_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,240 @@
package e2e_sno_disruptive

import "testing"
import (
"context"
"encoding/json"
"strconv"
"testing"
"time"

func TestFallback(t *testing.T) {
t.Log("implement me")
"github.com/stretchr/testify/require"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/library-go/pkg/operator/staticpod/startupmonitor/annotations"
"github.com/openshift/library-go/pkg/operator/v1helpers"
commontesthelpers "github.com/openshift/library-go/test/library/encryption"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
)

func TestFallback(tt *testing.T) {
t := commontesthelpers.NewE(tt)
cs := getClients(t)

t.Log("Starting the fallback test")
clusterStateWaitPollTimeout, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs)

// before starting a new test make sure the current state of the cluster is good
ensureClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)

// cause a disruption
cfg := getDefaultUnsupportedConfigForCurrentPlatform(t, cs)
cfg["apiServerArguments"] = map[string][]string{"non-existing-flag": {"true"}}
setUnsupportedConfig(t, cs, cfg)

// validate if the fallback condition is reported and the cluster is stable
waitForFallbackDegradedCondition(t, cs, waitForFallbackDegradedConditionTimeout)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can't we derive waitForFallbackDegradedConditionTimeout from some per-platform number? E.g. oneNodeRolloutTimeout

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added TODO

nodeName, failedRevision := assertFallbackOnNodeStatus(t, cs)
assertKasPodAnnotatedOnNode(t, cs, failedRevision, nodeName)

// clean up
setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
err := waitForClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
require.NoError(t, err)
}

// ensureClusterInGoodState makes sure the cluster is not progressing for the mustBeReadyFor period.
// In addition, in an HA environment it applies getDefaultUnsupportedConfigForCurrentPlatform so that
// the feature is enabled before the test starts.
func ensureClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
	require.NoError(t, waitForClusterInGoodState(t, cs, waitPollTimeout, mustBeReadyFor))
}

// waitForClusterInGoodState polls the kube-apiserver operator resource until every node
// has stopped progressing for at least mustBeReadyFor, or waitPollTimeout elapses.
func waitForClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
	t.Helper()

	start := time.Now()
	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 10s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)

	return wait.Poll(10*time.Second, waitPollTimeout, func() (bool, error) {
		kas, getErr := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
		if getErr != nil {
			t.Log(getErr)
			return false, nil // retry
		}

		for _, ns := range kas.Status.NodeStatuses {
			// a node is progressing when its current revision lags the latest or a rollout is targeted
			nodeProgressing := kas.Status.LatestAvailableRevision != ns.CurrentRevision || ns.TargetRevision > 0
			if nodeProgressing {
				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v", ns.NodeName, kas.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
				return false, nil // retry
			}
		}

		if time.Since(start) <= mustBeReadyFor {
			return false, nil // stable so far, wait a bit more
		}
		t.Logf("The cluster has been in good condition for %s", mustBeReadyFor.String())
		return true, nil // done
	})
}

// setUnsupportedConfig sets the operator's UnsupportedConfigOverrides to the provided cfg,
// retrying on any error (including update conflicts) with the default backoff.
func setUnsupportedConfig(t testing.TB, cs clientSet, cfg map[string]interface{}) {
	t.Helper()

	t.Logf("Setting UnsupportedConfigOverrides to %v", cfg)
	raw, err := json.Marshal(cfg)
	require.NoError(t, err)

	// retry every error: Get/Update failures here are transient by assumption
	alwaysRetry := func(error) bool { return true }
	updateErr := retry.OnError(retry.DefaultRetry, alwaysRetry, func() error {
		kas, getErr := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
		if getErr != nil {
			t.Log(getErr)
			return getErr
		}
		kas.Spec.UnsupportedConfigOverrides.Raw = raw
		if _, upErr := cs.Operator.Update(context.TODO(), kas, metav1.UpdateOptions{}); upErr != nil {
			t.Log(upErr)
			return upErr
		}
		return nil
	})
	require.NoError(t, updateErr)
}

// waitForFallbackDegradedCondition blocks until the StaticPodFallbackRevisionDegraded
// operator condition becomes true, failing the test after waitPollTimeout.
func waitForFallbackDegradedCondition(t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
	t.Helper()

	t.Logf("Waiting for StaticPodFallbackRevisionDegraded condition, interval = 20s, timeout = %v", waitPollTimeout)
	pollErr := wait.Poll(20*time.Second, waitPollTimeout, func() (bool, error) {
		kas, getErr := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
		if getErr != nil {
			t.Logf("unable to get kube-apiserver-operator resource: %v", getErr)
			return false, nil // retry
		}

		if !v1helpers.IsOperatorConditionTrue(kas.Status.Conditions, "StaticPodFallbackRevisionDegraded") {
			t.Logf("StaticPodFallbackRevisionDegraded condition hasn't been set yet")
			return false, nil // retry
		}
		return true, nil // done
	})
	require.NoError(t, pollErr)
}

// assertFallbackOnNodeStatus checks that exactly one NodeStatus reports the fallback
// (LastFallbackCount != 0) and returns that node's name and last failed revision.
// It fails the test when zero or more than one node reports the fallback.
func assertFallbackOnNodeStatus(t testing.TB, cs clientSet) (string, int32) {
	t.Helper()

	t.Log("Checking if a NodeStatus has been updated to report the fallback condition")

	ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
	require.NoError(t, err)

	var nodeName string
	var failedRevision int32
	for _, ns := range ckaso.Status.NodeStatuses {
		if ns.LastFallbackCount != 0 && len(nodeName) > 0 {
			t.Fatalf("multiple node statuses report the fallback, previously on node %v, revision %v, currently on node %v, revision %v", nodeName, failedRevision, ns.NodeName, ns.LastFailedRevision)
		}
		if ns.LastFallbackCount != 0 {
			nodeName = ns.NodeName
			failedRevision = ns.LastFailedRevision
		}
	}

	// previously an empty nodeName was silently returned, deferring the failure to a
	// confusing downstream assertion — fail here with a clear message instead
	if len(nodeName) == 0 {
		t.Fatal("no node status reported the fallback condition")
	}

	t.Logf("The fallback has been reported on node %v, failed revision is %v", nodeName, failedRevision)
	return nodeName, failedRevision
}

// assertKasPodAnnotatedOnNode verifies that the single kube-apiserver static pod on
// nodeName carries the fallback-for-revision annotation with a non-empty value that
// parses to a non-negative integer.
// NOTE(review): expectedFailedRevision is only logged — the parsed annotation value is
// never compared against it; confirm whether an equality check was intended.
func assertKasPodAnnotatedOnNode(t testing.TB, cs clientSet, expectedFailedRevision int32, nodeName string) {
	t.Helper()
	t.Logf("Verifying if a kube-apiserver pod has been annotated with revision: %v on node: %v", expectedFailedRevision, nodeName)

	// the kube-apiserver static pods carry the apiserver=true label in this namespace
	kasPods, err := cs.Kube.CoreV1().Pods("openshift-kube-apiserver").List(context.TODO(), metav1.ListOptions{LabelSelector: "apiserver=true"})
	require.NoError(t, err)

	var kasPod corev1.Pod
	filteredKasPods := filterByNodeName(kasPods.Items, nodeName)
	switch len(filteredKasPods) {
	case 0:
		t.Fatalf("expected to find the kube-apiserver static pod on node %s but haven't found any", nodeName)
	case 1:
		kasPod = filteredKasPods[0]
	default:
		// this should never happen for static pod as they are uniquely named for each node
		podsOnCurrentNode := []string{}
		for _, filteredKasPod := range filteredKasPods {
			podsOnCurrentNode = append(podsOnCurrentNode, filteredKasPod.Name)
		}
		t.Fatalf("multiple kube-apiserver static pods for node %s found: %v", nodeName, podsOnCurrentNode)
	}

	if fallbackFor, ok := kasPod.Annotations[annotations.FallbackForRevision]; ok {
		if len(fallbackFor) == 0 {
			t.Fatalf("empty fallback revision label: %v on %s pod", annotations.FallbackForRevision, kasPod.Name)
		}
		// the annotation value must be a non-negative integer revision
		revision, err := strconv.Atoi(fallbackFor)
		if err != nil || revision < 0 {
			t.Fatalf("invalid fallback revision: %v on pod: %s", fallbackFor, kasPod.Name)
		}
		return
	}

	t.Fatalf("kube-apiserver %v on node %v hasn't been annotated with %v", kasPod.Name, nodeName, annotations.FallbackForRevision)
}

// filterByNodeName returns the subset of kasPods scheduled on currentNodeName.
func filterByNodeName(kasPods []corev1.Pod, currentNodeName string) []corev1.Pod {
	matched := []corev1.Pod{}
	for i := range kasPods {
		if kasPods[i].Spec.NodeName == currentNodeName {
			matched = append(matched, kasPods[i])
		}
	}
	return matched
}

// getDefaultUnsupportedConfigForCurrentPlatform returns a predefined config specific to the
// current platform that can be extended by the tests. It facilitates testing on an HA cluster
// by setting "startupMonitor: true", which enables the feature there.
func getDefaultUnsupportedConfigForCurrentPlatform(t testing.TB, cs clientSet) map[string]interface{} {
	t.Helper()

	infraConfiguration, err := cs.Infra.Get(context.TODO(), "cluster", metav1.GetOptions{})
	require.NoError(t, err)

	if infraConfiguration.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode {
		// on SNO no override is needed for the tests
		return map[string]interface{}{}
	}
	// on non-SNO topologies the feature must be explicitly enabled
	return map[string]interface{}{"startupMonitor": true}
}

// fallbackTimeoutsForCurrentPlatform provides various timeouts that are tailored for the current platform.
// TODO: add timeouts for AWS and GCP
// TODO: we should be able to return only a single per-platform specific timeout and derive the rest e.g. oneNodeRolloutTimeout
func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (clusterStateWaitPollInterval, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout time.Duration) {
	// Default timeouts that apply when the test is run on an SNO cluster:
	//
	// clusterStateWaitPollInterval: the max time after which the cluster is considered not ready;
	//   it should match waitForFallbackDegradedConditionTimeout because we don't know when the previous test finished.
	//
	// clusterMustBeReadyFor: the time the cluster must stay stable.
	//
	// waitForFallbackDegradedConditionTimeout: set to 10 min; it could be much lower —
	//   the static pod monitor needs 5 min to fall back to the previous revision, but we
	//   don't know yet how much time it takes to start a new API server, including the time
	//   the server needs to become ready and be noticed by a load balancer. The longer
	//   duration allows us to collect logs and the must-gather.
	return 10 * time.Minute, 1 * time.Minute, 10 * time.Minute
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.