From 2af700f691210657c0a0f1a845588e41f8340713 Mon Sep 17 00:00:00 2001
From: Mustafa Elbehery
Date: Thu, 9 Mar 2023 13:45:51 +0100
Subject: [PATCH] update-etcd-scaling-test

---
 test/extended/etcd/helpers/helpers.go  | 104 ++++++++++++++++---------
 test/extended/etcd/vertical_scaling.go |   8 +-
 2 files changed, 75 insertions(+), 37 deletions(-)

diff --git a/test/extended/etcd/helpers/helpers.go b/test/extended/etcd/helpers/helpers.go
index 760b862f5235..571ac23ef477 100644
--- a/test/extended/etcd/helpers/helpers.go
+++ b/test/extended/etcd/helpers/helpers.go
@@ -28,6 +28,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/dynamic"
 	"k8s.io/client-go/kubernetes"
+	v1 "k8s.io/client-go/kubernetes/typed/core/v1"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 	"k8s.io/utils/pointer"
 )
@@ -35,6 +36,7 @@ import (
 const masterMachineLabelSelector = "machine.openshift.io/cluster-api-machine-role" + "=" + "master"
 const machineDeletionHookName = "EtcdQuorumOperator"
 const machineDeletionHookOwner = "clusteroperator/etcd"
+const masterNodeRoleLabel = "node-role.kubernetes.io/master"
 
 type TestingT interface {
 	Logf(format string, args ...interface{})
@@ -184,41 +186,31 @@ func recoverClusterToInitialStateIfNeeded(ctx context.Context, t TestingT, machi
 }
 
 func DeleteSingleMachine(ctx context.Context, t TestingT, machineClient machinev1beta1client.MachineInterface) (string, error) {
-	waitPollInterval := 15 * time.Second
-	waitPollTimeout := 5 * time.Minute
-	t.Logf("Waiting up to %s to delete a machine", waitPollTimeout.String())
-	machineToDelete := ""
-	err := wait.Poll(waitPollInterval, waitPollTimeout, func() (bool, error) {
-		machineList, err := machineClient.List(ctx, metav1.ListOptions{LabelSelector: masterMachineLabelSelector})
-		if err != nil {
-			return isTransientAPIError(t, err)
-		}
-
-		// Machine names are suffixed with an index number (e.g "ci-op-xlbdrkvl-6a467-qcbkh-master-0")
-		// so we sort to pick the lowest index, e.g master-0 in this example
-		machineNames := []string{}
-		for _, m := range machineList.Items {
-			machineNames = append(machineNames, m.Name)
-		}
-		sort.Strings(machineNames)
-		machineToDelete = machineNames[0]
-		t.Logf("attempting to delete machine %q", machineToDelete)
+	// list master machines
+	machineList, err := machineClient.List(ctx, metav1.ListOptions{LabelSelector: masterMachineLabelSelector})
+	if err != nil {
+		return "", fmt.Errorf("error listing master machines: %w", err)
+	}
+	// Machine names are suffixed with an index number (e.g "ci-op-xlbdrkvl-6a467-qcbkh-master-0")
+	// so we sort to pick the lowest index, e.g master-0 in this example
+	machineNames := []string{}
+	for _, m := range machineList.Items {
+		machineNames = append(machineNames, m.Name)
+	}
+	sort.Strings(machineNames)
+	machineToDelete := machineNames[0]
 
-		if err := machineClient.Delete(ctx, machineToDelete, metav1.DeleteOptions{}); err != nil {
-			// The machine we just listed should be present but if not, error out
-			if apierrors.IsNotFound(err) {
-				t.Logf("machine %q was listed but not found or already deleted", machineToDelete)
-				return false, fmt.Errorf("machine %q was listed but not found or already deleted", machineToDelete)
-			}
-			return isTransientAPIError(t, err)
+	t.Logf("attempting to delete machine %q", machineToDelete)
+	if err := machineClient.Delete(ctx, machineToDelete, metav1.DeleteOptions{}); err != nil {
+		if apierrors.IsNotFound(err) {
+			t.Logf("machine %q was listed but not found or already deleted", machineToDelete)
+			return "", nil
 		}
t.Logf("successfully deleted machine %q", machineToDelete) - - return true, nil - }) - - return machineToDelete, err + return "", err + } + t.Logf("successfully deleted machine '%q'", machineToDelete) + return machineToDelete, nil } // IsCPMSActive returns true if the current platform's has an active CPMS @@ -246,7 +238,7 @@ func IsCPMSActive(ctx context.Context, t TestingT, cpmsClient machinev1client.Co // EnsureReadyReplicasOnCPMS checks if status.readyReplicas on the cluster CPMS is n // this effectively counts the number of control-plane machines with the provider state as running -func EnsureReadyReplicasOnCPMS(ctx context.Context, t TestingT, expectedReplicaCount int, cpmsClient machinev1client.ControlPlaneMachineSetInterface) error { +func EnsureReadyReplicasOnCPMS(ctx context.Context, t TestingT, expectedReplicaCount int, cpmsClient machinev1client.ControlPlaneMachineSetInterface, nodeClient v1.NodeInterface) error { waitPollInterval := 5 * time.Second waitPollTimeout := 18 * time.Minute t.Logf("Waiting up to %s for the CPMS to have status.readyReplicas = %v", waitPollTimeout.String(), expectedReplicaCount) @@ -261,13 +253,55 @@ func EnsureReadyReplicasOnCPMS(ctx context.Context, t TestingT, expectedReplicaC t.Logf("expected %d ready replicas on CPMS, got: %v,", expectedReplicaCount, cpms.Status.ReadyReplicas) return false, nil } - t.Logf("CPMS has reached the desired number of ready replicas: %v,", cpms.Status.ReadyReplicas) + err = EnsureReadyMasterNodes(ctx, expectedReplicaCount, nodeClient) + if err != nil { + t.Logf("expected number of master nodes is not ready yet: '%w'", err) + return false, nil + } + return true, nil }) } +// EnsureReadyMasterNodes checks if the current master nodes matches the expected number of master nodes, +// and that all master nodes' are Ready +func EnsureReadyMasterNodes(ctx context.Context, expectedReplicaCount int, nodeClient v1.NodeInterface) error { + masterNodes, err := nodeClient.List(ctx, metav1.ListOptions{LabelSelector: masterNodeRoleLabel}) + if err != nil { + return fmt.Errorf("failed to list master nodes:'%w'", err) + } + + if len(masterNodes.Items) != expectedReplicaCount { + return fmt.Errorf("expected number of master nodes is '%d', but got '%d' instead", expectedReplicaCount, len(masterNodes.Items)) + } + + for _, node := range masterNodes.Items { + for _, condition := range node.Status.Conditions { + if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue { + return fmt.Errorf("master node '%v' is not ready", node) + } + } + } + + return nil +} + +// EnsureCPMSReplicasConverged returns error if the number of expected master machines not equals the number of actual master machines +// otherwise it returns nil +func EnsureCPMSReplicasConverged(ctx context.Context, cpmsClient machinev1client.ControlPlaneMachineSetInterface) error { + cpms, err := cpmsClient.Get(ctx, "cluster", metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get controlPlaneMachineSet object: '%w'", err) + } + + if *cpms.Spec.Replicas != cpms.Status.ReadyReplicas { + return fmt.Errorf("CPMS replicas failed to converge, expected status.readyReplicas '%d' to be equal to spec.replicas '%v'", cpms.Status.ReadyReplicas, cpms.Spec.Replicas) + } + return nil +} + // EnsureVotingMembersCount counts the number of voting etcd members, it doesn't evaluate health conditions or any other attributes (i.e. 
 // this method won't fail immediately on errors, this is useful during scaling down operation until the feature can ensure this operation to be graceful
 func EnsureVotingMembersCount(ctx context.Context, t TestingT, etcdClientFactory EtcdClientCreator, kubeClient kubernetes.Interface, expectedMembersCount int) error {
@@ -383,7 +417,7 @@ func MachineNameToEtcdMemberName(ctx context.Context, kubeClient kubernetes.Inte
 		return "", err
 	}
 
-	masterNodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: "node-role.kubernetes.io/master"})
+	masterNodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: masterNodeRoleLabel})
 	if err != nil {
 		return "", err
 	}
diff --git a/test/extended/etcd/vertical_scaling.go b/test/extended/etcd/vertical_scaling.go
index 3b9c31f4951b..17f0c4a9662d 100644
--- a/test/extended/etcd/vertical_scaling.go
+++ b/test/extended/etcd/vertical_scaling.go
@@ -44,6 +44,7 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
 		machineClientSet, err := machineclient.NewForConfig(oc.KubeFramework().ClientConfig())
 		o.Expect(err).ToNot(o.HaveOccurred())
 		machineClient := machineClientSet.MachineV1beta1().Machines("openshift-machine-api")
+		nodeClient := oc.KubeClient().CoreV1().Nodes()
 		cpmsClient := machineClientSet.MachineV1().ControlPlaneMachineSets("openshift-machine-api")
 		kubeClient := oc.KubeClient()
 
@@ -76,7 +77,7 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
 		// step 2: wait until the CPMSO scales-up by creating a new machine
 		// We need to check the cpms' status.readyReplicas because the phase of one machine will always be Deleting
 		// so we can't use EnsureMasterMachinesAndCount() since that counts for machines that aren't pending deletion
-		err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 4, cpmsClient)
+		err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 4, cpmsClient, nodeClient)
 		err = errors.Wrap(err, "scale-up: timed out waiting for CPMS to show 4 ready replicas")
 		o.Expect(err).ToNot(o.HaveOccurred())
 
@@ -87,7 +88,7 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
 		// successfully
 
 		// step 3: wait for automatic scale-down as the replica count goes back down to 3
-		err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 3, cpmsClient)
+		err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 3, cpmsClient, nodeClient)
 		err = errors.Wrap(err, "scale-down: timed out waiting for CPMS to show 3 ready replicas")
 		o.Expect(err).ToNot(o.HaveOccurred())
 
@@ -109,6 +110,9 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
 		err = errors.Wrap(err, "scale-up: timed out waiting for APIServer pods to stabilize on the same revision")
 		o.Expect(err).ToNot(o.HaveOccurred())
 
+		err = scalingtestinglibrary.EnsureCPMSReplicasConverged(ctx, cpmsClient)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
 		return
 	}
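
For reference, and not as part of the patch itself: EnsureCPMSReplicasConverged is called exactly once at the end of the test, so a CPMS that is still settling would fail the assertion immediately. If that single check ever proves racy, the helper could be wrapped in a poll along the lines of the sketch below. The sketch assumes it would live in the same helpers package, reusing its existing imports (context, time, wait, machinev1client) and the TestingT interface; the poll interval and timeout values are illustrative assumptions, not values taken from this change.

// WaitForCPMSReplicasConverged is an illustrative wrapper (not part of this patch)
// that retries the convergence check instead of asserting it a single time.
func WaitForCPMSReplicasConverged(ctx context.Context, t TestingT, cpmsClient machinev1client.ControlPlaneMachineSetInterface) error {
	// Assumed poll values; the patch itself performs one immediate check.
	return wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
		if err := EnsureCPMSReplicasConverged(ctx, cpmsClient); err != nil {
			// Not converged yet; log and keep polling until the timeout expires.
			t.Logf("CPMS replicas have not converged yet: %v", err)
			return false, nil
		}
		return true, nil
	})
}

The test in vertical_scaling.go would then call this wrapper in place of the direct EnsureCPMSReplicasConverged call.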