[release-4.13] OCPBUGS-12700: update-etcd-scaling-test #27892

Merged
104 changes: 69 additions & 35 deletions test/extended/etcd/helpers/helpers.go
@@ -28,13 +28,15 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
v1 "k8s.io/client-go/kubernetes/typed/core/v1"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
"k8s.io/utils/pointer"
)

const masterMachineLabelSelector = "machine.openshift.io/cluster-api-machine-role" + "=" + "master"
const machineDeletionHookName = "EtcdQuorumOperator"
const machineDeletionHookOwner = "clusteroperator/etcd"
const masterNodeRoleLabel = "node-role.kubernetes.io/master"

type TestingT interface {
Logf(format string, args ...interface{})
@@ -184,41 +186,31 @@ func recoverClusterToInitialStateIfNeeded(ctx context.Context, t TestingT, machi
}

func DeleteSingleMachine(ctx context.Context, t TestingT, machineClient machinev1beta1client.MachineInterface) (string, error) {
waitPollInterval := 15 * time.Second
waitPollTimeout := 5 * time.Minute
t.Logf("Waiting up to %s to delete a machine", waitPollTimeout.String())

machineToDelete := ""
err := wait.Poll(waitPollInterval, waitPollTimeout, func() (bool, error) {
machineList, err := machineClient.List(ctx, metav1.ListOptions{LabelSelector: masterMachineLabelSelector})
if err != nil {
return isTransientAPIError(t, err)
}

// Machine names are suffixed with an index number (e.g "ci-op-xlbdrkvl-6a467-qcbkh-master-0")
// so we sort to pick the lowest index, e.g master-0 in this example
machineNames := []string{}
for _, m := range machineList.Items {
machineNames = append(machineNames, m.Name)
}
sort.Strings(machineNames)
machineToDelete = machineNames[0]
t.Logf("attempting to delete machine %q", machineToDelete)
// list master machines
machineList, err := machineClient.List(ctx, metav1.ListOptions{LabelSelector: masterMachineLabelSelector})
if err != nil {
return "", fmt.Errorf("error listing master machines: '%w'", err)
}
// Machine names are suffixed with an index number (e.g "ci-op-xlbdrkvl-6a467-qcbkh-master-0")
// so we sort to pick the lowest index, e.g master-0 in this example
machineNames := []string{}
for _, m := range machineList.Items {
machineNames = append(machineNames, m.Name)
}
sort.Strings(machineNames)
machineToDelete = machineNames[0]

if err := machineClient.Delete(ctx, machineToDelete, metav1.DeleteOptions{}); err != nil {
// The machine we just listed should be present but if not, error out
if apierrors.IsNotFound(err) {
t.Logf("machine %q was listed but not found or already deleted", machineToDelete)
return false, fmt.Errorf("machine %q was listed but not found or already deleted", machineToDelete)
}
return isTransientAPIError(t, err)
t.Logf("attempting to delete machine '%q'", machineToDelete)
if err := machineClient.Delete(ctx, machineToDelete, metav1.DeleteOptions{}); err != nil {
if apierrors.IsNotFound(err) {
t.Logf("machine '%q' was listed but not found or already deleted", machineToDelete)
return "", nil
}
t.Logf("successfully deleted machine %q", machineToDelete)

return true, nil
})

return machineToDelete, err
return "", err
}
t.Logf("successfully deleted machine '%q'", machineToDelete)
return machineToDelete, nil
}

// IsCPMSActive returns true if the current platform has an active CPMS
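The body of IsCPMSActive is collapsed in this diff. As a rough sketch only (not the repository's implementation), such a check could fetch the cluster CPMS and compare its spec.state against the Active state; the constant and client names below are assumptions based on the openshift/api and openshift/client-go machine v1 packages:

import (
	"context"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	machinev1 "github.com/openshift/api/machine/v1"
	machinev1client "github.com/openshift/client-go/machine/clientset/versioned/typed/machine/v1"
)

// isCPMSActiveSketch is a hypothetical helper, not the function under review.
func isCPMSActiveSketch(ctx context.Context, cpmsClient machinev1client.ControlPlaneMachineSetInterface) (bool, error) {
	// The control-plane machine set is a singleton named "cluster".
	cpms, err := cpmsClient.Get(ctx, "cluster", metav1.GetOptions{})
	if apierrors.IsNotFound(err) {
		// No CPMS on this platform, so automatic control-plane scaling is unavailable.
		return false, nil
	}
	if err != nil {
		return false, err
	}
	// Only an Active CPMS reconciles the control-plane machines.
	return cpms.Spec.State == machinev1.ControlPlaneMachineSetStateActive, nil
}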
@@ -246,7 +238,7 @@ func IsCPMSActive(ctx context.Context, t TestingT, cpmsClient machinev1client.Co

// EnsureReadyReplicasOnCPMS checks if status.readyReplicas on the cluster CPMS is n
// this effectively counts the number of control-plane machines with the provider state as running
func EnsureReadyReplicasOnCPMS(ctx context.Context, t TestingT, expectedReplicaCount int, cpmsClient machinev1client.ControlPlaneMachineSetInterface) error {
func EnsureReadyReplicasOnCPMS(ctx context.Context, t TestingT, expectedReplicaCount int, cpmsClient machinev1client.ControlPlaneMachineSetInterface, nodeClient v1.NodeInterface) error {
waitPollInterval := 5 * time.Second
waitPollTimeout := 18 * time.Minute
t.Logf("Waiting up to %s for the CPMS to have status.readyReplicas = %v", waitPollTimeout.String(), expectedReplicaCount)
@@ -261,13 +253,55 @@ func EnsureReadyReplicasOnCPMS(ctx context.Context, t TestingT, expectedReplicaC
t.Logf("expected %d ready replicas on CPMS, got: %v,", expectedReplicaCount, cpms.Status.ReadyReplicas)
return false, nil
}

t.Logf("CPMS has reached the desired number of ready replicas: %v,", cpms.Status.ReadyReplicas)

err = EnsureReadyMasterNodes(ctx, expectedReplicaCount, nodeClient)
if err != nil {
t.Logf("expected number of master nodes is not ready yet: '%w'", err)
return false, nil
}

return true, nil
})
}

// EnsureReadyMasterNodes checks that the number of master nodes matches the expected replica count,
// and that all master nodes are Ready
func EnsureReadyMasterNodes(ctx context.Context, expectedReplicaCount int, nodeClient v1.NodeInterface) error {
masterNodes, err := nodeClient.List(ctx, metav1.ListOptions{LabelSelector: masterNodeRoleLabel})
if err != nil {
return fmt.Errorf("failed to list master nodes:'%w'", err)
}

if len(masterNodes.Items) != expectedReplicaCount {
return fmt.Errorf("expected number of master nodes is '%d', but got '%d' instead", expectedReplicaCount, len(masterNodes.Items))
}

for _, node := range masterNodes.Items {
for _, condition := range node.Status.Conditions {
if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue {
return fmt.Errorf("master node '%v' is not ready", node)
}
}
}

return nil
}

// EnsureCPMSReplicasConverged returns an error if the number of actual master machines does not equal the expected number of master machines,
// otherwise it returns nil
func EnsureCPMSReplicasConverged(ctx context.Context, cpmsClient machinev1client.ControlPlaneMachineSetInterface) error {
cpms, err := cpmsClient.Get(ctx, "cluster", metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to get controlPlaneMachineSet object: '%w'", err)
}

if *cpms.Spec.Replicas != cpms.Status.ReadyReplicas {
return fmt.Errorf("CPMS replicas failed to converge, expected status.readyReplicas '%d' to be equal to spec.replicas '%v'", cpms.Status.ReadyReplicas, cpms.Spec.Replicas)
}
return nil
}

// EnsureVotingMembersCount counts the number of voting etcd members; it doesn't evaluate health conditions or any other attributes (e.g. name) of individual members.
// This method won't fail immediately on errors; that is useful during the scale-down operation until the feature can ensure the operation is graceful.
func EnsureVotingMembersCount(ctx context.Context, t TestingT, etcdClientFactory EtcdClientCreator, kubeClient kubernetes.Interface, expectedMembersCount int) error {
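The body of EnsureVotingMembersCount is collapsed here as well. As an illustration only, counting voting members boils down to listing the etcd members and filtering out learners; the sketch below assumes direct access to a go.etcd.io/etcd/client/v3 client rather than the test's EtcdClientCreator:

import (
	"context"

	clientv3 "go.etcd.io/etcd/client/v3"
)

// countVotingMembers is a hypothetical helper: it returns how many etcd members
// are full voting members, i.e. not learners.
func countVotingMembers(ctx context.Context, etcdClient *clientv3.Client) (int, error) {
	resp, err := etcdClient.MemberList(ctx)
	if err != nil {
		return 0, err
	}
	voting := 0
	for _, member := range resp.Members {
		if !member.IsLearner {
			voting++
		}
	}
	return voting, nil
}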
@@ -383,7 +417,7 @@ func MachineNameToEtcdMemberName(ctx context.Context, kubeClient kubernetes.Inte
return "", err
}

masterNodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: "node-role.kubernetes.io/master"})
masterNodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: masterNodeRoleLabel})
if err != nil {
return "", err
}
8 changes: 6 additions & 2 deletions test/extended/etcd/vertical_scaling.go
@@ -44,6 +44,7 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
machineClientSet, err := machineclient.NewForConfig(oc.KubeFramework().ClientConfig())
o.Expect(err).ToNot(o.HaveOccurred())
machineClient := machineClientSet.MachineV1beta1().Machines("openshift-machine-api")
nodeClient := oc.KubeClient().CoreV1().Nodes()
cpmsClient := machineClientSet.MachineV1().ControlPlaneMachineSets("openshift-machine-api")
kubeClient := oc.KubeClient()

@@ -76,7 +77,7 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
// step 2: wait until the CPMSO scales up by creating a new machine
// We need to check the cpms' status.readyReplicas because the phase of one machine will always be Deleting
// so we can't use EnsureMasterMachinesAndCount() since that only counts machines that aren't pending deletion
err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 4, cpmsClient)
err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 4, cpmsClient, nodeClient)
err = errors.Wrap(err, "scale-up: timed out waiting for CPMS to show 4 ready replicas")
o.Expect(err).ToNot(o.HaveOccurred())

@@ -87,7 +88,7 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
// successfully

// step 3: wait for automatic scale-down as the replica count goes back down to 3
err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 3, cpmsClient)
err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 3, cpmsClient, nodeClient)
err = errors.Wrap(err, "scale-down: timed out waiting for CPMS to show 3 ready replicas")
o.Expect(err).ToNot(o.HaveOccurred())

@@ -109,6 +110,9 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
err = errors.Wrap(err, "scale-up: timed out waiting for APIServer pods to stabilize on the same revision")
o.Expect(err).ToNot(o.HaveOccurred())

err = scalingtestinglibrary.EnsureCPMSReplicasConverged(ctx, cpmsClient)
o.Expect(err).ToNot(o.HaveOccurred())

return
}
