diff --git a/docs/scaling/010_overview.adoc b/docs/scaling/010_overview.adoc
index 4ed10e86d..4c2f7deb6 100644
--- a/docs/scaling/010_overview.adoc
+++ b/docs/scaling/010_overview.adoc
@@ -41,9 +41,114 @@ NOTE: The Operator will only apply safe scaling functionality to deployments tha
 If a deployment is storage disabled then it can be scaled up or down by the required number of members
 in one step as there is no fear of data loss in a storage disabled member.
 
+[#problem]
+== Kubectl Scale & Autoscaling
+
+When using the `kubectl scale` command or the Kubernetes autoscaler to scale Coherence clusters, it is important
+to be aware of how replicas are controlled when updates are later applied to a cluster.
+
+=== The Problem
+
+If the replica count has been changed via one of the scaling commands, then when the cluster is later updated using
+an edited version of the original YAML the replicas may be reverted and the cluster inadvertently resized.
+
+For example, a Coherence cluster can be defined with the `storage-cluster.yaml` file shown below.
+
+[source,yaml]
+.storage-cluster.yaml
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: storage
+spec:
+  replicas: 6
+----
+
+Applying the YAML file will create a cluster with six Pods.
+[source,bash]
+----
+kubectl apply -f storage-cluster.yaml
+----
+
+The cluster can then be scaled using `kubectl scale`:
+[source,bash]
+----
+kubectl scale coh/storage --replicas=9
+----
+
+The cluster now has nine Pods. Later, an update is applied to add a new system property to the JVM by
+editing the original YAML file:
+
+[source,yaml]
+.storage-cluster.yaml
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: storage
+spec:
+  replicas: 6
+  jvm:
+    args:
+      - "-Dfoo=bar"
+----
+
+The update can be applied using `kubectl apply`:
+[source,bash]
+----
+kubectl apply -f storage-cluster.yaml
+----
+
+However, the YAML file still contains the original replica count, so the cluster will be scaled back from nine to six Pods.
+This is probably not what was desired.
+
+One solution is to always make sure the `replicas` field in the YAML is set to the "correct" value before applying it.
+This can be awkward in some environments, especially when using the Kubernetes HPA to control scaling.
+
+=== The Solution
+
+If you intend to use `kubectl scale` or the Kubernetes HPA to control scaling, then it is best not to set the
+`replicas` field in the YAML file used to create and update the Coherence cluster.
+
+For example, the initial YAML above could have been created like this:
+
+[source,yaml]
+.storage-cluster.yaml
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: storage
+----
+
+After applying this YAML, the cluster will start at the default size of three Pods.
+After creation, the cluster can easily be scaled up to its required size using `kubectl scale`.
+
+Later, when applying the system property update, the `replicas` field is again left unset in the YAML file:
+[source,yaml]
+.storage-cluster.yaml
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: storage
+spec:
+  jvm:
+    args:
+      - "-Dfoo=bar"
+----
+
+Now, when the above YAML is applied using `kubectl apply`, the replicas value for the cluster will be unchanged and
+will stay at whatever value it was scaled to.
+
+Another solution is to create the initial YAML with the `replicas` field set to the desired initial size and then,
+when applying later updates, ensure that the `replicas` field has been deleted from the YAML file.
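+
+When scaling is controlled by the Kubernetes HPA, the autoscaler targets the `Coherence` resource via its scale
+sub-resource, so the same considerations about the `replicas` field apply. The HPA definition below is only an
+illustrative sketch; the resource names, metric and thresholds are assumptions, and a real configuration would
+typically use custom Coherence metrics as described in the autoscaler example.
+
+[source,yaml]
+.storage-hpa.yaml
+----
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: storage-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: coherence.oracle.com/v1
+    kind: Coherence
+    name: storage
+  minReplicas: 3
+  maxReplicas: 9
+  metrics:
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: 80
+----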
+
+
 == Controlling Safe Scaling
 
-The `Coherence` CRD has a number of fields that control the behaviour of scaling.
+The `Coherence` CRD has a number of fields that control the behavior of scaling.
 
 === Scaling Policy
diff --git a/examples/200_autoscaler/README.adoc b/examples/200_autoscaler/README.adoc
index 6ca412658..78c0cd007 100644
--- a/examples/200_autoscaler/README.adoc
+++ b/examples/200_autoscaler/README.adoc
@@ -32,12 +32,18 @@ The diagram below shows, at a high level, how this works.
 
 image::images/autoscaler.png[]
 
-Prometheus will obtain metrics from the Coherence Pod's metrics endpoints.
+Prometheus will get metrics from the Coherence Pod's metrics endpoints.
 
 The Prometheus Adapter exposes certain configured metrics polled from Prometheus as custom Kubernetes metrics.
 
 The HPA is configured to poll the custom metrics and use those to scale the `Coherence` resource
 (which will in turn cause the Coherence Operator to scale the `StatefulSet`).
 
+[IMPORTANT]
+====
+When using the HPA, it is important to understand how the `replicas` field in the Coherence resource
+is applied, to avoid inadvertently resizing an autoscaled cluster when applying updates to the YAML.
+See the explanation in <>
+====
 
 == Autoscaling Coherence Clusters
 
diff --git a/test/certification/certifiy_deployment_test.go b/test/certification/certifiy_deployment_test.go
index 62b08b7c5..8d81e2f11 100644
--- a/test/certification/certifiy_deployment_test.go
+++ b/test/certification/certifiy_deployment_test.go
@@ -84,6 +84,40 @@ func TestCertifyScaling(t *testing.T) {
 	g.Expect(err).NotTo(HaveOccurred())
 }
 
+// Test the scenario where we create a Coherence cluster without a replicas field, which will default to three Pods.
+// Then scale up the cluster to four.
+// Then apply an update using the same Coherence resource with no replicas field.
+// After the update is applied, the cluster should still have four Pods and not revert to three.
+func TestCertifyScalingWithUpdate(t *testing.T) {
+	// Ensure that everything is cleaned up after the test!
+	testContext.CleanupAfterTest(t)
+	g := NewGomegaWithT(t)
+
+	ns := helper.GetTestClusterNamespace()
+
+	// the name of the cluster from scale-with-update-one.yaml and scale-with-update-two.yaml
+	name := "certify-scale-update"
+
+	// Start with the default three replicas
+	err := apply(t, ns, "scale-with-update-one.yaml")
+	g.Expect(err).NotTo(HaveOccurred())
+	_, err = helper.WaitForPodsWithLabel(testContext, ns, "one=testOne", 3, time.Second*10, time.Minute*10)
+	g.Expect(err).NotTo(HaveOccurred())
+
+	// Scale up to four
+	err = scale(t, ns, name, 4)
+	g.Expect(err).NotTo(HaveOccurred())
+	_, err = helper.WaitForStatefulSet(testContext, ns, name, 4, time.Second*10, time.Minute*5)
+	g.Expect(err).NotTo(HaveOccurred())
+
+	// apply the update
+	err = apply(t, ns, "scale-with-update-two.yaml")
+	g.Expect(err).NotTo(HaveOccurred())
+	// There should eventually be four Pods with the additional label
+	_, err = helper.WaitForPodsWithLabel(testContext, ns, "two=testTwo", 4, time.Second*10, time.Minute*10)
+	g.Expect(err).NotTo(HaveOccurred())
+}
+
 func scale(t *testing.T, namespace, name string, replicas int32) error {
 	cmd := exec.Command("kubectl", "-n", namespace, "scale", fmt.Sprintf("--replicas=%d", replicas), "coherence/"+name)
 	cmd.Stdout = os.Stdout
@@ -91,3 +125,15 @@ func scale(t *testing.T, namespace, name string, replicas int32) error {
 	t.Log("Executing Scale Command: " + strings.Join(cmd.Args, " "))
 	return cmd.Run()
 }
+
+func apply(t *testing.T, namespace, fileName string) error {
+	actualFile, err := helper.FindActualFile(fileName)
+	if err != nil {
+		return err
+	}
+	cmd := exec.Command("kubectl", "-n", namespace, "apply", "-f", actualFile)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	t.Log("Executing Kubectl Command: " + strings.Join(cmd.Args, " "))
+	return cmd.Run()
+}
diff --git a/test/certification/scale-with-update-one.yaml b/test/certification/scale-with-update-one.yaml
new file mode 100644
index 000000000..e6f0b8c28
--- /dev/null
+++ b/test/certification/scale-with-update-one.yaml
@@ -0,0 +1,10 @@
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: certify-scale-update
+spec:
+  labels:
+    one: testOne
+  readinessProbe:
+    initialDelaySeconds: 10
+    periodSeconds: 10
\ No newline at end of file
diff --git a/test/certification/scale-with-update-two.yaml b/test/certification/scale-with-update-two.yaml
new file mode 100644
index 000000000..3c9d3c11f
--- /dev/null
+++ b/test/certification/scale-with-update-two.yaml
@@ -0,0 +1,8 @@
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: certify-scale-update
+spec:
+  labels:
+    one: testOne
+    two: testTwo
diff --git a/test/e2e/remote/scaling_test.go b/test/e2e/remote/scaling_test.go
index 5353d697e..d03b50b25 100644
--- a/test/e2e/remote/scaling_test.go
+++ b/test/e2e/remote/scaling_test.go
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2024, Oracle and/or its affiliates.
+ * Copyright (c) 2020, 2025, Oracle and/or its affiliates.
  * Licensed under the Universal Permissive License v 1.0 as shown at
  * http://oss.oracle.com/licenses/upl.
  */
@@ -9,22 +9,24 @@ package remote
 
 import (
 	goctx "context"
 	"fmt"
+	"io"
+	"strings"
+	"testing"
+	"time"
+
 	cohv1 "github.com/oracle/coherence-operator/api/v1"
 	"github.com/oracle/coherence-operator/test/e2e/helper"
 	"golang.org/x/net/context"
-	"io"
 	appsv1 "k8s.io/api/apps/v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/utils/ptr"
 	"sigs.k8s.io/testing_frameworks/integration"
-	"strings"
-	"testing"
-	"time"
 
"github.com/onsi/gomega" ) // Test scaling up and down with different policies. -// This test is an example of using sub-tests to run the test with different test cases. +// This test is an example of using subtests to run the test with different test cases. func TestScaling(t *testing.T) { // Ensure that everything is cleaned up after the test! testContext.CleanupAfterTest(t) @@ -91,7 +93,7 @@ func TestScaleDownToZeroWithSuspendFalse(t *testing.T) { } // If a deployment is scaled down to zero it should be deleted and just its parent Coherence resource should remain. -// This test scales down using the "kubectl scale --relicas=0" command +// This test scales down using the "kubectl scale --replicas=0" command func TestScaleDownToZeroUsingKubectl(t *testing.T) { // Ensure that everything is cleaned up after the test! testContext.CleanupAfterTest(t) @@ -138,7 +140,7 @@ var kubeCtlScaler = func(t *testing.T, d *cohv1.Coherence, replicas int32) error } // Assert that a deployment can be created and scaled using the specified policy. -func assertScale(t *testing.T, id string, policy cohv1.ScalingPolicy, replicasStart, replicasScale int32, scaler ScaleFunction) { +func assertScale(t *testing.T, id string, policy cohv1.ScalingPolicy, replicasStart, replicasScale int32, scaler ScaleFunction) types.NamespacedName { g := NewGomegaWithT(t) testContext.CleanupAfterTest(t) @@ -153,16 +155,24 @@ func assertScale(t *testing.T, id string, policy cohv1.ScalingPolicy, replicasSt // Give the deployment a unique name based on the test name deployment.SetName(fmt.Sprintf("%s-%s", deployment.GetName(), strings.ToLower(id))) - // update the replica count and scaling policy - deployment.SetReplicas(replicasStart) + // update the replica count if greater than or equal zero, otherwise do not set the replica count field + var initialReplicas int32 + if replicasStart >= 0 { + deployment.SetReplicas(replicasStart) + initialReplicas = replicasStart + } else { + deployment.Spec.Replicas = nil + initialReplicas = cohv1.DefaultReplicas + } + // update the scaling policy if deployment.Spec.Scaling == nil { deployment.Spec.Scaling = &cohv1.ScalingSpec{} } deployment.Spec.Scaling.Policy = &policy // Do the canary test unless parallel scaling down - doCanary := replicasStart < replicasScale || policy != cohv1.ParallelScaling + doCanary := initialReplicas < replicasScale || policy != cohv1.ParallelScaling t.Logf("assertScale() - doCanary=%t", doCanary) t.Log("assertScale() - Installing Coherence deployment...") @@ -186,6 +196,8 @@ func assertScale(t *testing.T, id string, policy cohv1.ScalingPolicy, replicasSt err = helper.CheckCanary(testContext, namespace, deployment.Name) g.Expect(err).NotTo(HaveOccurred()) } + + return types.NamespacedName{Namespace: deployment.Namespace, Name: deployment.Name} } func assertScaleDownToZero(t *testing.T, id string, scaler ScaleFunction, suspend *bool) {