From 5adadc11b1586dcb0abbb8e7a4b0ca241d5c0a67 Mon Sep 17 00:00:00 2001
From: Todd Short
Date: Wed, 10 Dec 2025 16:03:21 -0500
Subject: [PATCH 1/2] Don't assume there is only one replica in e2e tests

For the upgrade e2e tests, don't assume there is only one replica. Get
the number of replicas from the deployment and wait for the deployment
to have that many available. Use the lease to determine the leader pod
and reference that.

Note that the name format of leases for operator-controller and
catalogd are quite different; this doesn't change that, as it may have
an impact on the upgrade test itself.

Signed-off-by: Todd Short
Assisted-by: Claude Code
---
 test/upgrade-e2e/post_upgrade_test.go | 58 +++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/test/upgrade-e2e/post_upgrade_test.go b/test/upgrade-e2e/post_upgrade_test.go
index 785d91ea3..05f1e4f12 100644
--- a/test/upgrade-e2e/post_upgrade_test.go
+++ b/test/upgrade-e2e/post_upgrade_test.go
@@ -11,6 +11,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	appsv1 "k8s.io/api/apps/v1"
+	coordinationv1 "k8s.io/api/coordination/v1"
 	corev1 "k8s.io/api/core/v1"
 	apimeta "k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -195,13 +196,14 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
 
 // waitForDeployment checks that the updated deployment with the given app.kubernetes.io/name label
 // has reached the desired number of replicas and that the number pods matches that number
-// i.e. no old pods remain. It will return a pointer to the first pod. This is only necessary
+// i.e. no old pods remain. It will return a pointer to the leader pod. This is only necessary
 // to facilitate the mitigation put in place for https://github.com/operator-framework/operator-controller/issues/1626
 func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) *corev1.Pod {
 	deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector()
 
-	t.Log("Checking that the deployment is updated")
+	t.Log("Checking that the deployment is updated and available")
 	var desiredNumReplicas int32
+	var deploymentNamespace string
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		var managerDeployments appsv1.DeploymentList
 		require.NoError(ct, c.List(ctx, &managerDeployments, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
@@ -214,16 +216,64 @@ func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel stri
 				managerDeployment.Status.AvailableReplicas == *managerDeployment.Spec.Replicas &&
 				managerDeployment.Status.ReadyReplicas == *managerDeployment.Spec.Replicas,
 		)
+
+		// Check that the deployment has the Available condition set to True
+		var availableCond *appsv1.DeploymentCondition
+		for i := range managerDeployment.Status.Conditions {
+			if managerDeployment.Status.Conditions[i].Type == appsv1.DeploymentAvailable {
+				availableCond = &managerDeployment.Status.Conditions[i]
+				break
+			}
+		}
+		require.NotNil(ct, availableCond, "Available condition not found")
+		require.Equal(ct, corev1.ConditionTrue, availableCond.Status, "Deployment Available condition is not True")
+
 		desiredNumReplicas = *managerDeployment.Spec.Replicas
+		deploymentNamespace = managerDeployment.Namespace
 	}, time.Minute, time.Second)
 
 	var managerPods corev1.PodList
 	t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
-		require.Len(ct, managerPods.Items, 1)
+		require.Len(ct, managerPods.Items, int(desiredNumReplicas))
 	}, time.Minute, time.Second)
-	return &managerPods.Items[0]
+
+	// Find the leader pod by checking the lease
+	t.Log("Finding the leader pod")
+	// Map component labels to their leader election lease names
+	leaseNames := map[string]string{
+		"catalogd":            "catalogd-operator-lock",
+		"operator-controller": "9c4404e7.operatorframework.io",
+	}
+
+	leaseName, ok := leaseNames[controlPlaneLabel]
+	if !ok {
+		t.Fatalf("Unknown control plane component: %s", controlPlaneLabel)
+	}
+
+	var leaderPod *corev1.Pod
+	require.EventuallyWithT(t, func(ct *assert.CollectT) {
+		var lease coordinationv1.Lease
+		require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: leaseName, Namespace: deploymentNamespace}, &lease))
+		require.NotNil(ct, lease.Spec.HolderIdentity)
+
+		leaderIdentity := *lease.Spec.HolderIdentity
+		// The lease holder identity format is: <pod-name>_<random-suffix>
+		// Extract just the pod name by splitting on '_'
+		podName := strings.Split(leaderIdentity, "_")[0]
+
+		// Find the pod with matching name
+		for i := range managerPods.Items {
+			if managerPods.Items[i].Name == podName {
+				leaderPod = &managerPods.Items[i]
+				break
+			}
+		}
+		require.NotNil(ct, leaderPod, "leader pod not found with identity: %s (pod name: %s)", leaderIdentity, podName)
+	}, time.Minute, time.Second)
+
+	return leaderPod
 }
 
 func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) {

From 3823eb579c7b625a72d3dcc8cbe8252d00b57e23 Mon Sep 17 00:00:00 2001
From: Todd Short
Date: Wed, 10 Dec 2025 17:13:55 -0500
Subject: [PATCH 2/2] Update metrics test to scrape all pods

Signed-off-by: Todd Short
---
 test/e2e/metrics_test.go | 96 +++++++++++++++++++++++++++++-----------
 1 file changed, 70 insertions(+), 26 deletions(-)

diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go
index e41827987..2686fa95f 100644
--- a/test/e2e/metrics_test.go
+++ b/test/e2e/metrics_test.go
@@ -19,6 +19,7 @@ import (
 	"fmt"
 	"io"
 	"os/exec"
+	"strings"
 	"testing"
 	"time"
 
@@ -33,16 +34,17 @@ func TestOperatorControllerMetricsExportedEndpoint(t *testing.T) {
 	client := utils.FindK8sClient(t)
 	curlNamespace := createRandomNamespace(t, client)
 	componentNamespace := getComponentNamespace(t, client, "control-plane=operator-controller-controller-manager")
-	metricsURL := fmt.Sprintf("https://operator-controller-service.%s.svc.cluster.local:8443/metrics", componentNamespace)
 
 	config := NewMetricsTestConfig(
 		client,
 		curlNamespace,
+		componentNamespace,
 		"operator-controller-metrics-reader",
 		"operator-controller-metrics-binding",
 		"operator-controller-metrics-reader",
 		"oper-curl-metrics",
-		metricsURL,
+		"app.kubernetes.io/name=operator-controller",
+		operatorControllerMetricsPort,
 	)
 
 	config.run(t)
@@ -53,16 +55,17 @@ func TestCatalogdMetricsExportedEndpoint(t *testing.T) {
 	client := utils.FindK8sClient(t)
 	curlNamespace := createRandomNamespace(t, client)
 	componentNamespace := getComponentNamespace(t, client, "control-plane=catalogd-controller-manager")
-	metricsURL := fmt.Sprintf("https://catalogd-service.%s.svc.cluster.local:7443/metrics", componentNamespace)
 
 	config := NewMetricsTestConfig(
 		client,
 		curlNamespace,
+		componentNamespace,
 		"catalogd-metrics-reader",
 		"catalogd-metrics-binding",
 		"catalogd-metrics-reader",
 		"catalogd-curl-metrics",
-		metricsURL,
+		"app.kubernetes.io/name=catalogd",
+		catalogdMetricsPort,
 	)
 
 	config.run(t)
@@ -70,25 +73,29 @@ func TestCatalogdMetricsExportedEndpoint(t *testing.T) {
 
 // MetricsTestConfig holds the necessary configurations for testing metrics endpoints.
 type MetricsTestConfig struct {
-	client         string
-	namespace      string
-	clusterRole    string
-	clusterBinding string
-	serviceAccount string
-	curlPodName    string
-	metricsURL     string
+	client             string
+	namespace          string
+	componentNamespace string
+	clusterRole        string
+	clusterBinding     string
+	serviceAccount     string
+	curlPodName        string
+	componentSelector  string
+	metricsPort        int
 }
 
 // NewMetricsTestConfig initializes a new MetricsTestConfig.
-func NewMetricsTestConfig(client, namespace, clusterRole, clusterBinding, serviceAccount, curlPodName, metricsURL string) *MetricsTestConfig {
+func NewMetricsTestConfig(client, namespace, componentNamespace, clusterRole, clusterBinding, serviceAccount, curlPodName, componentSelector string, metricsPort int) *MetricsTestConfig {
 	return &MetricsTestConfig{
-		client:         client,
-		namespace:      namespace,
-		clusterRole:    clusterRole,
-		clusterBinding: clusterBinding,
-		serviceAccount: serviceAccount,
-		curlPodName:    curlPodName,
-		metricsURL:     metricsURL,
+		client:             client,
+		namespace:          namespace,
+		componentNamespace: componentNamespace,
+		clusterRole:        clusterRole,
+		clusterBinding:     clusterBinding,
+		serviceAccount:     serviceAccount,
+		curlPodName:        curlPodName,
+		componentSelector:  componentSelector,
+		metricsPort:        metricsPort,
 	}
 }
 
@@ -154,19 +161,33 @@ func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) {
 	require.NoError(t, err, "Error creating curl pod: %s", string(output))
 }
 
-// validate verifies if is possible to access the metrics
+// validate verifies that it is possible to access the metrics from all pods
 func (c *MetricsTestConfig) validate(t *testing.T, token string) {
 	t.Log("Waiting for the curl pod to be ready")
 	waitCmd := exec.Command(c.client, "wait", "--for=condition=Ready", "pod", c.curlPodName, "--namespace", c.namespace, "--timeout=60s")
 	waitOutput, waitErr := waitCmd.CombinedOutput()
 	require.NoError(t, waitErr, "Error waiting for curl pod to be ready: %s", string(waitOutput))
 
-	t.Log("Validating the metrics endpoint")
-	curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
-		"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, c.metricsURL)
-	output, err := curlCmd.CombinedOutput()
-	require.NoError(t, err, "Error calling metrics endpoint: %s", string(output))
-	require.Contains(t, string(output), "200 OK", "Metrics endpoint did not return 200 OK")
+	// Get all pod IPs for the component
+	podIPs := c.getComponentPodIPs(t)
+	require.NotEmpty(t, podIPs, "No pod IPs found for component")
+	t.Logf("Found %d pod(s) to scrape metrics from", len(podIPs))
+
+	// Validate metrics endpoint for each pod
+	for i, podIP := range podIPs {
+		// Build metrics URL with pod FQDN: <pod-ip-with-dashes>.<namespace>.pod.cluster.local
+		// Convert IP dots to dashes (e.g., 10.244.0.11 -> 10-244-0-11)
+		podIPDashes := strings.ReplaceAll(podIP, ".", "-")
+		metricsURL := fmt.Sprintf("https://%s.%s.pod.cluster.local:%d/metrics", podIPDashes, c.componentNamespace, c.metricsPort)
+		t.Logf("Validating metrics endpoint for pod %d/%d: %s", i+1, len(podIPs), metricsURL)
+
+		curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
+			"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, metricsURL)
+		output, err := curlCmd.CombinedOutput()
+		require.NoError(t, err, "Error calling metrics endpoint %s: %s", metricsURL, string(output))
+		require.Contains(t, string(output), "200 OK", "Metrics endpoint %s did not return 200 OK", metricsURL)
+		t.Logf("Successfully scraped metrics from pod %d/%d", i+1, len(podIPs))
+	}
 }
 
 // cleanup removes the created resources. Uses a context with timeout to prevent hangs.
@@ -243,6 +264,29 @@ func getComponentNamespace(t *testing.T, client, selector string) string {
 	return namespace
 }
 
+// getComponentPodIPs returns the IP addresses of all pods matching the component selector
+func (c *MetricsTestConfig) getComponentPodIPs(t *testing.T) []string {
+	cmd := exec.Command(c.client, "get", "pods",
+		"--namespace="+c.componentNamespace,
+		"--selector="+c.componentSelector,
+		"--output=jsonpath={.items[*].status.podIP}")
+	output, err := cmd.CombinedOutput()
+	require.NoError(t, err, "Error getting pod IPs: %s", string(output))
+
+	podIPsStr := string(bytes.TrimSpace(output))
+	if podIPsStr == "" {
+		return []string{}
+	}
+
+	// Split space-separated IPs
+	fields := bytes.Fields([]byte(podIPsStr))
+	ips := make([]string, len(fields))
+	for i, field := range fields {
+		ips[i] = string(field)
+	}
+	return ips
+}
+
 func stdoutAndCombined(cmd *exec.Cmd) ([]byte, []byte, error) {
 	var outOnly, outAndErr bytes.Buffer
 	allWriter := io.MultiWriter(&outOnly, &outAndErr)
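Reviewer note, not part of the patches above: the new validate() relies on the Kubernetes pod DNS convention, where a pod is reachable at <pod-ip-with-dashes>.<namespace>.pod.cluster.local. A minimal standalone sketch of that conversion follows; the namespace "olmv1-system" and pod IP are illustrative values only, and 8443 is the port taken from the removed operator-controller metricsURL line.

package main

import (
	"fmt"
	"strings"
)

// podMetricsURL mirrors the conversion used in validate(): dots in the pod IP
// become dashes, yielding the pod DNS name
// <pod-ip-with-dashes>.<namespace>.pod.cluster.local, then the metrics path.
func podMetricsURL(podIP, namespace string, port int) string {
	podIPDashes := strings.ReplaceAll(podIP, ".", "-")
	return fmt.Sprintf("https://%s.%s.pod.cluster.local:%d/metrics", podIPDashes, namespace, port)
}

func main() {
	// Example: 10.244.0.11 in namespace "olmv1-system" on port 8443 yields
	// https://10-244-0-11.olmv1-system.pod.cluster.local:8443/metrics
	fmt.Println(podMetricsURL("10.244.0.11", "olmv1-system", 8443))
}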