96 changes: 70 additions & 26 deletions test/e2e/metrics_test.go
@@ -19,6 +19,7 @@ import (
"fmt"
"io"
"os/exec"
"strings"
"testing"
"time"

@@ -33,16 +34,17 @@ func TestOperatorControllerMetricsExportedEndpoint(t *testing.T) {
client := utils.FindK8sClient(t)
curlNamespace := createRandomNamespace(t, client)
componentNamespace := getComponentNamespace(t, client, "control-plane=operator-controller-controller-manager")
metricsURL := fmt.Sprintf("https://operator-controller-service.%s.svc.cluster.local:8443/metrics", componentNamespace)

config := NewMetricsTestConfig(
client,
curlNamespace,
componentNamespace,
"operator-controller-metrics-reader",
"operator-controller-metrics-binding",
"operator-controller-metrics-reader",
"oper-curl-metrics",
metricsURL,
"app.kubernetes.io/name=operator-controller",
operatorControllerMetricsPort,
)

config.run(t)
Expand All @@ -53,42 +55,47 @@ func TestCatalogdMetricsExportedEndpoint(t *testing.T) {
client := utils.FindK8sClient(t)
curlNamespace := createRandomNamespace(t, client)
componentNamespace := getComponentNamespace(t, client, "control-plane=catalogd-controller-manager")
metricsURL := fmt.Sprintf("https://catalogd-service.%s.svc.cluster.local:7443/metrics", componentNamespace)

config := NewMetricsTestConfig(
client,
curlNamespace,
componentNamespace,
"catalogd-metrics-reader",
"catalogd-metrics-binding",
"catalogd-metrics-reader",
"catalogd-curl-metrics",
metricsURL,
"app.kubernetes.io/name=catalogd",
catalogdMetricsPort,
)

config.run(t)
}

// MetricsTestConfig holds the necessary configurations for testing metrics endpoints.
type MetricsTestConfig struct {
client string
namespace string
clusterRole string
clusterBinding string
serviceAccount string
curlPodName string
metricsURL string
client string
namespace string
componentNamespace string
clusterRole string
clusterBinding string
serviceAccount string
curlPodName string
componentSelector string
metricsPort int
}

// NewMetricsTestConfig initializes a new MetricsTestConfig.
func NewMetricsTestConfig(client, namespace, clusterRole, clusterBinding, serviceAccount, curlPodName, metricsURL string) *MetricsTestConfig {
func NewMetricsTestConfig(client, namespace, componentNamespace, clusterRole, clusterBinding, serviceAccount, curlPodName, componentSelector string, metricsPort int) *MetricsTestConfig {
return &MetricsTestConfig{
client: client,
namespace: namespace,
clusterRole: clusterRole,
clusterBinding: clusterBinding,
serviceAccount: serviceAccount,
curlPodName: curlPodName,
metricsURL: metricsURL,
client: client,
namespace: namespace,
componentNamespace: componentNamespace,
clusterRole: clusterRole,
clusterBinding: clusterBinding,
serviceAccount: serviceAccount,
curlPodName: curlPodName,
componentSelector: componentSelector,
metricsPort: metricsPort,
}
}

@@ -154,19 +161,33 @@ func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) {
require.NoError(t, err, "Error creating curl pod: %s", string(output))
}

// validate verifies if is possible to access the metrics
// validate verifies it is possible to access the metrics from all pods
func (c *MetricsTestConfig) validate(t *testing.T, token string) {
t.Log("Waiting for the curl pod to be ready")
waitCmd := exec.Command(c.client, "wait", "--for=condition=Ready", "pod", c.curlPodName, "--namespace", c.namespace, "--timeout=60s")
waitOutput, waitErr := waitCmd.CombinedOutput()
require.NoError(t, waitErr, "Error waiting for curl pod to be ready: %s", string(waitOutput))

t.Log("Validating the metrics endpoint")
curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, c.metricsURL)
output, err := curlCmd.CombinedOutput()
require.NoError(t, err, "Error calling metrics endpoint: %s", string(output))
require.Contains(t, string(output), "200 OK", "Metrics endpoint did not return 200 OK")
// Get all pod IPs for the component
podIPs := c.getComponentPodIPs(t)
Contributor: We could send requests to the pods using their FQDNs; no need to find out their IPs.

Contributor Author (@tmshort, Dec 11, 2025): But it appears that the pod FQDN uses dashed IP addresses; the pods don't have a hostname/subdomain set. EDIT: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/

Contributor: Ah, you are absolutely right. nit: you can read all IP addresses for a service (i.e. all pod IPs) by fetching the Endpoints resource (its name is the same as the service's).

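For reference, a minimal sketch of the reviewer's Endpoints suggestion, written in the same exec-the-client style the test already uses. The helper name and the serviceName parameter are illustrative assumptions, not part of this PR:

// getComponentPodIPsFromEndpoints is a hypothetical alternative to listing pods:
// it reads the pod IPs from the Endpoints object that shares the service's name.
func (c *MetricsTestConfig) getComponentPodIPsFromEndpoints(t *testing.T, serviceName string) []string {
    cmd := exec.Command(c.client, "get", "endpoints", serviceName,
        "--namespace="+c.componentNamespace,
        "--output=jsonpath={.subsets[*].addresses[*].ip}")
    output, err := cmd.CombinedOutput()
    require.NoError(t, err, "Error getting endpoint IPs: %s", string(output))
    // jsonpath prints the IPs space-separated; a service with no ready pods yields an empty slice.
    return strings.Fields(string(output))
}
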
require.NotEmpty(t, podIPs, "No pod IPs found for component")
t.Logf("Found %d pod(s) to scrape metrics from", len(podIPs))

// Validate metrics endpoint for each pod
for i, podIP := range podIPs {
// Build metrics URL with pod FQDN: <pod-ip-with-dashes>.<namespace>.pod.cluster.local
// Convert IP dots to dashes (e.g., 10.244.0.11 -> 10-244-0-11)
podIPDashes := strings.ReplaceAll(podIP, ".", "-")
metricsURL := fmt.Sprintf("https://%s.%s.pod.cluster.local:%d/metrics", podIPDashes, c.componentNamespace, c.metricsPort)
t.Logf("Validating metrics endpoint for pod %d/%d: %s", i+1, len(podIPs), metricsURL)

curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, metricsURL)
output, err := curlCmd.CombinedOutput()
require.NoError(t, err, "Error calling metrics endpoint %s: %s", metricsURL, string(output))
require.Contains(t, string(output), "200 OK", "Metrics endpoint %s did not return 200 OK", metricsURL)
t.Logf("Successfully scraped metrics from pod %d/%d", i+1, len(podIPs))
}
}

// cleanup removes the created resources. Uses a context with timeout to prevent hangs.
@@ -243,6 +264,29 @@ func getComponentNamespace(t *testing.T, client, selector string) string {
return namespace
}

// getComponentPodIPs returns the IP addresses of all pods matching the component selector
func (c *MetricsTestConfig) getComponentPodIPs(t *testing.T) []string {
cmd := exec.Command(c.client, "get", "pods",
"--namespace="+c.componentNamespace,
"--selector="+c.componentSelector,
"--output=jsonpath={.items[*].status.podIP}")
output, err := cmd.CombinedOutput()
require.NoError(t, err, "Error getting pod IPs: %s", string(output))

podIPsStr := string(bytes.TrimSpace(output))
if podIPsStr == "" {
return []string{}
}

// Split space-separated IPs
fields := bytes.Fields([]byte(podIPsStr))
ips := make([]string, len(fields))
for i, field := range fields {
ips[i] = string(field)
}
return ips
}

func stdoutAndCombined(cmd *exec.Cmd) ([]byte, []byte, error) {
var outOnly, outAndErr bytes.Buffer
allWriter := io.MultiWriter(&outOnly, &outAndErr)
58 changes: 54 additions & 4 deletions test/upgrade-e2e/post_upgrade_test.go
@@ -11,6 +11,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
coordinationv1 "k8s.io/api/coordination/v1"
corev1 "k8s.io/api/core/v1"
apimeta "k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -195,13 +196,14 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {

// waitForDeployment checks that the updated deployment with the given app.kubernetes.io/name label
// has reached the desired number of replicas and that the number of pods matches that number
// i.e. no old pods remain. It will return a pointer to the first pod. This is only necessary
// i.e. no old pods remain. It will return a pointer to the leader pod. This is only necessary
// to facilitate the mitigation put in place for https://github.com/operator-framework/operator-controller/issues/1626
func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) *corev1.Pod {
deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector()

t.Log("Checking that the deployment is updated")
t.Log("Checking that the deployment is updated and available")
var desiredNumReplicas int32
var deploymentNamespace string
require.EventuallyWithT(t, func(ct *assert.CollectT) {
var managerDeployments appsv1.DeploymentList
require.NoError(ct, c.List(ctx, &managerDeployments, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
@@ -214,16 +216,64 @@
managerDeployment.Status.AvailableReplicas == *managerDeployment.Spec.Replicas &&
managerDeployment.Status.ReadyReplicas == *managerDeployment.Spec.Replicas,
)

// Check that the deployment has the Available condition set to True
var availableCond *appsv1.DeploymentCondition
for i := range managerDeployment.Status.Conditions {
if managerDeployment.Status.Conditions[i].Type == appsv1.DeploymentAvailable {
availableCond = &managerDeployment.Status.Conditions[i]
break
}
}
require.NotNil(ct, availableCond, "Available condition not found")
require.Equal(ct, corev1.ConditionTrue, availableCond.Status, "Deployment Available condition is not True")

desiredNumReplicas = *managerDeployment.Spec.Replicas
deploymentNamespace = managerDeployment.Namespace
}, time.Minute, time.Second)

var managerPods corev1.PodList
t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas)
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
require.Len(ct, managerPods.Items, 1)
require.Len(ct, managerPods.Items, int(desiredNumReplicas))
}, time.Minute, time.Second)
return &managerPods.Items[0]

// Find the leader pod by checking the lease
t.Log("Finding the leader pod")
Contributor: I wonder if we're carrying too much complexity here. It seems we're returning the pod for two reasons:

1. To at some point check the logs for some specific log lines to do with leader election, e.g.

    t.Log("Waiting for acquired leader election")
    leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute)
    defer leaderCancel()
    leaderSubstrings := []string{"successfully acquired lease"}
    leaderElected, err := watchPodLogsForSubstring(leaderCtx, &managerPod, leaderSubstrings...)
    require.NoError(t, err)
    require.True(t, leaderElected)

    t.Log("Reading logs to make sure that ClusterCatalog was reconciled by catalogdv1")
    logCtx, cancel := context.WithTimeout(ctx, time.Minute)
    defer cancel()
    substrings := []string{
        "reconcile ending",
        fmt.Sprintf(`ClusterCatalog=%q`, testClusterCatalogName),
    }
    found, err := watchPodLogsForSubstring(logCtx, &managerPod, substrings...)
    require.NoError(t, err)
    require.True(t, found)

2. Because there's a possible bug in catalogd where, after a pod restart, the ClusterCatalog still reports that it is serving the catalog. This causes some flakes by making the e2e test progress quickly to the Eventually waiting for installation, which times out because it's also implicitly waiting for the catalog to unpack.

I wonder if we could refactor this helper to wait for the Available condition to be True and .status.replicas == .status.updatedReplicas, or something like that, and drop the return value.

I question whether we need 1 at all, and we could try to tackle 2 in a different way, e.g. by hitting the catalog service endpoint until it doesn't return a 404. wdyt?

Contributor Author: I wonder if changing the test that much is in scope here. This is preserving the test as-is, but making sure the returned pod is correct.

Contributor Author: Re the suggestion to wait for the Available condition and drop the return value: we are already checking updatedReplicas, replicas, availableReplicas and readyReplicas:

    require.True(ct,
        managerDeployment.Status.UpdatedReplicas == *managerDeployment.Spec.Replicas &&
            managerDeployment.Status.Replicas == *managerDeployment.Spec.Replicas &&
            managerDeployment.Status.AvailableReplicas == *managerDeployment.Spec.Replicas &&
            managerDeployment.Status.ReadyReplicas == *managerDeployment.Spec.Replicas,
    )
    desiredNumReplicas = *managerDeployment.Spec.Replicas

Do we need to do more than that? Which Available condition are you referring to? Deployment? ClusterCatalog?

Contributor: On the Deployment.

Contributor (@perdasilva, Dec 11, 2025): Re "I wonder if changing the test that much is in scope here": that's fair. I just don't know that we want to carry so much complexity, and it might be worth questioning the need for the return value and .zip things. I think it's ok to move it over to another PR (or maybe the cucumber tests might already cover this). But I thought I'd call it out.

Contributor Author: The cucumber stuff will likely rewrite some of that, so I'd say defer it to that, but I can add a quick check for the deployment Available status, since we're already checking things on the deployment.

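As an aside, a minimal sketch of the reviewer's alternative for point 2 above: polling the catalogd content endpoint until it stops returning 404. The helper name, base URL, and content path are assumptions for illustration, not part of this PR:

// waitForCatalogServed is a hypothetical helper that polls a catalogd content URL
// until it no longer returns 404, i.e. the catalog is actually being served again
// after a pod restart.
// Assumed extra imports beyond this file's: crypto/tls, net/http.
func waitForCatalogServed(t *testing.T, ctx context.Context, baseURL, catalogName string) {
    // The e2e environment uses a self-signed certificate, so skip TLS verification here.
    httpClient := &http.Client{
        Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}},
    }
    // Assumed catalogd content path; adjust to the actual serving endpoint.
    url := fmt.Sprintf("%s/catalogs/%s/api/v1/all", baseURL, catalogName)
    require.EventuallyWithT(t, func(ct *assert.CollectT) {
        req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
        require.NoError(ct, err)
        resp, err := httpClient.Do(req)
        require.NoError(ct, err)
        defer resp.Body.Close()
        require.NotEqual(ct, http.StatusNotFound, resp.StatusCode, "catalog not served yet")
    }, time.Minute, time.Second)
}
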
// Map component labels to their leader election lease names
leaseNames := map[string]string{
"catalogd": "catalogd-operator-lock",
"operator-controller": "9c4404e7.operatorframework.io",
}

leaseName, ok := leaseNames[controlPlaneLabel]
if !ok {
t.Fatalf("Unknown control plane component: %s", controlPlaneLabel)
}

var leaderPod *corev1.Pod
require.EventuallyWithT(t, func(ct *assert.CollectT) {
var lease coordinationv1.Lease
require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: leaseName, Namespace: deploymentNamespace}, &lease))
require.NotNil(ct, lease.Spec.HolderIdentity)

leaderIdentity := *lease.Spec.HolderIdentity
// The lease holder identity format is: <pod-name>_<leader-election-id-suffix>
// Extract just the pod name by splitting on '_'
podName := strings.Split(leaderIdentity, "_")[0]

// Find the pod with matching name
for i := range managerPods.Items {
if managerPods.Items[i].Name == podName {
leaderPod = &managerPods.Items[i]
break
}
}
require.NotNil(ct, leaderPod, "leader pod not found with identity: %s (pod name: %s)", leaderIdentity, podName)
}, time.Minute, time.Second)

return leaderPod
}

func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) {