diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go
index e41827987..2686fa95f 100644
--- a/test/e2e/metrics_test.go
+++ b/test/e2e/metrics_test.go
@@ -19,6 +19,7 @@ import (
 	"fmt"
 	"io"
 	"os/exec"
+	"strings"
 	"testing"
 	"time"
 
@@ -33,16 +34,17 @@ func TestOperatorControllerMetricsExportedEndpoint(t *testing.T) {
 	client := utils.FindK8sClient(t)
 	curlNamespace := createRandomNamespace(t, client)
 	componentNamespace := getComponentNamespace(t, client, "control-plane=operator-controller-controller-manager")
-	metricsURL := fmt.Sprintf("https://operator-controller-service.%s.svc.cluster.local:8443/metrics", componentNamespace)
 
 	config := NewMetricsTestConfig(
 		client, curlNamespace,
+		componentNamespace,
 		"operator-controller-metrics-reader",
 		"operator-controller-metrics-binding",
 		"operator-controller-metrics-reader",
 		"oper-curl-metrics",
-		metricsURL,
+		"app.kubernetes.io/name=operator-controller",
+		operatorControllerMetricsPort,
 	)
 
 	config.run(t)
@@ -53,16 +55,17 @@ func TestCatalogdMetricsExportedEndpoint(t *testing.T) {
 	client := utils.FindK8sClient(t)
 	curlNamespace := createRandomNamespace(t, client)
 	componentNamespace := getComponentNamespace(t, client, "control-plane=catalogd-controller-manager")
-	metricsURL := fmt.Sprintf("https://catalogd-service.%s.svc.cluster.local:7443/metrics", componentNamespace)
 
 	config := NewMetricsTestConfig(
 		client, curlNamespace,
+		componentNamespace,
 		"catalogd-metrics-reader",
 		"catalogd-metrics-binding",
 		"catalogd-metrics-reader",
 		"catalogd-curl-metrics",
-		metricsURL,
+		"app.kubernetes.io/name=catalogd",
+		catalogdMetricsPort,
 	)
 
 	config.run(t)
@@ -70,25 +73,29 @@ func TestCatalogdMetricsExportedEndpoint(t *testing.T) {
 
 // MetricsTestConfig holds the necessary configurations for testing metrics endpoints.
 type MetricsTestConfig struct {
-	client         string
-	namespace      string
-	clusterRole    string
-	clusterBinding string
-	serviceAccount string
-	curlPodName    string
-	metricsURL     string
+	client             string
+	namespace          string
+	componentNamespace string
+	clusterRole        string
+	clusterBinding     string
+	serviceAccount     string
+	curlPodName        string
+	componentSelector  string
+	metricsPort        int
 }
 
 // NewMetricsTestConfig initializes a new MetricsTestConfig.
-func NewMetricsTestConfig(client, namespace, clusterRole, clusterBinding, serviceAccount, curlPodName, metricsURL string) *MetricsTestConfig {
+func NewMetricsTestConfig(client, namespace, componentNamespace, clusterRole, clusterBinding, serviceAccount, curlPodName, componentSelector string, metricsPort int) *MetricsTestConfig {
 	return &MetricsTestConfig{
-		client:         client,
-		namespace:      namespace,
-		clusterRole:    clusterRole,
-		clusterBinding: clusterBinding,
-		serviceAccount: serviceAccount,
-		curlPodName:    curlPodName,
-		metricsURL:     metricsURL,
+		client:             client,
+		namespace:          namespace,
+		componentNamespace: componentNamespace,
+		clusterRole:        clusterRole,
+		clusterBinding:     clusterBinding,
+		serviceAccount:     serviceAccount,
+		curlPodName:        curlPodName,
+		componentSelector:  componentSelector,
+		metricsPort:        metricsPort,
 	}
 }
 
@@ -154,19 +161,33 @@ func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) {
 	require.NoError(t, err, "Error creating curl pod: %s", string(output))
 }
 
-// validate verifies if is possible to access the metrics
+// validate verifies that it is possible to access the metrics from all pods
 func (c *MetricsTestConfig) validate(t *testing.T, token string) {
 	t.Log("Waiting for the curl pod to be ready")
 	waitCmd := exec.Command(c.client, "wait", "--for=condition=Ready", "pod", c.curlPodName, "--namespace", c.namespace, "--timeout=60s")
 	waitOutput, waitErr := waitCmd.CombinedOutput()
 	require.NoError(t, waitErr, "Error waiting for curl pod to be ready: %s", string(waitOutput))
 
-	t.Log("Validating the metrics endpoint")
-	curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
-		"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, c.metricsURL)
-	output, err := curlCmd.CombinedOutput()
-	require.NoError(t, err, "Error calling metrics endpoint: %s", string(output))
-	require.Contains(t, string(output), "200 OK", "Metrics endpoint did not return 200 OK")
+	// Get all pod IPs for the component
+	podIPs := c.getComponentPodIPs(t)
+	require.NotEmpty(t, podIPs, "No pod IPs found for component")
+	t.Logf("Found %d pod(s) to scrape metrics from", len(podIPs))
+
+	// Validate metrics endpoint for each pod
+	for i, podIP := range podIPs {
+		// Build metrics URL with pod FQDN: <pod-ip-with-dashes>.<namespace>.pod.cluster.local
+		// Convert IP dots to dashes (e.g., 10.244.0.11 -> 10-244-0-11)
+		podIPDashes := strings.ReplaceAll(podIP, ".", "-")
+		metricsURL := fmt.Sprintf("https://%s.%s.pod.cluster.local:%d/metrics", podIPDashes, c.componentNamespace, c.metricsPort)
+		t.Logf("Validating metrics endpoint for pod %d/%d: %s", i+1, len(podIPs), metricsURL)
+
+		curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
+			"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, metricsURL)
+		output, err := curlCmd.CombinedOutput()
+		require.NoError(t, err, "Error calling metrics endpoint %s: %s", metricsURL, string(output))
+		require.Contains(t, string(output), "200 OK", "Metrics endpoint %s did not return 200 OK", metricsURL)
+		t.Logf("Successfully scraped metrics from pod %d/%d", i+1, len(podIPs))
+	}
 }
 
 // cleanup removes the created resources. Uses a context with timeout to prevent hangs.
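
Note on the change above: the test now scrapes each pod directly through its in-cluster pod DNS name instead of the Service. The following is a minimal, self-contained sketch (not part of the patch) of how a pod IP becomes the `<pod-ip-with-dashes>.<namespace>.pod.cluster.local` metrics URL; `podMetricsURL` and the example namespace/port values are illustrative, not identifiers from the repository.

```go
package main

import (
	"fmt"
	"strings"
)

// podMetricsURL converts a pod IP such as "10.244.0.11" into the in-cluster
// pod DNS name "10-244-0-11.<namespace>.pod.cluster.local" and appends the
// metrics port and path, mirroring the logic added to validate().
func podMetricsURL(podIP, namespace string, port int) string {
	podIPDashes := strings.ReplaceAll(podIP, ".", "-")
	return fmt.Sprintf("https://%s.%s.pod.cluster.local:%d/metrics", podIPDashes, namespace, port)
}

func main() {
	// Example values only; the real test discovers pod IPs via the component
	// selector and takes the port from operatorControllerMetricsPort / catalogdMetricsPort.
	fmt.Println(podMetricsURL("10.244.0.11", "olmv1-system", 8443))
	// Prints: https://10-244-0-11.olmv1-system.pod.cluster.local:8443/metrics
}
```
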
@@ -243,6 +264,29 @@ func getComponentNamespace(t *testing.T, client, selector string) string {
 	return namespace
 }
 
+// getComponentPodIPs returns the IP addresses of all pods matching the component selector
+func (c *MetricsTestConfig) getComponentPodIPs(t *testing.T) []string {
+	cmd := exec.Command(c.client, "get", "pods",
+		"--namespace="+c.componentNamespace,
+		"--selector="+c.componentSelector,
+		"--output=jsonpath={.items[*].status.podIP}")
+	output, err := cmd.CombinedOutput()
+	require.NoError(t, err, "Error getting pod IPs: %s", string(output))
+
+	podIPsStr := string(bytes.TrimSpace(output))
+	if podIPsStr == "" {
+		return []string{}
+	}
+
+	// Split space-separated IPs
+	fields := bytes.Fields([]byte(podIPsStr))
+	ips := make([]string, len(fields))
+	for i, field := range fields {
+		ips[i] = string(field)
+	}
+	return ips
+}
+
 func stdoutAndCombined(cmd *exec.Cmd) ([]byte, []byte, error) {
 	var outOnly, outAndErr bytes.Buffer
 	allWriter := io.MultiWriter(&outOnly, &outAndErr)
diff --git a/test/upgrade-e2e/post_upgrade_test.go b/test/upgrade-e2e/post_upgrade_test.go
index 785d91ea3..05f1e4f12 100644
--- a/test/upgrade-e2e/post_upgrade_test.go
+++ b/test/upgrade-e2e/post_upgrade_test.go
@@ -11,6 +11,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	appsv1 "k8s.io/api/apps/v1"
+	coordinationv1 "k8s.io/api/coordination/v1"
 	corev1 "k8s.io/api/core/v1"
 	apimeta "k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -195,13 +196,14 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
 
 // waitForDeployment checks that the updated deployment with the given app.kubernetes.io/name label
 // has reached the desired number of replicas and that the number pods matches that number
-// i.e. no old pods remain. It will return a pointer to the first pod. This is only necessary
+// i.e. no old pods remain. It will return a pointer to the leader pod. This is only necessary
 // to facilitate the mitigation put in place for https://github.com/operator-framework/operator-controller/issues/1626
 func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) *corev1.Pod {
 	deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector()
 
-	t.Log("Checking that the deployment is updated")
+	t.Log("Checking that the deployment is updated and available")
 	var desiredNumReplicas int32
+	var deploymentNamespace string
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		var managerDeployments appsv1.DeploymentList
 		require.NoError(ct, c.List(ctx, &managerDeployments, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
@@ -214,16 +216,64 @@ func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel stri
 			managerDeployment.Status.AvailableReplicas == *managerDeployment.Spec.Replicas &&
 				managerDeployment.Status.ReadyReplicas == *managerDeployment.Spec.Replicas,
 		)
+
+		// Check that the deployment has the Available condition set to True
+		var availableCond *appsv1.DeploymentCondition
+		for i := range managerDeployment.Status.Conditions {
+			if managerDeployment.Status.Conditions[i].Type == appsv1.DeploymentAvailable {
+				availableCond = &managerDeployment.Status.Conditions[i]
+				break
+			}
+		}
+		require.NotNil(ct, availableCond, "Available condition not found")
+		require.Equal(ct, corev1.ConditionTrue, availableCond.Status, "Deployment Available condition is not True")
+
 		desiredNumReplicas = *managerDeployment.Spec.Replicas
+		deploymentNamespace = managerDeployment.Namespace
 	}, time.Minute, time.Second)
 
 	var managerPods corev1.PodList
 	t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
-		require.Len(ct, managerPods.Items, 1)
+		require.Len(ct, managerPods.Items, int(desiredNumReplicas))
 	}, time.Minute, time.Second)
-	return &managerPods.Items[0]
+
+	// Find the leader pod by checking the lease
+	t.Log("Finding the leader pod")
+	// Map component labels to their leader election lease names
+	leaseNames := map[string]string{
+		"catalogd":            "catalogd-operator-lock",
+		"operator-controller": "9c4404e7.operatorframework.io",
+	}
+
+	leaseName, ok := leaseNames[controlPlaneLabel]
+	if !ok {
+		t.Fatalf("Unknown control plane component: %s", controlPlaneLabel)
+	}
+
+	var leaderPod *corev1.Pod
+	require.EventuallyWithT(t, func(ct *assert.CollectT) {
+		var lease coordinationv1.Lease
+		require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: leaseName, Namespace: deploymentNamespace}, &lease))
+		require.NotNil(ct, lease.Spec.HolderIdentity)
+
+		leaderIdentity := *lease.Spec.HolderIdentity
+		// The lease holder identity format is: <pod-name>_<random-suffix>
+		// Extract just the pod name by splitting on '_'
+		podName := strings.Split(leaderIdentity, "_")[0]
+
+		// Find the pod with matching name
+		for i := range managerPods.Items {
+			if managerPods.Items[i].Name == podName {
+				leaderPod = &managerPods.Items[i]
+				break
+			}
+		}
+		require.NotNil(ct, leaderPod, "leader pod not found with identity: %s (pod name: %s)", leaderIdentity, podName)
+	}, time.Minute, time.Second)
+
+	return leaderPod
 }
 
 func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) {
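
Note on the leader lookup above: it relies on the convention that the leader-election Lease holder identity is written as `<pod-name>_<random-suffix>` (hostname plus a generated id). Below is a minimal sketch (not part of the patch) of that parsing; `leaderPodName` is a hypothetical helper and the example identity string is fabricated, assuming the identity really follows the hostname-underscore-suffix format.

```go
package main

import (
	"fmt"
	"strings"
)

// leaderPodName mirrors the parsing in waitForDeployment: everything before
// the first '_' in the Lease holder identity is the pod name.
func leaderPodName(holderIdentity string) string {
	return strings.Split(holderIdentity, "_")[0]
}

func main() {
	// Fabricated example identity; the real test reads it from the
	// "catalogd-operator-lock" or "9c4404e7.operatorframework.io" Lease.
	identity := "operator-controller-controller-manager-7d9f6c8b5d-x2k9q_5f3c1d2e"
	fmt.Println(leaderPodName(identity))
	// Prints: operator-controller-controller-manager-7d9f6c8b5d-x2k9q
}
```
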