96 changes: 70 additions & 26 deletions test/e2e/metrics_test.go
@@ -19,6 +19,7 @@ import (
"fmt"
"io"
"os/exec"
"strings"
"testing"
"time"

@@ -33,16 +34,17 @@ func TestOperatorControllerMetricsExportedEndpoint(t *testing.T) {
client := utils.FindK8sClient(t)
curlNamespace := createRandomNamespace(t, client)
componentNamespace := getComponentNamespace(t, client, "control-plane=operator-controller-controller-manager")
metricsURL := fmt.Sprintf("https://operator-controller-service.%s.svc.cluster.local:8443/metrics", componentNamespace)

config := NewMetricsTestConfig(
client,
curlNamespace,
componentNamespace,
"operator-controller-metrics-reader",
"operator-controller-metrics-binding",
"operator-controller-metrics-reader",
"oper-curl-metrics",
metricsURL,
"app.kubernetes.io/name=operator-controller",
operatorControllerMetricsPort,
)

config.run(t)
Expand All @@ -53,42 +55,47 @@ func TestCatalogdMetricsExportedEndpoint(t *testing.T) {
client := utils.FindK8sClient(t)
curlNamespace := createRandomNamespace(t, client)
componentNamespace := getComponentNamespace(t, client, "control-plane=catalogd-controller-manager")
metricsURL := fmt.Sprintf("https://catalogd-service.%s.svc.cluster.local:7443/metrics", componentNamespace)

config := NewMetricsTestConfig(
client,
curlNamespace,
componentNamespace,
"catalogd-metrics-reader",
"catalogd-metrics-binding",
"catalogd-metrics-reader",
"catalogd-curl-metrics",
metricsURL,
"app.kubernetes.io/name=catalogd",
catalogdMetricsPort,
)

config.run(t)
}

// MetricsTestConfig holds the necessary configurations for testing metrics endpoints.
type MetricsTestConfig struct {
client string
namespace string
clusterRole string
clusterBinding string
serviceAccount string
curlPodName string
metricsURL string
client string
namespace string
componentNamespace string
clusterRole string
clusterBinding string
serviceAccount string
curlPodName string
componentSelector string
metricsPort int
}

// NewMetricsTestConfig initializes a new MetricsTestConfig.
func NewMetricsTestConfig(client, namespace, clusterRole, clusterBinding, serviceAccount, curlPodName, metricsURL string) *MetricsTestConfig {
func NewMetricsTestConfig(client, namespace, componentNamespace, clusterRole, clusterBinding, serviceAccount, curlPodName, componentSelector string, metricsPort int) *MetricsTestConfig {
return &MetricsTestConfig{
client: client,
namespace: namespace,
clusterRole: clusterRole,
clusterBinding: clusterBinding,
serviceAccount: serviceAccount,
curlPodName: curlPodName,
metricsURL: metricsURL,
client: client,
namespace: namespace,
componentNamespace: componentNamespace,
clusterRole: clusterRole,
clusterBinding: clusterBinding,
serviceAccount: serviceAccount,
curlPodName: curlPodName,
componentSelector: componentSelector,
metricsPort: metricsPort,
}
}

@@ -154,19 +161,33 @@ func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) {
require.NoError(t, err, "Error creating curl pod: %s", string(output))
}

// validate verifies if is possible to access the metrics
// validate verifies it is possible to access the metrics from all pods
func (c *MetricsTestConfig) validate(t *testing.T, token string) {
t.Log("Waiting for the curl pod to be ready")
waitCmd := exec.Command(c.client, "wait", "--for=condition=Ready", "pod", c.curlPodName, "--namespace", c.namespace, "--timeout=60s")
waitOutput, waitErr := waitCmd.CombinedOutput()
require.NoError(t, waitErr, "Error waiting for curl pod to be ready: %s", string(waitOutput))

t.Log("Validating the metrics endpoint")
curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, c.metricsURL)
output, err := curlCmd.CombinedOutput()
require.NoError(t, err, "Error calling metrics endpoint: %s", string(output))
require.Contains(t, string(output), "200 OK", "Metrics endpoint did not return 200 OK")
// Get all pod IPs for the component
podIPs := c.getComponentPodIPs(t)
Contributor: We could send requests to the pods using their FQDNs; no need to find out their IPs.

Contributor Author (@tmshort, Dec 11, 2025): But it appears that the pod FQDN uses dashed IP addresses; the pods don't have a hostname/subdomain set. EDIT: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/

Contributor: Ah, you are absolutely right. nit: you can read all IP addresses for a service (i.e. all pod IPs) by fetching the Endpoints resource (its name is the same as the service's).

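For reference, a minimal sketch of the reviewer's Endpoints suggestion, written in the same exec-the-client style the test already uses. The helper name and the serviceName parameter are illustrative assumptions, not part of this PR:

// getComponentPodIPsFromEndpoints is a hypothetical alternative to listing pods:
// it reads the pod IPs from the Endpoints object that shares the service's name.
func (c *MetricsTestConfig) getComponentPodIPsFromEndpoints(t *testing.T, serviceName string) []string {
    cmd := exec.Command(c.client, "get", "endpoints", serviceName,
        "--namespace="+c.componentNamespace,
        "--output=jsonpath={.subsets[*].addresses[*].ip}")
    output, err := cmd.CombinedOutput()
    require.NoError(t, err, "Error getting endpoint IPs: %s", string(output))
    // jsonpath prints the IPs space-separated; a service with no ready pods yields an empty slice.
    return strings.Fields(string(output))
}
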
require.NotEmpty(t, podIPs, "No pod IPs found for component")
t.Logf("Found %d pod(s) to scrape metrics from", len(podIPs))

// Validate metrics endpoint for each pod
for i, podIP := range podIPs {
// Build metrics URL with pod FQDN: <pod-ip-with-dashes>.<namespace>.pod.cluster.local
// Convert IP dots to dashes (e.g., 10.244.0.11 -> 10-244-0-11)
podIPDashes := strings.ReplaceAll(podIP, ".", "-")
metricsURL := fmt.Sprintf("https://%s.%s.pod.cluster.local:%d/metrics", podIPDashes, c.componentNamespace, c.metricsPort)
t.Logf("Validating metrics endpoint for pod %d/%d: %s", i+1, len(podIPs), metricsURL)

curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, metricsURL)
output, err := curlCmd.CombinedOutput()
require.NoError(t, err, "Error calling metrics endpoint %s: %s", metricsURL, string(output))
require.Contains(t, string(output), "200 OK", "Metrics endpoint %s did not return 200 OK", metricsURL)
t.Logf("Successfully scraped metrics from pod %d/%d", i+1, len(podIPs))
}
}

// cleanup removes the created resources. Uses a context with timeout to prevent hangs.
@@ -243,6 +264,29 @@ func getComponentNamespace(t *testing.T, client, selector string) string {
return namespace
}

// getComponentPodIPs returns the IP addresses of all pods matching the component selector
func (c *MetricsTestConfig) getComponentPodIPs(t *testing.T) []string {
cmd := exec.Command(c.client, "get", "pods",
"--namespace="+c.componentNamespace,
"--selector="+c.componentSelector,
"--output=jsonpath={.items[*].status.podIP}")
output, err := cmd.CombinedOutput()
require.NoError(t, err, "Error getting pod IPs: %s", string(output))

podIPsStr := string(bytes.TrimSpace(output))
if podIPsStr == "" {
return []string{}
}

// Split space-separated IPs
fields := bytes.Fields([]byte(podIPsStr))
ips := make([]string, len(fields))
for i, field := range fields {
ips[i] = string(field)
}
return ips
}

func stdoutAndCombined(cmd *exec.Cmd) ([]byte, []byte, error) {
var outOnly, outAndErr bytes.Buffer
allWriter := io.MultiWriter(&outOnly, &outAndErr)
58 changes: 54 additions & 4 deletions test/upgrade-e2e/post_upgrade_test.go
@@ -11,6 +11,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
coordinationv1 "k8s.io/api/coordination/v1"
corev1 "k8s.io/api/core/v1"
apimeta "k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -195,13 +196,14 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {

// waitForDeployment checks that the updated deployment with the given app.kubernetes.io/name label
// has reached the desired number of replicas and that the number of pods matches that number
// i.e. no old pods remain. It will return a pointer to the first pod. This is only necessary
// i.e. no old pods remain. It will return a pointer to the leader pod. This is only necessary
// to facilitate the mitigation put in place for https://github.com/operator-framework/operator-controller/issues/1626
func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) *corev1.Pod {
deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector()

t.Log("Checking that the deployment is updated")
t.Log("Checking that the deployment is updated and available")
var desiredNumReplicas int32
var deploymentNamespace string
require.EventuallyWithT(t, func(ct *assert.CollectT) {
var managerDeployments appsv1.DeploymentList
require.NoError(ct, c.List(ctx, &managerDeployments, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
@@ -214,16 +216,64 @@
managerDeployment.Status.AvailableReplicas == *managerDeployment.Spec.Replicas &&
managerDeployment.Status.ReadyReplicas == *managerDeployment.Spec.Replicas,
)

// Check that the deployment has the Available condition set to True
var availableCond *appsv1.DeploymentCondition
for i := range managerDeployment.Status.Conditions {
if managerDeployment.Status.Conditions[i].Type == appsv1.DeploymentAvailable {
availableCond = &managerDeployment.Status.Conditions[i]
break
}
}
require.NotNil(ct, availableCond, "Available condition not found")
require.Equal(ct, corev1.ConditionTrue, availableCond.Status, "Deployment Available condition is not True")

desiredNumReplicas = *managerDeployment.Spec.Replicas
deploymentNamespace = managerDeployment.Namespace
}, time.Minute, time.Second)

var managerPods corev1.PodList
t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas)
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
require.Len(ct, managerPods.Items, 1)
require.Len(ct, managerPods.Items, int(desiredNumReplicas))
}, time.Minute, time.Second)
return &managerPods.Items[0]

// Find the leader pod by checking the lease
t.Log("Finding the leader pod")
Contributor: I wonder if we're carrying too much complexity here. It seems we're returning the pod for two reasons:

1. To at some point check the logs for some specific log lines to do with leader election, e.g.

    t.Log("Waiting for acquired leader election")
    leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute)
    defer leaderCancel()
    leaderSubstrings := []string{"successfully acquired lease"}
    leaderElected, err := watchPodLogsForSubstring(leaderCtx, &managerPod, leaderSubstrings...)
    require.NoError(t, err)
    require.True(t, leaderElected)

    t.Log("Reading logs to make sure that ClusterCatalog was reconciled by catalogdv1")
    logCtx, cancel := context.WithTimeout(ctx, time.Minute)
    defer cancel()
    substrings := []string{
        "reconcile ending",
        fmt.Sprintf(`ClusterCatalog=%q`, testClusterCatalogName),
    }
    found, err := watchPodLogsForSubstring(logCtx, &managerPod, substrings...)
    require.NoError(t, err)
    require.True(t, found)

2. Because there's a possible bug in catalogd where, after a pod restart, the ClusterCatalog still reports that it is serving the catalog. This causes some flakes by making the e2e test progress quickly to the Eventually waiting for installation, which times out because it's also implicitly waiting for the catalog to unpack.

I wonder if we could refactor this helper to wait for the Available condition to be True and .status.replicas == .status.updatedReplicas, or something like that, and drop the return value.

I question whether we need 1 at all, and we could try to tackle 2 in a different way, e.g. by hitting the catalog service endpoint until it doesn't return a 404. wdyt?

Contributor Author: I wonder if changing the test that much is in scope here. This is preserving the test as-is, but making sure the returned pod is correct.

Contributor Author: Re the suggestion to wait for the Available condition and drop the return value: we are already checking updatedReplicas, replicas, availableReplicas and readyReplicas:

    require.True(ct,
        managerDeployment.Status.UpdatedReplicas == *managerDeployment.Spec.Replicas &&
            managerDeployment.Status.Replicas == *managerDeployment.Spec.Replicas &&
            managerDeployment.Status.AvailableReplicas == *managerDeployment.Spec.Replicas &&
            managerDeployment.Status.ReadyReplicas == *managerDeployment.Spec.Replicas,
    )
    desiredNumReplicas = *managerDeployment.Spec.Replicas

Do we need to do more than that? Which Available condition are you referring to? Deployment? ClusterCatalog?

Contributor: On the Deployment.

Contributor (@perdasilva, Dec 11, 2025): Re "I wonder if changing the test that much is in scope here": that's fair. I just don't know that we want to carry so much complexity, and it might be worth questioning the need for the return value and .zip things. I think it's ok to move it over to another PR (or maybe the cucumber tests might already cover this). But I thought I'd call it out.

Contributor Author: The cucumber stuff will likely rewrite some of that, so I'd say defer it to that, but I can add a quick check for the deployment Available status, since we're already checking things on the deployment.

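As an aside, a minimal sketch of the reviewer's alternative for point 2 above: polling the catalogd content endpoint until it stops returning 404. The helper name, base URL, and content path are assumptions for illustration, not part of this PR:

// waitForCatalogServed is a hypothetical helper that polls a catalogd content URL
// until it no longer returns 404, i.e. the catalog is actually being served again
// after a pod restart.
// Assumed extra imports beyond this file's: crypto/tls, net/http.
func waitForCatalogServed(t *testing.T, ctx context.Context, baseURL, catalogName string) {
    // The e2e environment uses a self-signed certificate, so skip TLS verification here.
    httpClient := &http.Client{
        Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}},
    }
    // Assumed catalogd content path; adjust to the actual serving endpoint.
    url := fmt.Sprintf("%s/catalogs/%s/api/v1/all", baseURL, catalogName)
    require.EventuallyWithT(t, func(ct *assert.CollectT) {
        req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
        require.NoError(ct, err)
        resp, err := httpClient.Do(req)
        require.NoError(ct, err)
        defer resp.Body.Close()
        require.NotEqual(ct, http.StatusNotFound, resp.StatusCode, "catalog not served yet")
    }, time.Minute, time.Second)
}
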
// Map component labels to their leader election lease names
leaseNames := map[string]string{
"catalogd": "catalogd-operator-lock",
"operator-controller": "9c4404e7.operatorframework.io",
}

leaseName, ok := leaseNames[controlPlaneLabel]
if !ok {
t.Fatalf("Unknown control plane component: %s", controlPlaneLabel)
}

var leaderPod *corev1.Pod
require.EventuallyWithT(t, func(ct *assert.CollectT) {
var lease coordinationv1.Lease
require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: leaseName, Namespace: deploymentNamespace}, &lease))
require.NotNil(ct, lease.Spec.HolderIdentity)

leaderIdentity := *lease.Spec.HolderIdentity
// The lease holder identity format is: <pod-name>_<leader-election-id-suffix>
// Extract just the pod name by splitting on '_'
podName := strings.Split(leaderIdentity, "_")[0]

// Find the pod with matching name
for i := range managerPods.Items {
if managerPods.Items[i].Name == podName {
leaderPod = &managerPods.Items[i]
break
}
}
require.NotNil(ct, leaderPod, "leader pod not found with identity: %s (pod name: %s)", leaderIdentity, podName)
}, time.Minute, time.Second)

return leaderPod
}

func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) {