From b1b9dfaa94dc4bc88c80bc620f2688cc9ffb8ea1 Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Sun, 9 Mar 2025 20:19:27 +0000 Subject: [PATCH 1/3] POC - Test Gathering Prometheus data --- .github/workflows/e2e.yaml | 5 + .../core/clustercatalog_controller.go | 1 + .../core/pull_secret_controller.go | 1 + .../controllers/clustercatalog_controller.go | 1 + .../clusterextension_controller.go | 1 + .../controllers/pull_secret_controller.go | 1 + test/e2e/cluster_extension_install_test.go | 31 +++++ test/e2e/metrics_test.go | 109 ++++++++++++++++-- 8 files changed, 141 insertions(+), 9 deletions(-) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index cca4559cc..2694980d0 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -35,6 +35,11 @@ jobs: - name: Run e2e tests run: ARTIFACT_PATH=/tmp/artifacts make test-e2e + - uses: actions/upload-artifact@v4 + with: + name: upgrade-e2e-artifacts + path: test/e2e/results/** + - uses: actions/upload-artifact@v4 if: failure() with: diff --git a/internal/catalogd/controllers/core/clustercatalog_controller.go b/internal/catalogd/controllers/core/clustercatalog_controller.go index be9d816fd..d0597d3ee 100644 --- a/internal/catalogd/controllers/core/clustercatalog_controller.go +++ b/internal/catalogd/controllers/core/clustercatalog_controller.go @@ -152,6 +152,7 @@ func (r *ClusterCatalogReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&ocv1.ClusterCatalog{}). + Named("catalogd-clustercatalog-controller"). Complete(r) } diff --git a/internal/catalogd/controllers/core/pull_secret_controller.go b/internal/catalogd/controllers/core/pull_secret_controller.go index 0255309ca..810581047 100644 --- a/internal/catalogd/controllers/core/pull_secret_controller.go +++ b/internal/catalogd/controllers/core/pull_secret_controller.go @@ -64,6 +64,7 @@ func (r *PullSecretReconciler) Reconcile(ctx context.Context, req ctrl.Request) func (r *PullSecretReconciler) SetupWithManager(mgr ctrl.Manager) error { _, err := ctrl.NewControllerManagedBy(mgr). For(&corev1.Secret{}). + Named("catalogd-pull-secret-controller"). WithEventFilter(newSecretPredicate(r.SecretKey)). Build(r) diff --git a/internal/operator-controller/controllers/clustercatalog_controller.go b/internal/operator-controller/controllers/clustercatalog_controller.go index c7e7edb03..17a6ccae2 100644 --- a/internal/operator-controller/controllers/clustercatalog_controller.go +++ b/internal/operator-controller/controllers/clustercatalog_controller.go @@ -93,6 +93,7 @@ func (r *ClusterCatalogReconciler) Reconcile(ctx context.Context, req ctrl.Reque // SetupWithManager sets up the controller with the Manager. func (r *ClusterCatalogReconciler) SetupWithManager(mgr ctrl.Manager) error { _, err := ctrl.NewControllerManagedBy(mgr). + Named("con-oper-clustercatalog-controller"). For(&ocv1.ClusterCatalog{}). Build(r) diff --git a/internal/operator-controller/controllers/clusterextension_controller.go b/internal/operator-controller/controllers/clusterextension_controller.go index d914b831b..d19f55639 100644 --- a/internal/operator-controller/controllers/clusterextension_controller.go +++ b/internal/operator-controller/controllers/clusterextension_controller.go @@ -407,6 +407,7 @@ func SetDeprecationStatus(ext *ocv1.ClusterExtension, bundleName string, depreca func (r *ClusterExtensionReconciler) SetupWithManager(mgr ctrl.Manager) error { controller, err := ctrl.NewControllerManagedBy(mgr). For(&ocv1.ClusterExtension{}). + Named("con-oper-cluster-extension-controller"). Watches(&ocv1.ClusterCatalog{}, crhandler.EnqueueRequestsFromMapFunc(clusterExtensionRequestsForCatalog(mgr.GetClient(), mgr.GetLogger())), builder.WithPredicates(predicate.Funcs{ diff --git a/internal/operator-controller/controllers/pull_secret_controller.go b/internal/operator-controller/controllers/pull_secret_controller.go index 6db1ae564..8ad7a50b6 100644 --- a/internal/operator-controller/controllers/pull_secret_controller.go +++ b/internal/operator-controller/controllers/pull_secret_controller.go @@ -63,6 +63,7 @@ func (r *PullSecretReconciler) Reconcile(ctx context.Context, req ctrl.Request) // SetupWithManager sets up the controller with the Manager. func (r *PullSecretReconciler) SetupWithManager(mgr ctrl.Manager) error { _, err := ctrl.NewControllerManagedBy(mgr). + Named("con-oper-pull-secret-controller"). For(&corev1.Secret{}). WithEventFilter(newSecretPredicate(r.SecretKey)). Build(r) diff --git a/test/e2e/cluster_extension_install_test.go b/test/e2e/cluster_extension_install_test.go index 7c57a078c..f5332de81 100644 --- a/test/e2e/cluster_extension_install_test.go +++ b/test/e2e/cluster_extension_install_test.go @@ -377,6 +377,10 @@ func TestClusterExtensionInstallRegistry(t *testing.T) { assert.NotEmpty(ct, clusterExtension.Status.Install.Bundle) } }, pollDuration, pollInterval) + + // For this case we cannot get the metrics + //FetchCatalogdMetricsExportedEndpoint(t) + //FetchOperatorControllerMetricsExportedEndpoint(t) }) } } @@ -455,6 +459,9 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`, assert.NotEmpty(ct, clusterExtension.Status.Install.Bundle) } }, pollDuration, pollInterval) + + FetchCatalogdMetricsExportedEndpoint(t) + FetchOperatorControllerMetricsExportedEndpoint(t) } func TestClusterExtensionInstallRegistryMultipleBundles(t *testing.T) { @@ -505,6 +512,9 @@ func TestClusterExtensionInstallRegistryMultipleBundles(t *testing.T) { assert.Contains(ct, cond.Message, "in multiple catalogs with the same priority [extra-test-catalog test-catalog]") } }, pollDuration, pollInterval) + + FetchCatalogdMetricsExportedEndpoint(t) + FetchOperatorControllerMetricsExportedEndpoint(t) } func TestClusterExtensionBlockInstallNonSuccessorVersion(t *testing.T) { @@ -568,6 +578,9 @@ func TestClusterExtensionBlockInstallNonSuccessorVersion(t *testing.T) { assert.Equal(ct, "error upgrading from currently installed version \"1.0.0\": no bundles found for package \"test\" matching version \"1.2.0\"", cond.Message) } }, pollDuration, pollInterval) + + FetchCatalogdMetricsExportedEndpoint(t) + FetchOperatorControllerMetricsExportedEndpoint(t) } func TestClusterExtensionForceInstallNonSuccessorVersion(t *testing.T) { @@ -618,6 +631,9 @@ func TestClusterExtensionForceInstallNonSuccessorVersion(t *testing.T) { assert.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) } }, pollDuration, pollInterval) + + FetchCatalogdMetricsExportedEndpoint(t) + FetchOperatorControllerMetricsExportedEndpoint(t) } func TestClusterExtensionInstallSuccessorVersion(t *testing.T) { @@ -666,6 +682,9 @@ func TestClusterExtensionInstallSuccessorVersion(t *testing.T) { assert.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) } }, pollDuration, pollInterval) + + FetchCatalogdMetricsExportedEndpoint(t) + FetchOperatorControllerMetricsExportedEndpoint(t) } func TestClusterExtensionInstallReResolvesWhenCatalogIsPatched(t *testing.T) { @@ -733,6 +752,9 @@ func TestClusterExtensionInstallReResolvesWhenCatalogIsPatched(t *testing.T) { assert.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) } }, pollDuration, pollInterval) + + FetchCatalogdMetricsExportedEndpoint(t) + FetchOperatorControllerMetricsExportedEndpoint(t) } func TestClusterExtensionInstallReResolvesWhenNewCatalog(t *testing.T) { @@ -814,6 +836,9 @@ func TestClusterExtensionInstallReResolvesWhenNewCatalog(t *testing.T) { assert.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) } }, pollDuration, pollInterval) + + FetchCatalogdMetricsExportedEndpoint(t) + FetchOperatorControllerMetricsExportedEndpoint(t) } func TestClusterExtensionInstallReResolvesWhenManagedContentChanged(t *testing.T) { @@ -866,6 +891,9 @@ func TestClusterExtensionInstallReResolvesWhenManagedContentChanged(t *testing.T require.EventuallyWithT(t, func(ct *assert.CollectT) { assert.NoError(ct, globalClient.Get(context.Background(), types.NamespacedName{Name: testConfigMap.Name, Namespace: testConfigMap.Namespace}, testConfigMap)) }, pollDuration, pollInterval) + + FetchCatalogdMetricsExportedEndpoint(t) + FetchOperatorControllerMetricsExportedEndpoint(t) } func TestClusterExtensionRecoversFromInitialInstallFailedWhenFailureFixed(t *testing.T) { @@ -959,4 +987,7 @@ func TestClusterExtensionRecoversFromInitialInstallFailedWhenFailureFixed(t *tes assert.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) } }, pollDuration, pollInterval) + + FetchCatalogdMetricsExportedEndpoint(t) + FetchOperatorControllerMetricsExportedEndpoint(t) } diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 3d15035b8..7f9948264 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -17,6 +17,8 @@ import ( "bytes" "context" "errors" + "fmt" + "os" "strings" "testing" "time" @@ -36,8 +38,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/config" ) -// TestOperatorControllerMetricsExportedEndpoint verifies that the metrics endpoint for the operator controller -func TestOperatorControllerMetricsExportedEndpoint(t *testing.T) { +// FetchOperatorControllerMetricsExportedEndpoint verifies that the metrics endpoint for the operator controller +func FetchOperatorControllerMetricsExportedEndpoint(t *testing.T) { kubeClient, restConfig := findK8sClient(t) mtc := NewMetricsTestConfig( t, @@ -49,13 +51,14 @@ func TestOperatorControllerMetricsExportedEndpoint(t *testing.T) { "operator-controller-controller-manager", "oper-curl-metrics", "https://operator-controller-service.NAMESPACE.svc.cluster.local:8443/metrics", + "oper-con", ) mtc.run() } -// TestCatalogdMetricsExportedEndpoint verifies that the metrics endpoint for catalogd -func TestCatalogdMetricsExportedEndpoint(t *testing.T) { +// FetchCatalogdMetricsExportedEndpoint verifies that the metrics endpoint for catalogd +func FetchCatalogdMetricsExportedEndpoint(t *testing.T) { kubeClient, restConfig := findK8sClient(t) mtc := NewMetricsTestConfig( t, @@ -67,11 +70,68 @@ func TestCatalogdMetricsExportedEndpoint(t *testing.T) { "catalogd-controller-manager", "catalogd-curl-metrics", "https://catalogd-service.NAMESPACE.svc.cluster.local:7443/metrics", + "catalogd", ) mtc.run() } +// fetchMetrics retrieves Prometheus metrics from the endpoint +func (c *MetricsTestConfig) fetchMetrics(ctx context.Context, token string) string { + c.t.Log("Fetching Prometheus metrics after test execution") + + // Command to execute inside the pod + cmd := []string{ + "curl", "-s", "-k", + "-H", "Authorization: Bearer " + token, + c.metricsURL, + } + + // Execute command in pod + req := c.kubeClient.CoreV1().RESTClient(). + Post(). + Resource("pods"). + Namespace(c.namespace). + Name(c.curlPodName). + SubResource("exec"). + VersionedParams(&corev1.PodExecOptions{ + Container: "curl", + Command: cmd, + Stdin: false, + Stdout: true, + Stderr: true, + TTY: false, + }, scheme.ParameterCodec) + + executor, err := remotecommand.NewSPDYExecutor(c.restConfig, "POST", req.URL()) + require.NoError(c.t, err, "Error creating SPDY executor") + + var stdout, stderr bytes.Buffer + streamOpts := remotecommand.StreamOptions{ + Stdout: &stdout, + Stderr: &stderr, + } + + err = executor.StreamWithContext(ctx, streamOpts) + require.NoError(c.t, err, "Error streaming exec request: %v", stderr.String()) + + return stdout.String() +} + +// saveMetricsToFile writes the fetched metrics to a text file +func (c *MetricsTestConfig) saveMetricsToFile(metrics string) { + dir := "results" + if err := os.MkdirAll(dir, os.ModePerm); err != nil { + c.t.Fatalf("Failed to create directory %s: %v", dir, err) + } + + filePath := fmt.Sprintf("%s/metrics_%s_%s.txt", dir, c.name, c.t.Name()) + err := os.WriteFile(filePath, []byte(metrics), 0644) + require.NoError(c.t, err, "Failed to save metrics to file") + + c.t.Logf("Metrics saved to: %s", filePath) +} + func findK8sClient(t *testing.T) (kubernetes.Interface, *rest.Config) { cfg, err := config.GetConfig() require.NoError(t, err, "Failed to get Kubernetes config") @@ -94,6 +154,7 @@ type MetricsTestConfig struct { serviceAccount string curlPodName string metricsURL string + name string } // NewMetricsTestConfig initializes a new MetricsTestConfig. @@ -107,6 +168,7 @@ func NewMetricsTestConfig( serviceAccount string, curlPodName string, metricsURL string, + name string, ) *MetricsTestConfig { // Discover which namespace the relevant Pod is running in namespace := getComponentNamespace(t, kubeClient, selector) @@ -124,24 +186,41 @@ func NewMetricsTestConfig( serviceAccount: serviceAccount, curlPodName: curlPodName, metricsURL: metricsURL, + name: name, } } // run executes the entire test flow func (c *MetricsTestConfig) run() { ctx := context.Background() - defer c.cleanup(ctx) + // To speed up + // defer c.cleanup(ctx) c.createMetricsClusterRoleBinding(ctx) token := c.getServiceAccountToken(ctx) c.createCurlMetricsPod(ctx) c.waitForPodReady(ctx) // Exec `curl` in the Pod to validate the metrics c.validateMetricsEndpoint(ctx, token) + + // Fetch and save Prometheus metrics after test execution + metrics := c.fetchMetrics(ctx, token) + c.saveMetricsToFile(metrics) } // createMetricsClusterRoleBinding to bind the cluster role so metrics are accessible func (c *MetricsTestConfig) createMetricsClusterRoleBinding(ctx context.Context) { - c.t.Logf("Creating ClusterRoleBinding %q in namespace %q", c.clusterBinding, c.namespace) + c.t.Logf("Ensuring ClusterRoleBinding %q exists in namespace %q", c.clusterBinding, c.namespace) + + _, err := c.kubeClient.RbacV1().ClusterRoleBindings().Get(ctx, c.clusterBinding, metav1.GetOptions{}) + if err == nil { + c.t.Logf("ClusterRoleBinding %q already exists, skipping creation", c.clusterBinding) + return + } + + if !apierrors.IsNotFound(err) { + require.NoError(c.t, err, "Error checking for existing ClusterRoleBinding") + return + } crb := &rbacv1.ClusterRoleBinding{ ObjectMeta: metav1.ObjectMeta{ @@ -161,8 +240,9 @@ func (c *MetricsTestConfig) createMetricsClusterRoleBinding(ctx context.Context) }, } - _, err := c.kubeClient.RbacV1().ClusterRoleBindings().Create(ctx, crb, metav1.CreateOptions{}) + _, err = c.kubeClient.RbacV1().ClusterRoleBindings().Create(ctx, crb, metav1.CreateOptions{}) require.NoError(c.t, err, "Error creating ClusterRoleBinding") + c.t.Logf("Successfully created ClusterRoleBinding %q", c.clusterBinding) } // getServiceAccountToken creates a TokenRequest for the service account @@ -188,7 +268,18 @@ func (c *MetricsTestConfig) getServiceAccountToken(ctx context.Context) string { // createCurlMetricsPod spawns a pod running `curlimages/curl` to check metrics func (c *MetricsTestConfig) createCurlMetricsPod(ctx context.Context) { - c.t.Logf("Creating curl pod (%s/%s) to validate the metrics endpoint", c.namespace, c.curlPodName) + c.t.Logf("Ensuring curl pod (%s/%s) exists to validate the metrics endpoint", c.namespace, c.curlPodName) + + _, err := c.kubeClient.CoreV1().Pods(c.namespace).Get(ctx, c.curlPodName, metav1.GetOptions{}) + if err == nil { + c.t.Logf("Curl pod %q already exists, skipping creation", c.curlPodName) + return + } + + if !apierrors.IsNotFound(err) { + require.NoError(c.t, err, "Error checking for existing curl pod") + return + } pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ @@ -220,7 +311,7 @@ func (c *MetricsTestConfig) createCurlMetricsPod(ctx context.Context) { }, } - _, err := c.kubeClient.CoreV1().Pods(c.namespace).Create(ctx, pod, metav1.CreateOptions{}) + _, err = c.kubeClient.CoreV1().Pods(c.namespace).Create(ctx, pod, metav1.CreateOptions{}) require.NoError(c.t, err, "Error creating curl pod") } From ff39cf253386fefd59519ef4f2e5b11a0fbb44f0 Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Mon, 10 Mar 2025 10:04:57 +0000 Subject: [PATCH 2/3] filter data --- test/e2e/metrics_test.go | 63 ++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 7f9948264..9629735e0 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -16,9 +16,12 @@ package e2e import ( "bytes" "context" + "encoding/json" "errors" "fmt" "os" + "regexp" + "strconv" "strings" "testing" "time" @@ -77,17 +80,15 @@ func FetchCatalogdMetricsExportedEndpoint(t *testing.T) { } // fetchMetrics retrieves Prometheus metrics from the endpoint -func (c *MetricsTestConfig) fetchMetrics(ctx context.Context, token string) string { +func (c *MetricsTestConfig) fetchMetrics(ctx context.Context, token string) map[string]float64 { c.t.Log("Fetching Prometheus metrics after test execution") - // Command to execute inside the pod cmd := []string{ "curl", "-s", "-k", "-H", "Authorization: Bearer " + token, c.metricsURL, } - // Execute command in pod req := c.kubeClient.CoreV1().RESTClient(). Post(). Resource("pods"). @@ -115,21 +116,60 @@ func (c *MetricsTestConfig) fetchMetrics(ctx context.Context, token string) stri err = executor.StreamWithContext(ctx, streamOpts) require.NoError(c.t, err, "Error streaming exec request: %v", stderr.String()) - return stdout.String() + return parseMetrics(stdout.String()) } -// saveMetricsToFile writes the fetched metrics to a text file -func (c *MetricsTestConfig) saveMetricsToFile(metrics string) { +// parseMetrics extracts only the required metrics from Prometheus response +func parseMetrics(metricsText string) map[string]float64 { + relevantMetrics := []string{ + "controller_runtime_reconcile_time_seconds_sum", + "http_request_duration_seconds_sum", + "controller_runtime_reconcile_total", + "go_cpu_classes_gc_total_cpu_seconds_total", + "go_cpu_classes_idle_cpu_seconds_total", + "controller_runtime_reconcile_errors_total", + "controller_runtime_webhook_latency_seconds", + } + + mapMetrics := make(map[string]float64) + lines := strings.Split(metricsText, "\n") + for _, line := range lines { + // Check for partial matches + for _, partial := range relevantMetrics { + if strings.Contains(line, partial) { + re := regexp.MustCompile(`(\S+)\s+([0-9.]+)`) + match := re.FindStringSubmatch(line) + if len(match) == 3 { + metricName := match[1] + value, err := strconv.ParseFloat(match[2], 64) + if err == nil { + mapMetrics[metricName] = value + } + } + } + } + } + + return mapMetrics +} + +// storeMetricsResult writes only relevant metrics to a JSON file +func (c *MetricsTestConfig) storeMetricsResult(metrics map[string]float64) { dir := "results" if err := os.MkdirAll(dir, os.ModePerm); err != nil { c.t.Fatalf("Failed to create directory %s: %v", dir, err) } + + filePath := fmt.Sprintf("%s/metrics_%s_%s_%s.txt", dir, c.name, c.t.Name(), time.Now().Format("20060102_150405")) + jsonData, err := json.MarshalIndent(metrics, "", " ") + if err != nil { + c.t.Fatalf("Failed to encode metrics as JSON: %v", err) + } - filePath := fmt.Sprintf("%s/metrics_%s_%s.txt", dir, c.name, c.t.Name()) - err := os.WriteFile(filePath, []byte(metrics), 0644) + err = os.WriteFile(filePath, jsonData, 0644) require.NoError(c.t, err, "Failed to save metrics to file") - c.t.Logf("Metrics saved to: %s", filePath) + c.t.Logf("Filtered metrics saved to: %s", filePath) } func findK8sClient(t *testing.T) (kubernetes.Interface, *rest.Config) { @@ -193,8 +233,7 @@ func NewMetricsTestConfig( // run executes the entire test flow func (c *MetricsTestConfig) run() { ctx := context.Background() - // To speed up - // defer c.cleanup(ctx) + defer c.cleanup(ctx) c.createMetricsClusterRoleBinding(ctx) token := c.getServiceAccountToken(ctx) c.createCurlMetricsPod(ctx) @@ -204,7 +243,7 @@ func (c *MetricsTestConfig) run() { // Fetch and save Prometheus metrics after test execution metrics := c.fetchMetrics(ctx, token) - c.saveMetricsToFile(metrics) + c.storeMetricsResult(metrics) } // createMetricsClusterRoleBinding to bind the cluster role so metrics are accessible From 7f5cdc0f9cdf4c44d19bd3c39983171e2831ad94 Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Tue, 11 Mar 2025 00:41:04 +0000 Subject: [PATCH 3/3] add pprof test but remove upload artefact --- .github/workflows/e2e.yaml | 11 +-- cmd/operator-controller/main.go | 3 + .../catalogd/manager/catalogd_service.yaml | 3 + config/base/catalogd/manager/manager.yaml | 1 + .../operator-controller/manager/manager.yaml | 1 + test/e2e/metrics_test.go | 80 +++++++++++++++++-- 6 files changed, 89 insertions(+), 10 deletions(-) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 2694980d0..797fbb1cf 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -34,11 +34,12 @@ jobs: - name: Run e2e tests run: ARTIFACT_PATH=/tmp/artifacts make test-e2e - - - uses: actions/upload-artifact@v4 - with: - name: upgrade-e2e-artifacts - path: test/e2e/results/** + +# To not store pprof +# - uses: actions/upload-artifact@v4 +# with: +# name: upgrade-e2e-artifacts +# path: test/e2e/results/** - uses: actions/upload-artifact@v4 if: failure() diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go index 56949ffd7..80ab0e4e6 100644 --- a/cmd/operator-controller/main.go +++ b/cmd/operator-controller/main.go @@ -83,6 +83,7 @@ var ( type config struct { metricsAddr string + pprofAddr string certFile string keyFile string enableLeaderElection bool @@ -131,6 +132,7 @@ func init() { //create flagset, the collection of flags for this command flags := operatorControllerCmd.Flags() flags.StringVar(&cfg.metricsAddr, "metrics-bind-address", "", "The address for the metrics endpoint. Requires tls-cert and tls-key. (Default: ':8443')") + flags.StringVar(&cfg.pprofAddr, "pprof-bind-address", "0", "The address the pprof endpoint binds to. an empty string or 0 disables pprof") flags.StringVar(&cfg.probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flags.StringVar(&cfg.catalogdCasDir, "catalogd-cas-dir", "", "The directory of TLS certificate authorities to use for verifying HTTPS connections to the Catalogd web service.") flags.StringVar(&cfg.pullCasDir, "pull-cas-dir", "", "The directory of TLS certificate authorities to use for verifying HTTPS connections to image registries.") @@ -265,6 +267,7 @@ func run() error { mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme.Scheme, Metrics: metricsServerOptions, + PprofBindAddress: cfg.pprofAddr, HealthProbeBindAddress: cfg.probeAddr, LeaderElection: cfg.enableLeaderElection, LeaderElectionID: "9c4404e7.operatorframework.io", diff --git a/config/base/catalogd/manager/catalogd_service.yaml b/config/base/catalogd/manager/catalogd_service.yaml index 693b687f3..6ca796bbf 100644 --- a/config/base/catalogd/manager/catalogd_service.yaml +++ b/config/base/catalogd/manager/catalogd_service.yaml @@ -14,6 +14,9 @@ spec: protocol: TCP port: 80 targetPort: 8443 + - name: pprof + port: 8083 + targetPort: 8083 - name: webhook protocol: TCP port: 9443 diff --git a/config/base/catalogd/manager/manager.yaml b/config/base/catalogd/manager/manager.yaml index 5c52165ec..12e04544d 100644 --- a/config/base/catalogd/manager/manager.yaml +++ b/config/base/catalogd/manager/manager.yaml @@ -45,6 +45,7 @@ spec: - ./catalogd args: - --leader-elect + - --pprof-bind-address=:8083 - --metrics-bind-address=:7443 - --external-address=catalogd-service.olmv1-system.svc image: controller:latest diff --git a/config/base/operator-controller/manager/manager.yaml b/config/base/operator-controller/manager/manager.yaml index db34940c3..979d52b32 100644 --- a/config/base/operator-controller/manager/manager.yaml +++ b/config/base/operator-controller/manager/manager.yaml @@ -44,6 +44,7 @@ spec: - /operator-controller args: - "--health-probe-bind-address=:8081" + - "--pprof-bind-address=:8082" - "--metrics-bind-address=:8443" - "--leader-elect" image: controller:latest diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 9629735e0..a39989214 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -20,6 +20,7 @@ import ( "errors" "fmt" "os" + "os/exec" "regexp" "strconv" "strings" @@ -153,13 +154,81 @@ func parseMetrics(metricsText string) map[string]float64 { return mapMetrics } +func (c *MetricsTestConfig) fetchPprofAndStore(ctx context.Context, token string, profileType string) { + c.t.Logf("fetching pprof profiling data for profile: %s", profileType) + + var url string + if strings.Contains(c.metricsURL, "catalogd-service") { + url = fmt.Sprintf("http://catalogd-service.olmv1-system.svc.cluster.local:8083/debug/pprof/%s", profileType) + } else if strings.Contains(c.metricsURL, "operator-controller-service") { + url = fmt.Sprintf("http://operator-controller-service.olmv1-system.svc.cluster.local:8082/debug/pprof/%s", profileType) + } else { + c.t.Fatalf("unknown service in metricsURL: %s", c.metricsURL) + } + + savePath := fmt.Sprintf("/tmp/%s.pprof", profileType) + cmd := []string{ + "sh", "-c", + fmt.Sprintf(`curl -s -k -H "Authorization: Bearer %s" %s > %s`, token, url, savePath), + } + + req := c.kubeClient.CoreV1().RESTClient(). + Post(). + Resource("pods"). + Namespace(c.namespace). + Name(c.curlPodName). + SubResource("exec"). + VersionedParams(&corev1.PodExecOptions{ + Container: "curl", + Command: cmd, + Stdin: false, + Stdout: true, + Stderr: true, + TTY: false, + }, scheme.ParameterCodec) + + executor, err := remotecommand.NewSPDYExecutor(c.restConfig, "POST", req.URL()) + require.NoError(c.t, err, "rrror creating SPDY executor for pod exec") + + var stdout, stderr bytes.Buffer + streamOpts := remotecommand.StreamOptions{ + Stdout: &stdout, + Stderr: &stderr, + } + + err = executor.StreamWithContext(ctx, streamOpts) + require.NoError(c.t, err, "rrror executing curl in pod: %v", stderr.String()) + + tt := strings.ReplaceAll(c.t.Name(), "/", "_") + localP := fmt.Sprintf("results/%s_%s_%s.pprof", tt, profileType, time.Now().Format("20060102_150405")) + copyCmd := fmt.Sprintf("kubectl cp %s/%s:%s %s", c.namespace, c.curlPodName, savePath, localP) + + output, copyErr := exec.Command("sh", "-c", copyCmd).CombinedOutput() + require.NoError(c.t, copyErr, "failed copy pprof: %s", string(output)) + c.t.Logf("Pprof data successfully saved to: %s", localP) + + textP := strings.Replace(localP, ".pprof", ".txt", 1) + textCmd := fmt.Sprintf("go tool pprof -text %s > %s", localP, textP) + textOutput, textErr := exec.Command("sh", "-c", textCmd).CombinedOutput() + require.NoError(c.t, textErr, "failed to convert pprof: %s", string(textOutput)) + + c.t.Logf("Pprof text saved to: %s", textP) +} + +func (c *MetricsTestConfig) fetchAndStorePprof(ctx context.Context, token string) { + profiles := []string{"profile", "heap", "goroutine"} + for _, profileType := range profiles { + c.fetchPprofAndStore(ctx, token, profileType) + } +} + // storeMetricsResult writes only relevant metrics to a JSON file func (c *MetricsTestConfig) storeMetricsResult(metrics map[string]float64) { dir := "results" if err := os.MkdirAll(dir, os.ModePerm); err != nil { c.t.Fatalf("Failed to create directory %s: %v", dir, err) } - + filePath := fmt.Sprintf("%s/metrics_%s_%s_%s.txt", dir, c.name, c.t.Name(), time.Now().Format("20060102_150405")) jsonData, err := json.MarshalIndent(metrics, "", " ") if err != nil { @@ -167,9 +236,9 @@ func (c *MetricsTestConfig) storeMetricsResult(metrics map[string]float64) { } err = os.WriteFile(filePath, jsonData, 0644) - require.NoError(c.t, err, "Failed to save metrics to file") + require.NoError(c.t, err, "failed to save metrics") - c.t.Logf("Filtered metrics saved to: %s", filePath) + c.t.Logf("filtered metrics saved to: %s", filePath) } func findK8sClient(t *testing.T) (kubernetes.Interface, *rest.Config) { @@ -233,7 +302,7 @@ func NewMetricsTestConfig( // run executes the entire test flow func (c *MetricsTestConfig) run() { ctx := context.Background() - defer c.cleanup(ctx) + //defer c.cleanup(ctx) c.createMetricsClusterRoleBinding(ctx) token := c.getServiceAccountToken(ctx) c.createCurlMetricsPod(ctx) @@ -241,9 +310,10 @@ func (c *MetricsTestConfig) run() { // Exec `curl` in the Pod to validate the metrics c.validateMetricsEndpoint(ctx, token) - // Fetch and save Prometheus metrics after test execution metrics := c.fetchMetrics(ctx, token) c.storeMetricsResult(metrics) + + c.fetchAndStorePprof(ctx, token) } // createMetricsClusterRoleBinding to bind the cluster role so metrics are accessible