From 572ac8d458917bc02162fb27981c2f813ca0232b Mon Sep 17 00:00:00 2001 From: Olof Mattsson Date: Thu, 28 May 2026 13:14:18 +0200 Subject: [PATCH 1/7] ci(live): run cli/test/live/ on every PR against a freshly-booted backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #96. Boots k8s-stack-manager (origin/main, api-only docker-compose profile) in the same job, applies a SQL fixture so require* helpers don't skip, mints a 1-day API key via the public auth endpoints, then runs the live suite. PR-time signal that the wire contracts still match — the class of bug stub-based unit tests can't see (recently surfaced in PR #98). What ships: .github/workflows/live-tests.yml - Triggers on push to main + every PR targeting main. - Clones k8s-sm main, `docker compose up backend` (no --profile → api-only, since frontend is gated to "full"). - Polls /health/live for up to 60s. - Pipes cli/test/live/testdata/ci-seed.sql into the mysql container. - Logs in as the env-seeded admin, mints a key via POST /api/v1/users/<id>/api-keys with expires_in_days=1, masks it in logs, exports as STACKCTL_LIVE_API_KEY. - `go test -tags live -timeout 10m ./test/live/...` - Dumps backend logs on failure. cli/test/live/testdata/ci-seed.sql - 1 cluster (no kubeconfig — test-connection will fail, tests expect that), 1 stack_definition + chart_config, 1 published stack_template + template_chart_config. - owner_id is looked up via SELECT … WHERE username='admin' so we don't need to inject the non-deterministic admin UUID. cli/test/live/doc.go - Documents the env vars + how to reproduce the CI flow locally (clone k8s-sm, compose up, apply seed, mint key, run suite). Design decisions (from the open questions in #96): - Bootstrap: backend SQL seed for fixtures, login-then-mint for the API key. Avoids hardcoding bcrypt/SHA-256 hashes in test data. - Backend pin: track origin/main. 
Surfaces cross-repo contract breaks immediately — that's exactly the signal #96 wants. - Workflow location: separate file. Easy to mark non-required if it ever gets flaky without blocking the rest of CI. Two contract-drift gotchas surfaced while validating against rancher-desktop k8s-sm (the value this workflow will catch on PRs): - POST /api/v1/users/<id>/api-keys requires expires_at OR expires_in_days. Initial draft sent neither and got 400. - The minted-key response field is `raw_key`, not `key`. Both fixed in the workflow + doc.go before pushing. Co-Authored-By: Claude Opus 4.7 --- .github/workflows/live-tests.yml | 122 +++++++++++++++++++++++++++++ cli/test/live/doc.go | 55 +++++++++++-- cli/test/live/testdata/ci-seed.sql | 90 +++++++++++++++++++++ 3 files changed, 262 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/live-tests.yml create mode 100644 cli/test/live/testdata/ci-seed.sql diff --git a/.github/workflows/live-tests.yml b/.github/workflows/live-tests.yml new file mode 100644 index 0000000..db2e1cd --- /dev/null +++ b/.github/workflows/live-tests.yml @@ -0,0 +1,122 @@ +name: Live tests + +# Boots k8s-stack-manager (origin/main, api-only profile) in the same job, +# applies a SQL fixture, mints an API key, then runs cli/test/live against +# the booted backend. Catches wire-contract drift between the two repos at +# PR time — the stub-based unit tests can't see it. +# +# Tracks omattsson/stackctl#96. 
+ +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +concurrency: + group: live-tests-${{ github.ref }} + cancel-in-progress: true + +jobs: + live: + name: Live integration suite + runs-on: ubuntu-latest + timeout-minutes: 20 + env: + ADMIN_USERNAME: admin + ADMIN_PASSWORD: ci-admin-password-do-not-reuse + DB_PASSWORD: rootpassword + DB_NAME: app + MYSQL_CONTAINER: app-mysql-dev + BACKEND_URL: http://localhost:8081 + steps: + - name: Check out stackctl + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version-file: cli/go.mod + cache-dependency-path: cli/go.sum + + - name: Clone k8s-stack-manager (origin/main) + run: | + git clone --depth=1 https://github.com/omattsson/k8s-stack-manager /tmp/k8s-sm + echo "k8s-sm @ $(git -C /tmp/k8s-sm rev-parse --short HEAD)" + + - name: Boot api-only stack (backend + mysql) + working-directory: /tmp/k8s-sm + env: + ADMIN_USERNAME: ${{ env.ADMIN_USERNAME }} + ADMIN_PASSWORD: ${{ env.ADMIN_PASSWORD }} + DB_PASSWORD: ${{ env.DB_PASSWORD }} + run: | + # No --profile flag → only services without profiles run, which is + # exactly the api-only set (backend + mysql). Frontend is gated to + # the "full" profile. + docker compose up -d --wait backend + + - name: Wait for backend /health/live + run: | + for i in $(seq 1 60); do + if curl -fsS "$BACKEND_URL/health/live" >/dev/null 2>&1; then + echo "Backend healthy after ${i}s" + exit 0 + fi + sleep 1 + done + echo "Backend never became healthy" >&2 + docker compose -f /tmp/k8s-sm/docker-compose.yml logs backend + exit 1 + + - name: Apply CI seed fixtures + run: | + docker exec -i "$MYSQL_CONTAINER" \ + mysql -u root -p"$DB_PASSWORD" "$DB_NAME" \ + < cli/test/live/testdata/ci-seed.sql + + - name: Mint API key for live tests + id: apikey + run: | + set -euo pipefail + # Login as the env-seeded admin → JWT + admin user id. 
+ login=$(curl -fsS -X POST "$BACKEND_URL/api/v1/auth/login" \ + -H 'Content-Type: application/json' \ + -d "{\"username\":\"$ADMIN_USERNAME\",\"password\":\"$ADMIN_PASSWORD\"}") + jwt=$(echo "$login" | jq -r '.token // .access_token') + admin_id=$(echo "$login" | jq -r '.user.id') + if [ -z "$jwt" ] || [ "$jwt" = "null" ] || [ -z "$admin_id" ] || [ "$admin_id" = "null" ]; then + echo "Login response missing token/user.id" >&2 + echo "$login" | jq . >&2 + exit 1 + fi + # Mint a key — the raw key is only returned once. + # Backend requires an expiration; 1 day is plenty for a CI run. + # Response field is "raw_key" (prefixed sk_), not "key". + mint=$(curl -fsS -X POST "$BACKEND_URL/api/v1/users/$admin_id/api-keys" \ + -H "Authorization: Bearer $jwt" \ + -H 'Content-Type: application/json' \ + -d '{"name":"ci-live-tests","expires_in_days":1}') + key=$(echo "$mint" | jq -r '.raw_key') + if [ -z "$key" ] || [ "$key" = "null" ]; then + echo "API-key mint response missing .raw_key" >&2 + echo "$mint" | jq . >&2 + exit 1 + fi + # Mask in logs and export for the test step. + echo "::add-mask::$key" + echo "key=$key" >> "$GITHUB_OUTPUT" + + - name: Run live suite + env: + STACKCTL_LIVE_URL: ${{ env.BACKEND_URL }} + STACKCTL_LIVE_API_KEY: ${{ steps.apikey.outputs.key }} + working-directory: cli + run: go test -tags live -count=1 -timeout 10m -v ./test/live/... + + - name: Backend logs on failure + if: failure() + run: docker compose -f /tmp/k8s-sm/docker-compose.yml logs backend diff --git a/cli/test/live/doc.go b/cli/test/live/doc.go index 3334106..ae1b0d1 100644 --- a/cli/test/live/doc.go +++ b/cli/test/live/doc.go @@ -1,7 +1,52 @@ -// Package live contains integration tests that run against a live backend. -// These tests are gated behind the "live" build tag and require environment -// variables STACKCTL_LIVE_USER and STACKCTL_LIVE_PASS. STACKCTL_LIVE_URL is -// optional; if unset, the tests default to http://localhost:8081. 
+// Package live contains integration tests that run against a live +// k8s-stack-manager backend. They are gated behind the "live" build tag +// because they make real HTTP calls and shape-check the response wire +// contract — the only way to catch field-name drift between stackctl and +// the backend that stub-based unit tests can't see. // -// Run with: go test -tags live ./test/live/ -v +// # Environment +// +// - STACKCTL_LIVE_URL — backend base URL (default: http://localhost:8081). +// - One of: +// STACKCTL_LIVE_API_KEY — header-based, no session. +// STACKCTL_LIVE_USER + STACKCTL_LIVE_PASS — login flow. +// - STACKCTL_LIVE_HEAVY=1 — also run the workload tests in live_test.go +// that actually deploy stack instances (~80 GiB of golden-db pull +// each, takes ~10 min). Off by default — the per-endpoint *_live_test.go +// files cover wire-shape contracts without real workloads. +// +// # Running locally +// +// go test -tags live ./test/live/ -v +// +// # Reproducing the CI flow +// +// .github/workflows/live-tests.yml boots k8s-stack-manager (origin/main, +// api-only profile) in the same job and runs this suite. To reproduce +// locally, run the equivalent of those steps against your own checkout: +// +// # 1. Boot the api-only stack (backend + mysql; no frontend). +// cd /path/to/k8s-stack-manager +// ADMIN_USERNAME=admin ADMIN_PASSWORD=ci-admin-password-do-not-reuse \ +// docker compose up -d --wait backend +// +// # 2. Apply the seed fixtures (one cluster, one definition, one +// # published template — so require* helpers don't skip). +// docker exec -i app-mysql-dev mysql -u root -prootpassword app \ +// < /path/to/stackctl/cli/test/live/testdata/ci-seed.sql +// +// # 3. Mint an API key (login → mint), then run the suite. 
+// jwt=$(curl -fsS -X POST http://localhost:8081/api/v1/auth/login \ +// -H 'Content-Type: application/json' \ +// -d '{"username":"admin","password":"ci-admin-password-do-not-reuse"}' \ +// | jq -r '.token // .access_token') +// admin_id=$(curl -fsS http://localhost:8081/api/v1/auth/me \ +// -H "Authorization: Bearer $jwt" | jq -r '.id') +// key=$(curl -fsS -X POST "http://localhost:8081/api/v1/users/$admin_id/api-keys" \ +// -H "Authorization: Bearer $jwt" -H 'Content-Type: application/json' \ +// -d '{"name":"local-live-tests","expires_in_days":1}' | jq -r '.raw_key') +// +// STACKCTL_LIVE_URL=http://localhost:8081 \ +// STACKCTL_LIVE_API_KEY="$key" \ +// go test -tags live -count=1 ./test/live/... package live diff --git a/cli/test/live/testdata/ci-seed.sql b/cli/test/live/testdata/ci-seed.sql new file mode 100644 index 0000000..9404109 --- /dev/null +++ b/cli/test/live/testdata/ci-seed.sql @@ -0,0 +1,90 @@ +-- CI fixture seed for the live integration suite. +-- +-- The live tests gate themselves behind require{Cluster,Definition,Template} +-- helpers that skip when the backend has nothing to look at. A fresh CI +-- backend would skip ~70% of the suite — defeating the point of running it. +-- +-- This script inserts the minimum metadata needed so every test reaches its +-- assertions: +-- - 1 cluster (no real connectivity) +-- - 1 stack_definition + 1 chart_config +-- - 1 stack_template + 1 template_chart_config +-- +-- Owner is the env-seeded admin user (created by the backend's +-- EnsureAdminUser at boot). We look it up at apply time so we don't need to +-- substitute the non-deterministic UUID into the script. + +SET @admin_id = (SELECT id FROM users WHERE username = 'admin' LIMIT 1); + +-- Cluster — health_status stays "unreachable" since api-only mode has no +-- real kubeconfig. test-connection / nodes endpoints will fail; tests +-- handle that as expected. 
+INSERT INTO clusters ( + id, name, description, + api_server_url, kubeconfig_data, kubeconfig_path, + region, health_status, + registry_url, registry_username, registry_password, image_pull_secret_name, + max_namespaces, max_instances_per_user, + is_default, use_in_cluster, + created_at, updated_at +) VALUES ( + '00000000-0000-0000-0000-000000000001', 'ci-stub', 'CI fixture cluster — no real connectivity', + '', '', '', + '', 'unreachable', + '', '', '', '', + 0, 0, + 1, 0, + NOW(), NOW() +); + +-- Stack definition (owner = admin) with a single noop chart. +INSERT INTO stack_definitions ( + id, name, description, owner_id, default_branch, created_at, updated_at +) VALUES ( + '00000000-0000-0000-0000-000000000002', + 'ci-stub-definition', + 'CI fixture definition — wire-shape only, never deployed', + @admin_id, 'master', NOW(), NOW() +); + +INSERT INTO chart_configs ( + id, stack_definition_id, + chart_name, repository_url, source_repo_url, build_pipeline_id, + chart_path, chart_version, default_values, deploy_order, + created_at +) VALUES ( + '00000000-0000-0000-0000-000000000003', + '00000000-0000-0000-0000-000000000002', + 'ci-noop', '', '', '', + '', '0.1.0', '', 0, + NOW() +); + +-- Published stack template so requireTemplate and TestLiveTemplate_ListAndGet +-- find at least one row. Inline chart so the chart-roundtrip tests have +-- something to read. 
+INSERT INTO stack_templates ( + id, name, description, category, version, owner_id, + default_branch, is_published, created_at, updated_at +) VALUES ( + '00000000-0000-0000-0000-000000000004', + 'ci-stub-template', + 'CI fixture template — published so list filter sees it', + '', '1.0.0', @admin_id, + 'master', 1, NOW(), NOW() +); + +INSERT INTO template_chart_configs ( + id, stack_template_id, + chart_name, repository_url, source_repo_url, build_pipeline_id, + chart_path, chart_version, default_values, locked_values, + deploy_order, required, + created_at +) VALUES ( + '00000000-0000-0000-0000-000000000005', + '00000000-0000-0000-0000-000000000004', + 'ci-noop', '', '', '', + '', '0.1.0', '', '', + 0, 0, + NOW() +); From e0fab5ef10fbde841f0a5a71a55fe5b0318d31ab Mon Sep 17 00:00:00 2001 From: Olof Mattsson Date: Thu, 28 May 2026 13:25:55 +0200 Subject: [PATCH 2/7] ci(live): reset upstream dev bind-mount so production image starts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit k8s-stack-manager's docker-compose.yml bind-mounts ./backend:/app for hot-reload during dev. With target: production, that overlays the image's baked-in /app/main binary with the host source tree, which has no compiled binary — container exits immediately with "stat ./main: no such file or directory". Ships testdata/docker-compose.ci.yml as a compose override that resets the volumes list (Compose ≥1.28 !reset semantics), and wires both -f files into every compose call in the workflow. 
Co-Authored-By: Claude Opus 4.7 --- .github/workflows/live-tests.yml | 20 ++++++++++++++++---- cli/test/live/testdata/docker-compose.ci.yml | 11 +++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 cli/test/live/testdata/docker-compose.ci.yml diff --git a/.github/workflows/live-tests.yml b/.github/workflows/live-tests.yml index db2e1cd..303061f 100644 --- a/.github/workflows/live-tests.yml +++ b/.github/workflows/live-tests.yml @@ -48,7 +48,6 @@ jobs: echo "k8s-sm @ $(git -C /tmp/k8s-sm rev-parse --short HEAD)" - name: Boot api-only stack (backend + mysql) - working-directory: /tmp/k8s-sm env: ADMIN_USERNAME: ${{ env.ADMIN_USERNAME }} ADMIN_PASSWORD: ${{ env.ADMIN_PASSWORD }} @@ -57,7 +56,13 @@ jobs: # No --profile flag → only services without profiles run, which is # exactly the api-only set (backend + mysql). Frontend is gated to # the "full" profile. - docker compose up -d --wait backend + # + # The override file resets the dev bind-mount that would otherwise + # hide the production image's baked-in ./main binary. 
+ docker compose \ + -f /tmp/k8s-sm/docker-compose.yml \ + -f "$GITHUB_WORKSPACE/cli/test/live/testdata/docker-compose.ci.yml" \ + up -d --wait backend - name: Wait for backend /health/live run: | @@ -69,7 +74,10 @@ jobs: sleep 1 done echo "Backend never became healthy" >&2 - docker compose -f /tmp/k8s-sm/docker-compose.yml logs backend + docker compose \ + -f /tmp/k8s-sm/docker-compose.yml \ + -f "$GITHUB_WORKSPACE/cli/test/live/testdata/docker-compose.ci.yml" \ + logs backend exit 1 - name: Apply CI seed fixtures @@ -119,4 +127,8 @@ jobs: - name: Backend logs on failure if: failure() - run: docker compose -f /tmp/k8s-sm/docker-compose.yml logs backend + run: | + docker compose \ + -f /tmp/k8s-sm/docker-compose.yml \ + -f "$GITHUB_WORKSPACE/cli/test/live/testdata/docker-compose.ci.yml" \ + logs backend diff --git a/cli/test/live/testdata/docker-compose.ci.yml b/cli/test/live/testdata/docker-compose.ci.yml new file mode 100644 index 0000000..e237809 --- /dev/null +++ b/cli/test/live/testdata/docker-compose.ci.yml @@ -0,0 +1,11 @@ +# CI override for k8s-stack-manager's docker-compose.yml. +# +# The upstream compose bind-mounts ./backend:/app for dev hot-reload, but +# that overlays the production image's baked-in /app/main binary with the +# host source tree (which has no compiled binary), so the container can't +# start. We're not editing source at runtime in CI, so reset the volumes. +# +# `!reset []` replaces the list instead of merging (Compose ≥1.28). +services: + backend: + volumes: !reset [] From a38daf1e1362432b1c812abaf09545a1d56d3705 Mon Sep 17 00:00:00 2001 From: Olof Mattsson Date: Thu, 28 May 2026 13:35:19 +0200 Subject: [PATCH 3/7] ci(live): redact JWT and raw API key from failure-path logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If login or api-key mint shapes drift, the workflow dumps the raw response to stderr for debugging. 
Both responses carry credentials (JWT in login, raw_key in mint) that shouldn't end up in CI logs even on the failure branch — masked outputs still surface the field names and status, which is the actual diagnostic value. Addresses CodeRabbit on PR #99. The sister findings about pinning actions/checkout@v6 and actions/setup-go@v6 to commit SHAs and adding persist-credentials: false are deliberately skipped — none of the four existing workflows in this repo follow that policy, and the right place to harden is a dedicated repo-wide change, not a one-workflow special case. Co-Authored-By: Claude Opus 4.7 --- .github/workflows/live-tests.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/live-tests.yml b/.github/workflows/live-tests.yml index 303061f..d749469 100644 --- a/.github/workflows/live-tests.yml +++ b/.github/workflows/live-tests.yml @@ -98,7 +98,8 @@ jobs: admin_id=$(echo "$login" | jq -r '.user.id') if [ -z "$jwt" ] || [ "$jwt" = "null" ] || [ -z "$admin_id" ] || [ "$admin_id" = "null" ]; then echo "Login response missing token/user.id" >&2 - echo "$login" | jq . >&2 + # Drop the JWT before logging so a failed boot can't leak it. + echo "$login" | jq 'del(.token, .access_token)' >&2 exit 1 fi # Mint a key — the raw key is only returned once. @@ -111,7 +112,9 @@ jobs: key=$(echo "$mint" | jq -r '.raw_key') if [ -z "$key" ] || [ "$key" = "null" ]; then echo "API-key mint response missing .raw_key" >&2 - echo "$mint" | jq . >&2 + # Drop the raw key before logging — it's only valid once but + # still a credential while the job is alive. + echo "$mint" | jq 'del(.raw_key)' >&2 exit 1 fi # Mask in logs and export for the test step. 
From f96fc5f84e1b56d7fadd1efd30ecb7c89a6832be Mon Sep 17 00:00:00 2001 From: Olof Mattsson Date: Thu, 28 May 2026 13:41:51 +0200 Subject: [PATCH 4/7] ci(live): disable container-side healthcheck (alpine image has no curl) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream backend healthcheck runs `curl -f http://localhost:8081/health/live` inside the container, but the production alpine image only ships ./main + helm + ca-certificates — no curl. The healthcheck fails forever and `docker compose up --wait` hits its timeout. We already poll /health/live from the host in the next step, so the in-container check is redundant. Override with `test: ["NONE"]` and keep the host-side gate. Co-Authored-By: Claude Opus 4.7 --- cli/test/live/testdata/docker-compose.ci.yml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/cli/test/live/testdata/docker-compose.ci.yml b/cli/test/live/testdata/docker-compose.ci.yml index e237809..b8b3140 100644 --- a/cli/test/live/testdata/docker-compose.ci.yml +++ b/cli/test/live/testdata/docker-compose.ci.yml @@ -1,11 +1,20 @@ # CI override for k8s-stack-manager's docker-compose.yml. # -# The upstream compose bind-mounts ./backend:/app for dev hot-reload, but -# that overlays the production image's baked-in /app/main binary with the -# host source tree (which has no compiled binary), so the container can't -# start. We're not editing source at runtime in CI, so reset the volumes. +# Two upstream defaults don't fit `target: production` + headless CI: # -# `!reset []` replaces the list instead of merging (Compose ≥1.28). +# 1. `./backend:/app` bind-mount is for dev hot-reload. With the production +# target it overlays the baked-in /app/main binary with the host source +# tree (which has no compiled binary), so the container exits with +# "stat ./main: no such file or directory". `!reset []` replaces the +# list instead of merging (Compose ≥1.28). +# +# 2. 
The healthcheck runs `curl -f http://localhost:8081/health/live` +# inside the container, but the alpine production image doesn't have +# curl installed. The check fails forever and `docker compose up +# --wait` times out. Disable the container-side healthcheck — we run +# our own poll against /health/live from the host in the next step. services: backend: volumes: !reset [] + healthcheck: + test: ["NONE"] From 259341d1c29041fd827d35cc9d8d6389d680a05d Mon Sep 17 00:00:00 2001 From: Olof Mattsson Date: Thu, 28 May 2026 13:46:34 +0200 Subject: [PATCH 5/7] ci(live): use wget for healthcheck (alpine image has no curl) Last attempt set `test: ["NONE"]` to disable the broken curl-based healthcheck, but `docker compose up --wait` errors out with "no healthcheck configured" instead of treating "running" as ready. Alpine's base image ships busybox `wget` (curl is not installed), so override the healthcheck command to use that. Shorter interval + retries chosen so a healthy backend is flagged ready within ~10s rather than upstream's 75s start_period. Co-Authored-By: Claude Opus 4.7 --- cli/test/live/testdata/docker-compose.ci.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cli/test/live/testdata/docker-compose.ci.yml b/cli/test/live/testdata/docker-compose.ci.yml index b8b3140..f9dc287 100644 --- a/cli/test/live/testdata/docker-compose.ci.yml +++ b/cli/test/live/testdata/docker-compose.ci.yml @@ -10,11 +10,16 @@ # # 2. The healthcheck runs `curl -f http://localhost:8081/health/live` # inside the container, but the alpine production image doesn't have -# curl installed. The check fails forever and `docker compose up -# --wait` times out. Disable the container-side healthcheck — we run -# our own poll against /health/live from the host in the next step. +# curl installed. Replace with busybox `wget --spider`, which alpine +# ships in the base image. 
(`test: ["NONE"]` would disable the check +# but then `docker compose up --wait` errors with "no healthcheck +# configured" instead of treating "running" as ready.) services: backend: volumes: !reset [] healthcheck: - test: ["NONE"] + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8081/health/live"] + interval: 5s + timeout: 5s + retries: 12 + start_period: 30s From 9c98f0e2ec7ce837f3dd9bf95209f9b809ed7b7d Mon Sep 17 00:00:00 2001 From: Olof Mattsson Date: Thu, 28 May 2026 14:12:34 +0200 Subject: [PATCH 6/7] test(live): skip cluster health/nodes when backend can't reach cluster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI live-tests workflow runs against a stub cluster (no kubeconfig, no real kube-apiserver). The backend correctly returns 500 with "Failed to connect to cluster" for diagnostic endpoints in that state, so two subtests of TestLiveCluster_HealthAndTest were failing in CI while passing locally against rancher-desktop. Two changes: 1. The `health` subtest now skips on the same "cluster unreachable" conditions the `nodes` subtest already handles (added in #98 via CodeRabbit's tightening). Symmetric with `nodes` so a wire-shape regression on either endpoint still surfaces as a failure. 2. The substring set is extracted into isClusterUnreachable() and widened to include "failed to connect to cluster" — the actual literal in the backend's 500 response when kubeconfig is empty or the cluster is down. Verified against rancher-desktop k8s-sm: all three subtests still pass (real cluster, real kubeconfig). The CI api-only path will now skip the two diagnostic subtests cleanly instead of failing. 
Co-Authored-By: Claude Opus 4.7 --- cli/test/live/cluster_live_test.go | 35 +++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/cli/test/live/cluster_live_test.go b/cli/test/live/cluster_live_test.go index 1002e51..e258b53 100644 --- a/cli/test/live/cluster_live_test.go +++ b/cli/test/live/cluster_live_test.go @@ -69,7 +69,17 @@ func TestLiveCluster_HealthAndTest(t *testing.T) { t.Run("health", func(t *testing.T) { health, err := c.GetClusterHealth(cluster.ID) - require.NoError(t, err, "get cluster health") + if err != nil { + // Same skip rule as the "nodes" subtest below: when the + // backend can't talk to the cluster, the health summary + // 500s. That's not a wire-shape regression, just an + // environment with no kubeconfig (e.g. the CI api-only + // stack against a stub cluster). + if isClusterUnreachable(err) { + t.Skipf("health endpoint unavailable (cluster not reachable): %v", err) + } + require.NoError(t, err, "get cluster health") + } // Health summary should at least populate node count for any // non-empty cluster. Zero would mean the wire decode lost data. assert.GreaterOrEqual(t, health.NodeCount, 0, "node_count must decode (zero is fine for empty clusters)") @@ -91,8 +101,7 @@ func TestLiveCluster_HealthAndTest(t *testing.T) { if err != nil { // Skip only when the backend can't reach the cluster — any // other error (bad wire shape, 5xx) should fail the test. 
- msg := strings.ToLower(err.Error()) - if strings.Contains(msg, "not reachable") || strings.Contains(msg, "unavailable") || strings.Contains(msg, "connection refused") { + if isClusterUnreachable(err) { t.Skipf("nodes endpoint unavailable (cluster not reachable): %v", err) } require.NoError(t, err, "get cluster nodes") @@ -101,3 +110,23 @@ func TestLiveCluster_HealthAndTest(t *testing.T) { assert.NotNil(t, nodes) }) } + +// isClusterUnreachable returns true when the backend's error indicates +// the cluster itself isn't reachable (no kubeconfig, kube-apiserver +// down, DNS failure) rather than a wire-shape or auth issue. Used by +// the diagnostic-endpoint tests to skip cleanly in CI environments +// running against a stub cluster. +func isClusterUnreachable(err error) bool { + msg := strings.ToLower(err.Error()) + for _, needle := range []string{ + "not reachable", + "unavailable", + "connection refused", + "failed to connect to cluster", // backend's literal 500 message + } { + if strings.Contains(msg, needle) { + return true + } + } + return false +} From 8a34986a84da125c537845a1793383278b4c970e Mon Sep 17 00:00:00 2001 From: Olof Mattsson Date: Thu, 28 May 2026 14:20:28 +0200 Subject: [PATCH 7/7] test(live): tighten cluster-unreachable substring set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop "unavailable" — too generic, could mask a legitimate 503 from a wire-shape regression. Add "unreachable" instead, which matches the backend's models.ClusterUnreachable status constant and is the more accurate token for a cluster-down condition. Addresses CodeRabbit on PR #99. 
The companion suggestion to convert TestLiveCluster_HealthAndTest to a table-driven + t.Parallel() form is deliberately skipped: none of the four sibling *_live_test.go files in this package follow that pattern (the convention applies to unit tests; live tests against a shared backend run serially by design), and a mass-refactor of the live suite is out of scope for a workflow PR. Co-Authored-By: Claude Opus 4.7 --- cli/test/live/cluster_live_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cli/test/live/cluster_live_test.go b/cli/test/live/cluster_live_test.go index e258b53..5da3276 100644 --- a/cli/test/live/cluster_live_test.go +++ b/cli/test/live/cluster_live_test.go @@ -119,8 +119,11 @@ func TestLiveCluster_HealthAndTest(t *testing.T) { func isClusterUnreachable(err error) bool { msg := strings.ToLower(err.Error()) for _, needle := range []string{ + // Backend uses ClusterUnreachable = "unreachable" as its + // status constant; keep both spacings since the error string + // can render either way. + "unreachable", "not reachable", - "unavailable", "connection refused", "failed to connect to cluster", // backend's literal 500 message } {