From 572ac8d458917bc02162fb27981c2f813ca0232b Mon Sep 17 00:00:00 2001
From: Olof Mattsson <olof.mattsson@klaravik.se>
Date: Thu, 28 May 2026 13:14:18 +0200
Subject: [PATCH 1/7] ci(live): run cli/test/live/ on every PR against a
 freshly-booted backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #96.

Boots k8s-stack-manager (origin/main, api-only docker-compose profile)
in the same job, applies a SQL fixture so require* helpers don't skip,
mints a 1-day API key via the public auth endpoints, then runs the
live suite. PR-time signal that the wire contracts still match — the
class of bug stub-based unit tests can't see (recently surfaced in
PR #98).

What ships:

  .github/workflows/live-tests.yml
      - Triggers on push to main + every PR targeting main.
      - Clones k8s-sm main, `docker compose up backend` (no --profile
        → api-only, since frontend is gated to "full").
      - Polls /health/live for up to 60s.
      - Pipes cli/test/live/testdata/ci-seed.sql into the mysql
        container.
      - Logs in as the env-seeded admin, mints a key via
        POST /api/v1/users/<id>/api-keys with expires_in_days=1,
        masks it in logs, exports as STACKCTL_LIVE_API_KEY.
      - `go test -tags live -timeout 10m ./test/live/...`
      - Dumps backend logs on failure.

  cli/test/live/testdata/ci-seed.sql
      - 1 cluster (no kubeconfig — test-connection will fail, tests
        expect that), 1 stack_definition + chart_config, 1 published
        stack_template + template_chart_config.
      - owner_id is looked up via SELECT … WHERE username='admin' so
        we don't need to inject the non-deterministic admin UUID.

  cli/test/live/doc.go
      - Documents the env vars + how to reproduce the CI flow locally
        (clone k8s-sm, compose up, apply seed, mint key, run suite).

Design decisions (from the open questions in #96):

  - Bootstrap: backend SQL seed for fixtures, login-then-mint for the
    API key. Avoids hardcoding bcrypt/SHA-256 hashes in test data.
  - Backend pin: track origin/main. Surfaces cross-repo contract
    breaks immediately — that's exactly the signal #96 wants.
  - Workflow location: separate file. Easy to mark non-required if it
    ever gets flaky without blocking the rest of CI.

Two contract-drift gotchas surfaced while validating against
rancher-desktop k8s-sm (the kind of drift this workflow will catch on PRs):

  - POST /api/v1/users/<id>/api-keys requires expires_at OR
    expires_in_days. Initial draft sent neither and got 400.
  - The minted-key response field is `raw_key`, not `key`.

Both fixed in the workflow + doc.go before pushing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/live-tests.yml   | 122 +++++++++++++++++++++++++++++
 cli/test/live/doc.go               |  55 +++++++++++--
 cli/test/live/testdata/ci-seed.sql |  90 +++++++++++++++++++++
 3 files changed, 262 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/live-tests.yml
 create mode 100644 cli/test/live/testdata/ci-seed.sql
diff --git a/.github/workflows/live-tests.yml b/.github/workflows/live-tests.yml
new file mode 100644
index 0000000..db2e1cd
--- /dev/null
+++ b/.github/workflows/live-tests.yml
@@ -0,0 +1,122 @@
+name: Live tests
+
+# Boots k8s-stack-manager (origin/main, api-only profile) in the same job,
+# applies a SQL fixture, mints an API key, then runs cli/test/live against
+# the booted backend. Catches wire-contract drift between the two repos at
+# PR time — the stub-based unit tests can't see it.
+#
+# Tracks omattsson/stackctl#96.
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+permissions:
+  contents: read
+
+concurrency:
+  group: live-tests-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  live:
+    name: Live integration suite
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    env:
+      ADMIN_USERNAME: admin
+      ADMIN_PASSWORD: ci-admin-password-do-not-reuse
+      DB_PASSWORD: rootpassword
+      DB_NAME: app
+      MYSQL_CONTAINER: app-mysql-dev
+      BACKEND_URL: http://localhost:8081
+    steps:
+      - name: Check out stackctl
+        uses: actions/checkout@v6
+
+      - name: Set up Go
+        uses: actions/setup-go@v6
+        with:
+          go-version-file: cli/go.mod
+          cache-dependency-path: cli/go.sum
+
+      - name: Clone k8s-stack-manager (origin/main)
+        run: |
+          git clone --depth=1 https://github.com/omattsson/k8s-stack-manager /tmp/k8s-sm
+          echo "k8s-sm @ $(git -C /tmp/k8s-sm rev-parse --short HEAD)"
+
+      - name: Boot api-only stack (backend + mysql)
+        working-directory: /tmp/k8s-sm
+        env:
+          ADMIN_USERNAME: ${{ env.ADMIN_USERNAME }}
+          ADMIN_PASSWORD: ${{ env.ADMIN_PASSWORD }}
+          DB_PASSWORD: ${{ env.DB_PASSWORD }}
+        run: |
+          # No --profile flag → only services without profiles run, which is
+          # exactly the api-only set (backend + mysql). Frontend is gated to
+          # the "full" profile.
+          docker compose up -d --wait backend
+
+      - name: Wait for backend /health/live
+        run: |
+          for i in $(seq 1 60); do
+            if curl -fsS "$BACKEND_URL/health/live" >/dev/null 2>&1; then
+              echo "Backend healthy after ${i}s"
+              exit 0
+            fi
+            sleep 1
+          done
+          echo "Backend never became healthy" >&2
+          docker compose -f /tmp/k8s-sm/docker-compose.yml logs backend
+          exit 1
+
+      - name: Apply CI seed fixtures
+        run: |
+          docker exec -i "$MYSQL_CONTAINER" \
+            mysql -u root -p"$DB_PASSWORD" "$DB_NAME" \
+            < cli/test/live/testdata/ci-seed.sql
+
+      - name: Mint API key for live tests
+        id: apikey
+        run: |
+          set -euo pipefail
+          # Login as the env-seeded admin → JWT + admin user id.
+          login=$(curl -fsS -X POST "$BACKEND_URL/api/v1/auth/login" \
+            -H 'Content-Type: application/json' \
+            -d "{\"username\":\"$ADMIN_USERNAME\",\"password\":\"$ADMIN_PASSWORD\"}")
+          jwt=$(echo "$login" | jq -r '.token // .access_token')
+          admin_id=$(echo "$login" | jq -r '.user.id')
+          if [ -z "$jwt" ] || [ "$jwt" = "null" ] || [ -z "$admin_id" ] || [ "$admin_id" = "null" ]; then
+            echo "Login response missing token/user.id" >&2
+            echo "$login" | jq . >&2
+            exit 1
+          fi
+          # Mint a key — the raw key is only returned once.
+          # Backend requires an expiration; 1 day is plenty for a CI run.
+          # Response field is "raw_key" (prefixed sk_<hex>), not "key".
+          mint=$(curl -fsS -X POST "$BACKEND_URL/api/v1/users/$admin_id/api-keys" \
+            -H "Authorization: Bearer $jwt" \
+            -H 'Content-Type: application/json' \
+            -d '{"name":"ci-live-tests","expires_in_days":1}')
+          key=$(echo "$mint" | jq -r '.raw_key')
+          if [ -z "$key" ] || [ "$key" = "null" ]; then
+            echo "API-key mint response missing .raw_key" >&2
+            echo "$mint" | jq . >&2
+            exit 1
+          fi
+          # Mask in logs and export for the test step.
+          echo "::add-mask::$key"
+          echo "key=$key" >> "$GITHUB_OUTPUT"
+
+      - name: Run live suite
+        env:
+          STACKCTL_LIVE_URL: ${{ env.BACKEND_URL }}
+          STACKCTL_LIVE_API_KEY: ${{ steps.apikey.outputs.key }}
+        working-directory: cli
+        run: go test -tags live -count=1 -timeout 10m -v ./test/live/...
+
+      - name: Backend logs on failure
+        if: failure()
+        run: docker compose -f /tmp/k8s-sm/docker-compose.yml logs backend
diff --git a/cli/test/live/doc.go b/cli/test/live/doc.go
index 3334106..ae1b0d1 100644
--- a/cli/test/live/doc.go
+++ b/cli/test/live/doc.go
@@ -1,7 +1,52 @@
-// Package live contains integration tests that run against a live backend.
-// These tests are gated behind the "live" build tag and require environment
-// variables STACKCTL_LIVE_USER and STACKCTL_LIVE_PASS. STACKCTL_LIVE_URL is
-// optional; if unset, the tests default to http://localhost:8081.
+// Package live contains integration tests that run against a live
+// k8s-stack-manager backend. They are gated behind the "live" build tag
+// because they make real HTTP calls and shape-check the response wire
+// contract — the only way to catch field-name drift between stackctl and
+// the backend that stub-based unit tests can't see.
 //
-// Run with: go test -tags live ./test/live/ -v
+// # Environment
+//
+//   - STACKCTL_LIVE_URL — backend base URL (default: http://localhost:8081).
+//   - One of:
+//       STACKCTL_LIVE_API_KEY                    — header-based, no session.
+//       STACKCTL_LIVE_USER + STACKCTL_LIVE_PASS — login flow.
+//   - STACKCTL_LIVE_HEAVY=1 — also run the workload tests in live_test.go
+//     that actually deploy stack instances (~80 GiB of golden-db pull
+//     each, takes ~10 min). Off by default — the per-endpoint *_live_test.go
+//     files cover wire-shape contracts without real workloads.
+//
+// # Running locally
+//
+//	go test -tags live ./test/live/ -v
+//
+// # Reproducing the CI flow
+//
+// .github/workflows/live-tests.yml boots k8s-stack-manager (origin/main,
+// api-only profile) in the same job and runs this suite. To reproduce
+// locally, run the equivalent of those steps against your own checkout:
+//
+//	# 1. Boot the api-only stack (backend + mysql; no frontend).
+//	cd /path/to/k8s-stack-manager
+//	ADMIN_USERNAME=admin ADMIN_PASSWORD=ci-admin-password-do-not-reuse \
+//	    docker compose up -d --wait backend
+//
+//	# 2. Apply the seed fixtures (one cluster, one definition, one
+//	#    published template — so require* helpers don't skip).
+//	docker exec -i app-mysql-dev mysql -u root -prootpassword app \
+//	    < /path/to/stackctl/cli/test/live/testdata/ci-seed.sql
+//
+//	# 3. Mint an API key (login → mint), then run the suite.
+//	jwt=$(curl -fsS -X POST http://localhost:8081/api/v1/auth/login \
+//	    -H 'Content-Type: application/json' \
+//	    -d '{"username":"admin","password":"ci-admin-password-do-not-reuse"}' \
+//	    | jq -r '.token // .access_token')
+//	admin_id=$(curl -fsS http://localhost:8081/api/v1/auth/me \
+//	    -H "Authorization: Bearer $jwt" | jq -r '.id')
+//	key=$(curl -fsS -X POST "http://localhost:8081/api/v1/users/$admin_id/api-keys" \
+//	    -H "Authorization: Bearer $jwt" -H 'Content-Type: application/json' \
+//	    -d '{"name":"local-live-tests","expires_in_days":1}' | jq -r '.raw_key')
+//
+//	STACKCTL_LIVE_URL=http://localhost:8081 \
+//	STACKCTL_LIVE_API_KEY="$key" \
+//	    go test -tags live -count=1 ./test/live/...
 package live
diff --git a/cli/test/live/testdata/ci-seed.sql b/cli/test/live/testdata/ci-seed.sql
new file mode 100644
index 0000000..9404109
--- /dev/null
+++ b/cli/test/live/testdata/ci-seed.sql
@@ -0,0 +1,90 @@
+-- CI fixture seed for the live integration suite.
+--
+-- The live tests gate themselves behind require{Cluster,Definition,Template}
+-- helpers that skip when the backend has nothing to look at. A fresh CI
+-- backend would skip ~70% of the suite — defeating the point of running it.
+--
+-- This script inserts the minimum metadata needed so every test reaches its
+-- assertions:
+--   - 1 cluster (no real connectivity)
+--   - 1 stack_definition + 1 chart_config
+--   - 1 stack_template + 1 template_chart_config
+--
+-- Owner is the env-seeded admin user (created by the backend's
+-- EnsureAdminUser at boot). We look it up at apply time so we don't need to
+-- substitute the non-deterministic UUID into the script.
+
+SET @admin_id = (SELECT id FROM users WHERE username = 'admin' LIMIT 1);
+
+-- Cluster — health_status stays "unreachable" since api-only mode has no
+-- real kubeconfig. test-connection / nodes endpoints will fail; tests
+-- handle that as expected.
+INSERT INTO clusters (
+    id, name, description,
+    api_server_url, kubeconfig_data, kubeconfig_path,
+    region, health_status,
+    registry_url, registry_username, registry_password, image_pull_secret_name,
+    max_namespaces, max_instances_per_user,
+    is_default, use_in_cluster,
+    created_at, updated_at
+) VALUES (
+    '00000000-0000-0000-0000-000000000001', 'ci-stub', 'CI fixture cluster — no real connectivity',
+    '', '', '',
+    '', 'unreachable',
+    '', '', '', '',
+    0, 0,
+    1, 0,
+    NOW(), NOW()
+);
+
+-- Stack definition (owner = admin) with a single noop chart.
+INSERT INTO stack_definitions (
+    id, name, description, owner_id, default_branch, created_at, updated_at
+) VALUES (
+    '00000000-0000-0000-0000-000000000002',
+    'ci-stub-definition',
+    'CI fixture definition — wire-shape only, never deployed',
+    @admin_id, 'master', NOW(), NOW()
+);
+
+INSERT INTO chart_configs (
+    id, stack_definition_id,
+    chart_name, repository_url, source_repo_url, build_pipeline_id,
+    chart_path, chart_version, default_values, deploy_order,
+    created_at
+) VALUES (
+    '00000000-0000-0000-0000-000000000003',
+    '00000000-0000-0000-0000-000000000002',
+    'ci-noop', '', '', '',
+    '', '0.1.0', '', 0,
+    NOW()
+);
+
+-- Published stack template so requireTemplate and TestLiveTemplate_ListAndGet
+-- find at least one row. Inline chart so the chart-roundtrip tests have
+-- something to read.
+INSERT INTO stack_templates (
+    id, name, description, category, version, owner_id,
+    default_branch, is_published, created_at, updated_at
+) VALUES (
+    '00000000-0000-0000-0000-000000000004',
+    'ci-stub-template',
+    'CI fixture template — published so list filter sees it',
+    '', '1.0.0', @admin_id,
+    'master', 1, NOW(), NOW()
+);
+
+INSERT INTO template_chart_configs (
+    id, stack_template_id,
+    chart_name, repository_url, source_repo_url, build_pipeline_id,
+    chart_path, chart_version, default_values, locked_values,
+    deploy_order, required,
+    created_at
+) VALUES (
+    '00000000-0000-0000-0000-000000000005',
+    '00000000-0000-0000-0000-000000000004',
+    'ci-noop', '', '', '',
+    '', '0.1.0', '', '',
+    0, 0,
+    NOW()
+);

From e0fab5ef10fbde841f0a5a71a55fe5b0318d31ab Mon Sep 17 00:00:00 2001
From: Olof Mattsson <olof.mattsson@klaravik.se>
Date: Thu, 28 May 2026 13:25:55 +0200
Subject: [PATCH 2/7] ci(live): reset upstream dev bind-mount so production
 image starts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

k8s-stack-manager's docker-compose.yml bind-mounts ./backend:/app for
hot-reload during dev. With target: production, that overlays the
image's baked-in /app/main binary with the host source tree, which
has no compiled binary — container exits immediately with
"stat ./main: no such file or directory".

Ships testdata/docker-compose.ci.yml as a compose override that
resets the volumes list (`!reset` tag, Docker Compose v2.18+), and wires
both -f files into every compose call in the workflow.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/live-tests.yml             | 20 ++++++++++++++++----
 cli/test/live/testdata/docker-compose.ci.yml | 11 +++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)
 create mode 100644 cli/test/live/testdata/docker-compose.ci.yml

diff --git a/.github/workflows/live-tests.yml b/.github/workflows/live-tests.yml
index db2e1cd..303061f 100644
--- a/.github/workflows/live-tests.yml
+++ b/.github/workflows/live-tests.yml
@@ -48,7 +48,6 @@ jobs:
           echo "k8s-sm @ $(git -C /tmp/k8s-sm rev-parse --short HEAD)"
 
       - name: Boot api-only stack (backend + mysql)
-        working-directory: /tmp/k8s-sm
         env:
           ADMIN_USERNAME: ${{ env.ADMIN_USERNAME }}
           ADMIN_PASSWORD: ${{ env.ADMIN_PASSWORD }}
@@ -57,7 +56,13 @@ jobs:
           # No --profile flag → only services without profiles run, which is
           # exactly the api-only set (backend + mysql). Frontend is gated to
           # the "full" profile.
-          docker compose up -d --wait backend
+          #
+          # The override file resets the dev bind-mount that would otherwise
+          # hide the production image's baked-in ./main binary.
+          docker compose \
+            -f /tmp/k8s-sm/docker-compose.yml \
+            -f "$GITHUB_WORKSPACE/cli/test/live/testdata/docker-compose.ci.yml" \
+            up -d --wait backend
 
       - name: Wait for backend /health/live
         run: |
@@ -69,7 +74,10 @@ jobs:
             sleep 1
           done
           echo "Backend never became healthy" >&2
-          docker compose -f /tmp/k8s-sm/docker-compose.yml logs backend
+          docker compose \
+            -f /tmp/k8s-sm/docker-compose.yml \
+            -f "$GITHUB_WORKSPACE/cli/test/live/testdata/docker-compose.ci.yml" \
+            logs backend
           exit 1
 
       - name: Apply CI seed fixtures
@@ -119,4 +127,8 @@ jobs:
 
       - name: Backend logs on failure
         if: failure()
-        run: docker compose -f /tmp/k8s-sm/docker-compose.yml logs backend
+        run: |
+          docker compose \
+            -f /tmp/k8s-sm/docker-compose.yml \
+            -f "$GITHUB_WORKSPACE/cli/test/live/testdata/docker-compose.ci.yml" \
+            logs backend
diff --git a/cli/test/live/testdata/docker-compose.ci.yml b/cli/test/live/testdata/docker-compose.ci.yml
new file mode 100644
index 0000000..e237809
--- /dev/null
+++ b/cli/test/live/testdata/docker-compose.ci.yml
@@ -0,0 +1,11 @@
+# CI override for k8s-stack-manager's docker-compose.yml.
+#
+# The upstream compose bind-mounts ./backend:/app for dev hot-reload, but
+# that overlays the production image's baked-in /app/main binary with the
+# host source tree (which has no compiled binary), so the container can't
+# start. We're not editing source at runtime in CI, so reset the volumes.
+#
+# `!reset []` replaces the list instead of merging (Compose ≥1.28).
+services:
+  backend:
+    volumes: !reset []

From a38daf1e1362432b1c812abaf09545a1d56d3705 Mon Sep 17 00:00:00 2001
From: Olof Mattsson <olof.mattsson@klaravik.se>
Date: Thu, 28 May 2026 13:35:19 +0200
Subject: [PATCH 3/7] ci(live): redact JWT and raw API key from failure-path
 logs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If login or api-key mint shapes drift, the workflow dumps the raw
response to stderr for debugging. Both responses carry credentials
(JWT in login, raw_key in mint) that shouldn't end up in CI logs even
on the failure branch — masked outputs still surface the field names
and status, which is the actual diagnostic value.

Addresses CodeRabbit on PR #99. The sister findings about pinning
actions/checkout@v6 and actions/setup-go@v6 to commit SHAs and adding
persist-credentials: false are deliberately skipped — none of the
four existing workflows in this repo follow that policy, and the
right place to harden is a dedicated repo-wide change, not a
one-workflow special case.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/live-tests.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/live-tests.yml b/.github/workflows/live-tests.yml
index 303061f..d749469 100644
--- a/.github/workflows/live-tests.yml
+++ b/.github/workflows/live-tests.yml
@@ -98,7 +98,8 @@ jobs:
           admin_id=$(echo "$login" | jq -r '.user.id')
           if [ -z "$jwt" ] || [ "$jwt" = "null" ] || [ -z "$admin_id" ] || [ "$admin_id" = "null" ]; then
             echo "Login response missing token/user.id" >&2
-            echo "$login" | jq . >&2
+            # Drop the JWT before logging so a failed boot can't leak it.
+            echo "$login" | jq 'del(.token, .access_token)' >&2
             exit 1
           fi
           # Mint a key — the raw key is only returned once.
@@ -111,7 +112,9 @@ jobs:
           key=$(echo "$mint" | jq -r '.raw_key')
           if [ -z "$key" ] || [ "$key" = "null" ]; then
             echo "API-key mint response missing .raw_key" >&2
-            echo "$mint" | jq . >&2
+            # Drop the raw key before logging — it's only returned once but
+            # stays a valid credential until it expires.
+            echo "$mint" | jq 'del(.raw_key)' >&2
             exit 1
           fi
           # Mask in logs and export for the test step.

From f96fc5f84e1b56d7fadd1efd30ecb7c89a6832be Mon Sep 17 00:00:00 2001
From: Olof Mattsson <olof.mattsson@klaravik.se>
Date: Thu, 28 May 2026 13:41:51 +0200
Subject: [PATCH 4/7] ci(live): disable container-side healthcheck (alpine
 image has no curl)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upstream backend healthcheck runs `curl -f http://localhost:8081/health/live`
inside the container, but the production alpine image only ships
./main + helm + ca-certificates — no curl. The healthcheck fails
forever and `docker compose up --wait` hits its timeout.

We already poll /health/live from the host in the next step, so the
in-container check is redundant. Override with `test: ["NONE"]` and
keep the host-side gate.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 cli/test/live/testdata/docker-compose.ci.yml | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/cli/test/live/testdata/docker-compose.ci.yml b/cli/test/live/testdata/docker-compose.ci.yml
index e237809..b8b3140 100644
--- a/cli/test/live/testdata/docker-compose.ci.yml
+++ b/cli/test/live/testdata/docker-compose.ci.yml
@@ -1,11 +1,20 @@
 # CI override for k8s-stack-manager's docker-compose.yml.
 #
-# The upstream compose bind-mounts ./backend:/app for dev hot-reload, but
-# that overlays the production image's baked-in /app/main binary with the
-# host source tree (which has no compiled binary), so the container can't
-# start. We're not editing source at runtime in CI, so reset the volumes.
+# Two upstream defaults don't fit `target: production` + headless CI:
 #
-# `!reset []` replaces the list instead of merging (Compose ≥1.28).
+# 1. `./backend:/app` bind-mount is for dev hot-reload. With the production
+#    target it overlays the baked-in /app/main binary with the host source
+#    tree (which has no compiled binary), so the container exits with
+#    "stat ./main: no such file or directory". `!reset []` replaces the
+#    list instead of merging (Docker Compose v2.18+).
+#
+# 2. The healthcheck runs `curl -f http://localhost:8081/health/live`
+#    inside the container, but the alpine production image doesn't have
+#    curl installed. The check fails forever and `docker compose up
+#    --wait` times out. Disable the container-side healthcheck — we run
+#    our own poll against /health/live from the host in the next step.
 services:
   backend:
     volumes: !reset []
+    healthcheck:
+      test: ["NONE"]

From 259341d1c29041fd827d35cc9d8d6389d680a05d Mon Sep 17 00:00:00 2001
From: Olof Mattsson <olof.mattsson@klaravik.se>
Date: Thu, 28 May 2026 13:46:34 +0200
Subject: [PATCH 5/7] ci(live): use wget for healthcheck (alpine image has no
 curl)

Last attempt set `test: ["NONE"]` to disable the broken curl-based
healthcheck, but `docker compose up --wait` errors out with "no
healthcheck configured" instead of treating "running" as ready.

Alpine's base image ships busybox `wget` (curl is not installed), so
override the healthcheck command to use that. Shorter interval +
retries chosen so a healthy backend is flagged ready within ~10s
rather than upstream's 75s start_period.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 cli/test/live/testdata/docker-compose.ci.yml | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/cli/test/live/testdata/docker-compose.ci.yml b/cli/test/live/testdata/docker-compose.ci.yml
index b8b3140..f9dc287 100644
--- a/cli/test/live/testdata/docker-compose.ci.yml
+++ b/cli/test/live/testdata/docker-compose.ci.yml
@@ -10,11 +10,16 @@
 #
 # 2. The healthcheck runs `curl -f http://localhost:8081/health/live`
 #    inside the container, but the alpine production image doesn't have
-#    curl installed. The check fails forever and `docker compose up
-#    --wait` times out. Disable the container-side healthcheck — we run
-#    our own poll against /health/live from the host in the next step.
+#    curl installed. Replace with busybox `wget --spider`, which alpine
+#    ships in the base image. (`test: ["NONE"]` would disable the check
+#    but then `docker compose up --wait` errors with "no healthcheck
+#    configured" instead of treating "running" as ready.)
 services:
   backend:
     volumes: !reset []
     healthcheck:
-      test: ["NONE"]
+      test: ["CMD", "wget", "-q", "--spider", "http://localhost:8081/health/live"]
+      interval: 5s
+      timeout: 5s
+      retries: 12
+      start_period: 30s

From 9c98f0e2ec7ce837f3dd9bf95209f9b809ed7b7d Mon Sep 17 00:00:00 2001
From: Olof Mattsson <olof.mattsson@klaravik.se>
Date: Thu, 28 May 2026 14:12:34 +0200
Subject: [PATCH 6/7] test(live): skip cluster health/nodes when backend can't
 reach cluster
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CI live-tests workflow runs against a stub cluster (no kubeconfig,
no real kube-apiserver). The backend correctly returns 500 with
"Failed to connect to cluster" for diagnostic endpoints in that
state, so two subtests of TestLiveCluster_HealthAndTest were failing
in CI while passing locally against rancher-desktop.

Two changes:

1. The `health` subtest now skips on the same "cluster unreachable"
   conditions the `nodes` subtest already handles (added in #98 via
   CodeRabbit's tightening). Symmetric with `nodes` so a wire-shape
   regression on either endpoint still surfaces as a failure.

2. The substring set is extracted into isClusterUnreachable() and
   widened to include "failed to connect to cluster" — the actual
   literal in the backend's 500 response when kubeconfig is empty or
   the cluster is down.

Verified against rancher-desktop k8s-sm: all three subtests still
pass (real cluster, real kubeconfig). The CI api-only path will now
skip the two diagnostic subtests cleanly instead of failing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 cli/test/live/cluster_live_test.go | 35 +++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/cli/test/live/cluster_live_test.go b/cli/test/live/cluster_live_test.go
index 1002e51..e258b53 100644
--- a/cli/test/live/cluster_live_test.go
+++ b/cli/test/live/cluster_live_test.go
@@ -69,7 +69,17 @@ func TestLiveCluster_HealthAndTest(t *testing.T) {
 
 	t.Run("health", func(t *testing.T) {
 		health, err := c.GetClusterHealth(cluster.ID)
-		require.NoError(t, err, "get cluster health")
+		if err != nil {
+			// Same skip rule as the "nodes" subtest below: when the
+			// backend can't talk to the cluster, the health summary
+			// 500s. That's not a wire-shape regression, just an
+			// environment with no kubeconfig (e.g. the CI api-only
+			// stack against a stub cluster).
+			if isClusterUnreachable(err) {
+				t.Skipf("health endpoint unavailable (cluster not reachable): %v", err)
+			}
+			require.NoError(t, err, "get cluster health")
+		}
 		// Health summary should at least populate node count for any
 		// non-empty cluster. Zero would mean the wire decode lost data.
 		assert.GreaterOrEqual(t, health.NodeCount, 0, "node_count must decode (zero is fine for empty clusters)")
@@ -91,8 +101,7 @@ func TestLiveCluster_HealthAndTest(t *testing.T) {
 		if err != nil {
 			// Skip only when the backend can't reach the cluster — any
 			// other error (bad wire shape, 5xx) should fail the test.
-			msg := strings.ToLower(err.Error())
-			if strings.Contains(msg, "not reachable") || strings.Contains(msg, "unavailable") || strings.Contains(msg, "connection refused") {
+			if isClusterUnreachable(err) {
 				t.Skipf("nodes endpoint unavailable (cluster not reachable): %v", err)
 			}
 			require.NoError(t, err, "get cluster nodes")
@@ -101,3 +110,23 @@ func TestLiveCluster_HealthAndTest(t *testing.T) {
 		assert.NotNil(t, nodes)
 	})
 }
+
+// isClusterUnreachable reports whether err's message contains one of
+// the substrings listed below, matched case-insensitively. A match is
+// treated as "the backend could not reach the cluster" rather than a
+// wire-shape or auth issue, so the diagnostic-endpoint tests can skip
+// cleanly in CI environments running against a stub cluster.
+func isClusterUnreachable(err error) bool {
+	msg := strings.ToLower(err.Error())
+	for _, needle := range []string{
+		"not reachable",
+		"unavailable",
+		"connection refused",
+		"failed to connect to cluster", // backend's literal 500 message
+	} {
+		if strings.Contains(msg, needle) {
+			return true
+		}
+	}
+	return false
+}

From 8a34986a84da125c537845a1793383278b4c970e Mon Sep 17 00:00:00 2001
From: Olof Mattsson <olof.mattsson@klaravik.se>
Date: Thu, 28 May 2026 14:20:28 +0200
Subject: [PATCH 7/7] test(live): tighten cluster-unreachable substring set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop "unavailable" — too generic, could mask a legitimate 503 from a
wire-shape regression. Add "unreachable" instead, which matches the
backend's models.ClusterUnreachable status constant and is the more
accurate token for a cluster-down condition.

Addresses CodeRabbit on PR #99. The companion suggestion to convert
TestLiveCluster_HealthAndTest to a table-driven + t.Parallel() form
is deliberately skipped: none of the four sibling *_live_test.go
files in this package follow that pattern (the convention applies to
unit tests; live tests against a shared backend run serially by
design), and a mass-refactor of the live suite is out of scope for a
workflow PR.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 cli/test/live/cluster_live_test.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cli/test/live/cluster_live_test.go b/cli/test/live/cluster_live_test.go
index e258b53..5da3276 100644
--- a/cli/test/live/cluster_live_test.go
+++ b/cli/test/live/cluster_live_test.go
@@ -119,8 +119,11 @@ func TestLiveCluster_HealthAndTest(t *testing.T) {
 func isClusterUnreachable(err error) bool {
 	msg := strings.ToLower(err.Error())
 	for _, needle := range []string{
+		// Backend uses ClusterUnreachable = "unreachable" as its
+		// status constant; keep both phrasings since the error string
+		// can render either way.
+		"unreachable",
 		"not reachable",
-		"unavailable",
 		"connection refused",
 		"failed to connect to cluster", // backend's literal 500 message
 	} {