diff --git a/.github/workflows/live-tests.yml b/.github/workflows/live-tests.yml new file mode 100644 index 0000000..d749469 --- /dev/null +++ b/.github/workflows/live-tests.yml @@ -0,0 +1,137 @@ +name: Live tests + +# Boots k8s-stack-manager (origin/main, api-only profile) in the same job, +# applies a SQL fixture, mints an API key, then runs cli/test/live against +# the booted backend. Catches wire-contract drift between the two repos at +# PR time — the stub-based unit tests can't see it. +# +# Tracks omattsson/stackctl#96. + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +concurrency: + group: live-tests-${{ github.ref }} + cancel-in-progress: true + +jobs: + live: + name: Live integration suite + runs-on: ubuntu-latest + timeout-minutes: 20 + env: + ADMIN_USERNAME: admin + ADMIN_PASSWORD: ci-admin-password-do-not-reuse + DB_PASSWORD: rootpassword + DB_NAME: app + MYSQL_CONTAINER: app-mysql-dev + BACKEND_URL: http://localhost:8081 + steps: + - name: Check out stackctl + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version-file: cli/go.mod + cache-dependency-path: cli/go.sum + + - name: Clone k8s-stack-manager (origin/main) + run: | + git clone --depth=1 https://github.com/omattsson/k8s-stack-manager /tmp/k8s-sm + echo "k8s-sm @ $(git -C /tmp/k8s-sm rev-parse --short HEAD)" + + - name: Boot api-only stack (backend + mysql) + env: + ADMIN_USERNAME: ${{ env.ADMIN_USERNAME }} + ADMIN_PASSWORD: ${{ env.ADMIN_PASSWORD }} + DB_PASSWORD: ${{ env.DB_PASSWORD }} + run: | + # No --profile flag → only services without profiles run, which is + # exactly the api-only set (backend + mysql). Frontend is gated to + # the "full" profile. + # + # The override file resets the dev bind-mount that would otherwise + # hide the production image's baked-in ./main binary. 
+ docker compose \ + -f /tmp/k8s-sm/docker-compose.yml \ + -f "$GITHUB_WORKSPACE/cli/test/live/testdata/docker-compose.ci.yml" \ + up -d --wait backend + + - name: Wait for backend /health/live + run: | + for i in $(seq 1 60); do + if curl -fsS "$BACKEND_URL/health/live" >/dev/null 2>&1; then + echo "Backend healthy after ${i}s" + exit 0 + fi + sleep 1 + done + echo "Backend never became healthy" >&2 + docker compose \ + -f /tmp/k8s-sm/docker-compose.yml \ + -f "$GITHUB_WORKSPACE/cli/test/live/testdata/docker-compose.ci.yml" \ + logs backend + exit 1 + + - name: Apply CI seed fixtures + run: | + docker exec -i "$MYSQL_CONTAINER" \ + mysql -u root -p"$DB_PASSWORD" "$DB_NAME" \ + < cli/test/live/testdata/ci-seed.sql + + - name: Mint API key for live tests + id: apikey + run: | + set -euo pipefail + # Login as the env-seeded admin → JWT + admin user id. + login=$(curl -fsS -X POST "$BACKEND_URL/api/v1/auth/login" \ + -H 'Content-Type: application/json' \ + -d "{\"username\":\"$ADMIN_USERNAME\",\"password\":\"$ADMIN_PASSWORD\"}") + jwt=$(echo "$login" | jq -r '.token // .access_token') + admin_id=$(echo "$login" | jq -r '.user.id') + if [ -z "$jwt" ] || [ "$jwt" = "null" ] || [ -z "$admin_id" ] || [ "$admin_id" = "null" ]; then + echo "Login response missing token/user.id" >&2 + # Drop the JWT before logging so a failed boot can't leak it. + echo "$login" | jq 'del(.token, .access_token)' >&2 + exit 1 + fi + # Mint a key — the raw key is only returned once. + # Backend requires an expiration; 1 day is plenty for a CI run. + # Response field is "raw_key" (prefixed sk_), not "key". 
+ mint=$(curl -fsS -X POST "$BACKEND_URL/api/v1/users/$admin_id/api-keys" \ + -H "Authorization: Bearer $jwt" \ + -H 'Content-Type: application/json' \ + -d '{"name":"ci-live-tests","expires_in_days":1}') + key=$(echo "$mint" | jq -r '.raw_key') + if [ -z "$key" ] || [ "$key" = "null" ]; then + echo "API-key mint response missing .raw_key" >&2 + # Drop the raw key before logging — it's only returned once but + # still a credential while the job is alive. + echo "$mint" | jq 'del(.raw_key)' >&2 + exit 1 + fi + # Mask in logs and export for the test step. + echo "::add-mask::$key" + echo "key=$key" >> "$GITHUB_OUTPUT" + + - name: Run live suite + env: + STACKCTL_LIVE_URL: ${{ env.BACKEND_URL }} + STACKCTL_LIVE_API_KEY: ${{ steps.apikey.outputs.key }} + working-directory: cli + run: go test -tags live -count=1 -timeout 10m -v ./test/live/... + + - name: Backend logs on failure + if: failure() + run: | + docker compose \ + -f /tmp/k8s-sm/docker-compose.yml \ + -f "$GITHUB_WORKSPACE/cli/test/live/testdata/docker-compose.ci.yml" \ + logs backend diff --git a/cli/test/live/cluster_live_test.go b/cli/test/live/cluster_live_test.go index 1002e51..5da3276 100644 --- a/cli/test/live/cluster_live_test.go +++ b/cli/test/live/cluster_live_test.go @@ -69,7 +69,17 @@ func TestLiveCluster_HealthAndTest(t *testing.T) { t.Run("health", func(t *testing.T) { health, err := c.GetClusterHealth(cluster.ID) - require.NoError(t, err, "get cluster health") + if err != nil { + // Same skip rule as the "nodes" subtest below: when the + // backend can't talk to the cluster, the health summary + // 500s. That's not a wire-shape regression, just an + // environment with no kubeconfig (e.g. the CI api-only + // stack against a stub cluster). + if isClusterUnreachable(err) { + t.Skipf("health endpoint unavailable (cluster not reachable): %v", err) + } + require.NoError(t, err, "get cluster health") + } // Health summary should at least populate node count for any // non-empty cluster. 
Zero would mean the wire decode lost data. assert.GreaterOrEqual(t, health.NodeCount, 0, "node_count must decode (zero is fine for empty clusters)") @@ -91,8 +101,7 @@ func TestLiveCluster_HealthAndTest(t *testing.T) { if err != nil { // Skip only when the backend can't reach the cluster — any // other error (bad wire shape, 5xx) should fail the test. - msg := strings.ToLower(err.Error()) - if strings.Contains(msg, "not reachable") || strings.Contains(msg, "unavailable") || strings.Contains(msg, "connection refused") { + if isClusterUnreachable(err) { t.Skipf("nodes endpoint unavailable (cluster not reachable): %v", err) } require.NoError(t, err, "get cluster nodes") @@ -101,3 +110,26 @@ func TestLiveCluster_HealthAndTest(t *testing.T) { assert.NotNil(t, nodes) }) } + +// isClusterUnreachable returns true when the backend's error indicates +// the cluster itself isn't reachable (no kubeconfig, kube-apiserver +// down, DNS failure) rather than a wire-shape or auth issue. Used by +// the diagnostic-endpoint tests to skip cleanly in CI environments +// running against a stub cluster. +func isClusterUnreachable(err error) bool { + msg := strings.ToLower(err.Error()) + for _, needle := range []string{ + // Backend uses ClusterUnreachable = "unreachable" as its + // status constant; keep both spacings since the error string + // can render either way. + "unreachable", + "not reachable", + "connection refused", + "failed to connect to cluster", // backend's literal 500 message + } { + if strings.Contains(msg, needle) { + return true + } + } + return false +} diff --git a/cli/test/live/doc.go b/cli/test/live/doc.go index 3334106..ae1b0d1 100644 --- a/cli/test/live/doc.go +++ b/cli/test/live/doc.go @@ -1,7 +1,52 @@ -// Package live contains integration tests that run against a live backend. -// These tests are gated behind the "live" build tag and require environment -// variables STACKCTL_LIVE_USER and STACKCTL_LIVE_PASS. 
STACKCTL_LIVE_URL is -// optional; if unset, the tests default to http://localhost:8081. +// Package live contains integration tests that run against a live +// k8s-stack-manager backend. They are gated behind the "live" build tag +// because they make real HTTP calls and shape-check the response wire +// contract — the only way to catch field-name drift between stackctl and +// the backend that stub-based unit tests can't see. // -// Run with: go test -tags live ./test/live/ -v +// # Environment +// +// - STACKCTL_LIVE_URL — backend base URL (default: http://localhost:8081). +// - One of: +// STACKCTL_LIVE_API_KEY — header-based, no session. +// STACKCTL_LIVE_USER + STACKCTL_LIVE_PASS — login flow. +// - STACKCTL_LIVE_HEAVY=1 — also run the workload tests in live_test.go +// that actually deploy stack instances (~80 GiB of golden-db pull +// each, takes ~10 min). Off by default — the per-endpoint *_live_test.go +// files cover wire-shape contracts without real workloads. +// +// # Running locally +// +// go test -tags live ./test/live/ -v +// +// # Reproducing the CI flow +// +// .github/workflows/live-tests.yml boots k8s-stack-manager (origin/main, +// api-only profile) in the same job and runs this suite. To reproduce +// locally, run the equivalent of those steps against your own checkout: +// +// # 1. Boot the api-only stack (backend + mysql; no frontend). +// cd /path/to/k8s-stack-manager +// ADMIN_USERNAME=admin ADMIN_PASSWORD=ci-admin-password-do-not-reuse \ +// docker compose up -d --wait backend +// +// # 2. Apply the seed fixtures (one cluster, one definition, one +// # published template — so require* helpers don't skip). +// docker exec -i app-mysql-dev mysql -u root -prootpassword app \ +// < /path/to/stackctl/cli/test/live/testdata/ci-seed.sql +// +// # 3. Mint an API key (login → mint), then run the suite. 
+// jwt=$(curl -fsS -X POST http://localhost:8081/api/v1/auth/login \ +// -H 'Content-Type: application/json' \ +// -d '{"username":"admin","password":"ci-admin-password-do-not-reuse"}' \ +// | jq -r '.token // .access_token') +// admin_id=$(curl -fsS http://localhost:8081/api/v1/auth/me \ +// -H "Authorization: Bearer $jwt" | jq -r '.id') +// key=$(curl -fsS -X POST "http://localhost:8081/api/v1/users/$admin_id/api-keys" \ +// -H "Authorization: Bearer $jwt" -H 'Content-Type: application/json' \ +// -d '{"name":"local-live-tests","expires_in_days":1}' | jq -r '.raw_key') +// +// STACKCTL_LIVE_URL=http://localhost:8081 \ +// STACKCTL_LIVE_API_KEY="$key" \ +// go test -tags live -count=1 ./test/live/... package live diff --git a/cli/test/live/testdata/ci-seed.sql b/cli/test/live/testdata/ci-seed.sql new file mode 100644 index 0000000..9404109 --- /dev/null +++ b/cli/test/live/testdata/ci-seed.sql @@ -0,0 +1,90 @@ +-- CI fixture seed for the live integration suite. +-- +-- The live tests gate themselves behind require{Cluster,Definition,Template} +-- helpers that skip when the backend has nothing to look at. A fresh CI +-- backend would skip ~70% of the suite — defeating the point of running it. +-- +-- This script inserts the minimum metadata needed so every test reaches its +-- assertions: +-- - 1 cluster (no real connectivity) +-- - 1 stack_definition + 1 chart_config +-- - 1 stack_template + 1 template_chart_config +-- +-- Owner is the env-seeded admin user (created by the backend's +-- EnsureAdminUser at boot). We look it up at apply time so we don't need to +-- substitute the non-deterministic UUID into the script. + +SET @admin_id = (SELECT id FROM users WHERE username = 'admin' LIMIT 1); + +-- Cluster — health_status stays "unreachable" since api-only mode has no +-- real kubeconfig. test-connection / nodes endpoints will fail; tests +-- handle that as expected. 
+INSERT INTO clusters ( + id, name, description, + api_server_url, kubeconfig_data, kubeconfig_path, + region, health_status, + registry_url, registry_username, registry_password, image_pull_secret_name, + max_namespaces, max_instances_per_user, + is_default, use_in_cluster, + created_at, updated_at +) VALUES ( + '00000000-0000-0000-0000-000000000001', 'ci-stub', 'CI fixture cluster — no real connectivity', + '', '', '', + '', 'unreachable', + '', '', '', '', + 0, 0, + 1, 0, + NOW(), NOW() +); + +-- Stack definition (owner = admin) with a single noop chart. +INSERT INTO stack_definitions ( + id, name, description, owner_id, default_branch, created_at, updated_at +) VALUES ( + '00000000-0000-0000-0000-000000000002', + 'ci-stub-definition', + 'CI fixture definition — wire-shape only, never deployed', + @admin_id, 'master', NOW(), NOW() +); + +INSERT INTO chart_configs ( + id, stack_definition_id, + chart_name, repository_url, source_repo_url, build_pipeline_id, + chart_path, chart_version, default_values, deploy_order, + created_at +) VALUES ( + '00000000-0000-0000-0000-000000000003', + '00000000-0000-0000-0000-000000000002', + 'ci-noop', '', '', '', + '', '0.1.0', '', 0, + NOW() +); + +-- Published stack template so requireTemplate and TestLiveTemplate_ListAndGet +-- find at least one row. Inline chart so the chart-roundtrip tests have +-- something to read. 
+INSERT INTO stack_templates ( + id, name, description, category, version, owner_id, + default_branch, is_published, created_at, updated_at +) VALUES ( + '00000000-0000-0000-0000-000000000004', + 'ci-stub-template', + 'CI fixture template — published so list filter sees it', + '', '1.0.0', @admin_id, + 'master', 1, NOW(), NOW() +); + +INSERT INTO template_chart_configs ( + id, stack_template_id, + chart_name, repository_url, source_repo_url, build_pipeline_id, + chart_path, chart_version, default_values, locked_values, + deploy_order, required, + created_at +) VALUES ( + '00000000-0000-0000-0000-000000000005', + '00000000-0000-0000-0000-000000000004', + 'ci-noop', '', '', '', + '', '0.1.0', '', '', + 0, 0, + NOW() +); diff --git a/cli/test/live/testdata/docker-compose.ci.yml b/cli/test/live/testdata/docker-compose.ci.yml new file mode 100644 index 0000000..f9dc287 --- /dev/null +++ b/cli/test/live/testdata/docker-compose.ci.yml @@ -0,0 +1,25 @@ +# CI override for k8s-stack-manager's docker-compose.yml. +# +# Two upstream defaults don't fit `target: production` + headless CI: +# +# 1. `./backend:/app` bind-mount is for dev hot-reload. With the production +# target it overlays the baked-in /app/main binary with the host source +# tree (which has no compiled binary), so the container exits with +# "stat ./main: no such file or directory". `!reset []` clears the +# inherited list instead of merging (Docker Compose v2.18+; v1 has no `!reset`). +# +# 2. The healthcheck runs `curl -f http://localhost:8081/health/live` +# inside the container, but the alpine production image doesn't have +# curl installed. Replace with busybox `wget --spider`, which alpine +# ships in the base image. (`test: ["NONE"]` would disable the check +# but then `docker compose up --wait` errors with "no healthcheck +# configured" instead of treating "running" as ready.) 
+services: + backend: + volumes: !reset [] + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8081/health/live"] + interval: 5s + timeout: 5s + retries: 12 + start_period: 30s