Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 207 additions & 0 deletions bench/pilot-benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#!/usr/bin/env bash
# Gate pilot A/B benchmark — runs multi-task projects with and without pilot.
# Usage: ./bench/pilot-benchmark.sh [project_name]
#
# Runs each project TWICE:
# 1. pilot=true (gate pilot enabled — default)
# 2. pilot=false (fallback to replan — baseline)
#
# Compares: pass rate, cost, retry success, pilot.log decisions.
# Results: /tmp/otto-pilot-benchmark/

set -euo pipefail

# Locate the repository root relative to this script's own location.
script_dir="$(cd "$(dirname "$0")" && pwd)"
repo_root="$(cd "$script_dir/.." && pwd)"

PROJECTS_DIR="$repo_root/bench/pressure/projects"
RESULTS_DIR="/tmp/otto-pilot-benchmark"
TIMESTAMP="$(date +%Y-%m-%d-%H%M%S)"
RUN_DIR="$RESULTS_DIR/$TIMESTAMP"

# Resolve the otto binary: an explicit $OTTO_BIN wins, then the repo's
# virtualenv copy, then whatever is on PATH.
if [[ -z "${OTTO_BIN:-}" ]]; then
  if [[ -x "$repo_root/.venv/bin/otto" ]]; then
    OTTO_BIN="$repo_root/.venv/bin/otto"
  else
    OTTO_BIN="otto"
  fi
fi
# Absolutize a path-like OTTO_BIN so it still works after cd into work dirs.
if [[ "$OTTO_BIN" == */* ]]; then
  OTTO_BIN="$(cd "$(dirname "$OTTO_BIN")" && pwd)/$(basename "$OTTO_BIN")"
fi

# Multi-task projects only — the pilot only activates when a batch fails.
default_projects=(
  edge-conflicting-tasks
  multi-blog-engine
  multi-expense-tracker
)

# An optional positional argument narrows the run to a single project.
requested="${1:-}"
if [[ -n "$requested" ]]; then
  PROJECT_NAMES=("$requested")
else
  PROJECT_NAMES=("${default_projects[@]}")
fi

mkdir -p "$RUN_DIR"

echo "============================================"
echo " Gate Pilot A/B Benchmark — $TIMESTAMP"
echo " Projects: ${PROJECT_NAMES[*]}"
echo " Results: $RUN_DIR"
echo "============================================"
echo ""

#######################################
# Run one benchmark project in a fresh throwaway git repo.
# Globals:   PROJECTS_DIR, RUN_DIR, OTTO_BIN (read)
# Arguments:
#   $1 - project name (directory under bench/pressure/projects)
#   $2 - pilot flag written into otto.yaml: "true" or "false"
#   $3 - result label: "pilot" or "baseline"
# Outputs:   progress lines to stdout; logs, artifacts and summary.json
#            under $RUN_DIR/<project>/<label>/
#######################################
run_project() {
  local proj="$1"
  local pilot_flag="$2"   # "true" or "false"
  local label="$3"        # "pilot" or "baseline"
  local proj_dir="$PROJECTS_DIR/$proj"
  local result_dir="$RUN_DIR/$proj/$label"
  local work_dir="/tmp/pb-$proj-$label"

  mkdir -p "$result_dir"
  rm -rf "$work_dir"
  mkdir -p "$work_dir"

  echo " [$label] Setting up..."

  # Project setup runs in a subshell so the cd does not leak out.
  (
    cd "$work_dir"
    git init -q
    git config user.email "benchmark@otto.dev"
    git config user.name "Benchmark"
    bash "$proj_dir/setup.sh"
  ) > "$result_dir/setup.log" 2>&1

  # Write otto.yaml with the pilot flag under test.
  cat > "$work_dir/otto.yaml" << EOF
test_command: auto
max_retries: 2
max_parallel: 1
pilot: $pilot_flag
EOF

  # Add initial commit
  (cd "$work_dir" && git add -A && git commit -q -m "initial setup")

  # Queue every non-empty, non-comment task line.
  # The `|| [[ -n "$line" ]]` keeps a final line without a trailing newline.
  (
    cd "$work_dir"
    while IFS= read -r line || [[ -n "$line" ]]; do
      # Trim surrounding whitespace with parameter expansion — no fork per line.
      line="${line#"${line%%[![:space:]]*}"}"
      line="${line%"${line##*[![:space:]]}"}"
      [[ -z "$line" || "$line" == \#* ]] && continue
      env CLAUDECODE= "$OTTO_BIN" add "$line"
    done < "$proj_dir/tasks.txt"
  ) > "$result_dir/add.log" 2>&1

  # Run otto, tolerating a non-zero exit so the A/B comparison still records it.
  echo " [$label] Running otto (pilot=$pilot_flag)..."
  local start_time end_time duration exit_code
  start_time=$(date +%s)
  set +e
  (
    cd "$work_dir"
    env CLAUDECODE= "$OTTO_BIN" run 2>&1
  ) | tee "$result_dir/output.txt"
  exit_code=${PIPESTATUS[0]}   # status of the otto subshell, not of tee
  set -e
  end_time=$(date +%s)
  duration=$((end_time - start_time))

  # Preserve artifacts before the work dir is deleted.
  cp "$work_dir/tasks.yaml" "$result_dir/" 2>/dev/null || true
  cp "$work_dir/otto.yaml" "$result_dir/" 2>/dev/null || true
  cp -r "$work_dir/otto_logs" "$result_dir/" 2>/dev/null || true

  # Parse pass/fail counts and total cost out of tasks.yaml.
  local passed=0 failed=0 cost="0.00"
  if [[ -f "$work_dir/tasks.yaml" ]]; then
    passed=$(grep -c 'status: passed' "$work_dir/tasks.yaml" 2>/dev/null || true)
    failed=$(grep -c 'status: failed' "$work_dir/tasks.yaml" 2>/dev/null || true)
    # Single awk pass that always prints exactly one number. The previous
    # grep|sed|awk pipeline failed under pipefail when no cost_usd lines
    # existed, so its `|| echo "0.00"` fallback appended a second value and
    # produced "0.000.00" — invalid JSON in summary.json.
    cost=$(awk '/cost_usd:/ { sub(/.*cost_usd:[[:space:]]*/, ""); s += $1 }
                END { printf "%.2f", s }' "$work_dir/tasks.yaml")
    # Defend against unreadable files leaving the counters empty.
    passed=${passed:-0}
    failed=${failed:-0}
    cost=${cost:-0.00}
  fi

  # Independent verification with the project's own checker, if provided.
  local verify="n/a"
  if [[ -f "$proj_dir/verify.sh" && $passed -gt 0 ]]; then
    if (cd "$work_dir" && bash "$proj_dir/verify.sh") > "$result_dir/verify.log" 2>&1; then
      verify="PASS"
    else
      verify="FAIL"
    fi
  fi

  # Machine-readable summary consumed by the comparison table at the end.
  cat > "$result_dir/summary.json" << EOF
{
"project": "$proj",
"mode": "$label",
"pilot": $pilot_flag,
"passed": $passed,
"failed": $failed,
"cost_usd": $cost,
"duration_s": $duration,
"verify": "$verify",
"exit_code": $exit_code
}
EOF

  echo " [$label] Done: ${passed}p/${failed}f, \$$cost, ${duration}s, verify=$verify"

  # Point the reviewer at the pilot decision log when one was produced.
  if [[ "$label" == "pilot" && -f "$work_dir/otto_logs/pilot.log" ]]; then
    echo " [$label] Pilot log available at $result_dir/otto_logs/pilot.log"
  fi

  # Cleanup workdir
  rm -rf "$work_dir"
}

# A/B loop: each project runs twice, baseline (replan fallback) first,
# then with the gate pilot enabled.
for proj in "${PROJECT_NAMES[@]}"; do
  printf '\n'
  echo "────────────────────────────────────────────"
  echo " Project: $proj"
  echo "────────────────────────────────────────────"

  run_project "$proj" "false" "baseline"
  printf '\n'
  run_project "$proj" "true" "pilot"
done

# Summary comparison: print one table row per (project, mode) pair,
# reading the fields back out of each run's summary.json.
echo ""
echo "============================================"
echo " COMPARISON SUMMARY"
echo "============================================"
echo ""
printf "%-30s %-10s %-6s %-6s %-8s %-6s %-8s\n" "Project" "Mode" "Pass" "Fail" "Cost" "Time" "Verify"
printf "%-30s %-10s %-6s %-6s %-8s %-6s %-8s\n" "-------" "----" "----" "----" "----" "----" "------"

for proj in "${PROJECT_NAMES[@]}"; do
  for label in baseline pilot; do
    summary="$RUN_DIR/$proj/$label/summary.json"
    [[ -f "$summary" ]] || continue
    # One python3 invocation per file (the old code forked five) and the
    # path travels via argv, so shell metacharacters in it can never be
    # interpreted as Python source.
    read -r passed failed cost duration verify < <(python3 -c '
import json, sys
d = json.load(open(sys.argv[1]))
print(d["passed"], d["failed"], d["cost_usd"], d["duration_s"], d["verify"])
' "$summary")
    printf "%-30s %-10s %-6s %-6s \$%-7s %-6s %-8s\n" "$proj" "$label" "$passed" "$failed" "$cost" "${duration}s" "$verify"
  done
done

echo ""
echo "Full results at: $RUN_DIR"
echo ""
echo "To review pilot decisions:"
echo " cat $RUN_DIR/*/pilot/otto_logs/pilot.log"
echo ""
echo "To compare orchestrator logs:"
echo " diff $RUN_DIR/<project>/baseline/otto_logs/orchestrator.log \\"
echo " $RUN_DIR/<project>/pilot/otto_logs/orchestrator.log"
39 changes: 39 additions & 0 deletions bench/pressure/projects/pilot-test-api-contract/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Seed fixture for the pilot-test-api-contract benchmark project:
# commits a tiny in-memory key-value store plus its passing unit tests
# as the repo's starting state. The benchmark tasks then layer TTL
# support, a cache, and a REST API on top of it.
set -euo pipefail

# Base module the tasks will extend. Heredoc delimiter is quoted, so the
# Python source is written literally with no shell expansion.
cat > store.py << 'PYEOF'
"""In-memory key-value store — base module."""

class Store:
    def __init__(self):
        self._data = {}

    def set(self, key, value):
        self._data[key] = value

    def get(self, key):
        return self._data.get(key)

    def delete(self, key):
        return self._data.pop(key, None)

    def keys(self):
        return list(self._data.keys())
PYEOF

# Baseline tests — expected to pass before any benchmark task runs.
cat > test_store.py << 'PYEOF'
from store import Store

def test_set_and_get():
    s = Store()
    s.set("a", 1)
    assert s.get("a") == 1

def test_delete():
    s = Store()
    s.set("a", 1)
    s.delete("a")
    assert s.get("a") is None
PYEOF

# Caller (the benchmark harness) has already run `git init` and configured
# user.name/user.email in the work dir.
git add -A && git commit -m "init key-value store"
3 changes: 3 additions & 0 deletions bench/pressure/projects/pilot-test-api-contract/tasks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Enhance Store in store.py: add TTL support. set(key, value, ttl=None) where ttl is seconds. get() returns None for expired keys. Add has(key) that returns False for expired. Add list_active() that returns a list of dicts: [{"key": "k", "value": "v", "expires_at": float_or_none}]. Internally store entries as {"value": v, "expires_at": timestamp_or_none}. Add comprehensive tests.
Build a cache layer in cache.py on top of Store. Create CacheLayer(store, max_size=100, eviction="lru"). It must support: cache.get(key) with LRU tracking, cache.put(key, value, ttl=None), cache.stats() returning {"hits": n, "misses": n, "evictions": n, "size": n}. Eviction removes the least-recently-used entry when max_size is exceeded. NOTE: Store.list_active() returns tuples of (key, value, expires_at), NOT dicts. Respect this API. Add thorough tests.
Build a REST API in api.py using Flask on top of CacheLayer. Endpoints: PUT /cache/:key (body: {"value": ..., "ttl": ...}), GET /cache/:key (returns {"key": k, "value": v, "ttl_remaining": n}), DELETE /cache/:key, GET /cache (returns all active entries), GET /stats. Test with Flask test client.
76 changes: 76 additions & 0 deletions bench/pressure/projects/pilot-test-api-contract/verify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env bash
# Independent verifier for the pilot-test-api-contract project: writes a
# self-contained Python check script, runs it, and exits non-zero if any
# check fails. Run from inside the project work dir.
# NOTE(review): -e is deliberately omitted (only -u and pipefail) so the
# script's exit status comes from the python3 run at the end — confirm.
set -uo pipefail

# Remove the temporary check script on any exit path, preserving the
# exit status of whatever terminated the script.
trap 'rc=$?; rm -f verify_check.py; exit $rc' EXIT

# Quoted delimiter: the Python source below is written verbatim.
cat > verify_check.py << 'PY'
import time
failures = 0

def report(name, fn):
    global failures
    try:
        fn()
        print(f"PASS {name}")
    except Exception as exc:
        failures += 1
        print(f"FAIL {name}: {exc}")

def check_ttl():
    from store import Store
    s = Store()
    s.set("k", "v", ttl=1)
    assert s.get("k") == "v"
    time.sleep(1.1)
    assert s.get("k") is None

def check_list_active():
    from store import Store
    s = Store()
    s.set("a", 1)
    s.set("b", 2, ttl=60)
    active = s.list_active()
    assert isinstance(active, list)
    assert len(active) == 2

def check_cache_basic():
    from store import Store
    from cache import CacheLayer
    s = Store()
    c = CacheLayer(s, max_size=2)
    c.put("a", 1)
    c.put("b", 2)
    assert c.get("a") == 1

def check_cache_eviction():
    from store import Store
    from cache import CacheLayer
    s = Store()
    c = CacheLayer(s, max_size=2)
    c.put("a", 1)
    c.put("b", 2)
    c.put("c", 3)  # should evict LRU
    stats = c.stats()
    assert stats["evictions"] >= 1
    assert stats["size"] <= 2

def check_api():
    from api import app
    client = app.test_client()
    r = client.put("/cache/test", json={"value": "hello"})
    assert r.status_code in (200, 201)
    r = client.get("/cache/test")
    assert r.status_code == 200
    data = r.get_json()
    assert data["value"] == "hello"

report("TTL expires keys", check_ttl)
report("list_active returns all non-expired", check_list_active)
report("cache basic get/put", check_cache_basic)
report("cache eviction works", check_cache_eviction)
report("API put and get", check_api)

raise SystemExit(1 if failures else 0)
PY

# The checker's exit status (0 = all checks passed) propagates through the
# EXIT trap as this script's exit status.
python3 verify_check.py
54 changes: 54 additions & 0 deletions bench/pressure/projects/pilot-test-env-mismatch/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env bash
# Seed fixture for the pilot-test-env-mismatch benchmark project: a small
# Node.js package whose test runner is vitest (NOT jest) — the mismatch
# the benchmark tasks are designed to probe.
set -euo pipefail

# Node.js project that uses vitest (NOT jest)
cat > package.json << 'EOF'
{
  "name": "math-utils",
  "version": "1.0.0",
  "type": "module",
  "scripts": {
    "test": "vitest run"
  },
  "devDependencies": {
    "vitest": "^1.6.0"
  }
}
EOF

# Enable vitest globals so describe/it/expect need no per-file import.
cat > vitest.config.js << 'EOF'
import { defineConfig } from 'vitest/config';
export default defineConfig({
  test: { globals: true }
});
EOF

# Base module the benchmark tasks will extend with more functions.
cat > math.js << 'EOF'
/**
 * Math utility functions.
 */
export function add(a, b) {
  return a + b;
}

export function subtract(a, b) {
  return a - b;
}
EOF

# Baseline test suite — expected to pass before any task runs.
cat > math.test.js << 'EOF'
import { describe, it, expect } from 'vitest';
import { add, subtract } from './math.js';

describe('math', () => {
  it('adds numbers', () => {
    expect(add(2, 3)).toBe(5);
  });
  it('subtracts numbers', () => {
    expect(subtract(5, 3)).toBe(2);
  });
});
EOF

# NOTE(review): npm's stderr is discarded here, but a failed install still
# aborts via set -e since the exit status is not masked.
npm install --silent 2>/dev/null
# Caller (the benchmark harness) has already initialized and configured git.
git add -A && git commit -m "init math-utils with vitest"
3 changes: 3 additions & 0 deletions bench/pressure/projects/pilot-test-env-mismatch/tasks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add a multiply(a, b) function to math.js and add comprehensive jest tests for it in math.test.js. Test edge cases: zero, negative numbers, floating point, very large numbers. Use describe/it/expect pattern.
Add a divide(a, b) function to math.js that throws an error on division by zero. Add tests for it. Also add a modulo(a, b) function with tests. Test edge cases: divide by zero throws, negative modulo, floating point modulo.
Add a power(base, exp) function and a sqrt(n) function to math.js. power should handle negative exponents. sqrt should throw on negative input. Add tests for both including edge cases: power(0,0)=1, negative exponents return fractions, sqrt of perfect squares, sqrt of negative throws.
Loading