Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 207 additions & 0 deletions bench/pilot-benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#!/usr/bin/env bash
# Gate pilot A/B benchmark — runs multi-task projects with and without pilot.
# Usage: ./bench/pilot-benchmark.sh [project_name]
#
# Runs each project TWICE:
# 1. pilot=true (gate pilot enabled — default)
# 2. pilot=false (fallback to replan — baseline)
#
# Compares: pass rate, cost, retry success, pilot.log decisions.
# Results: /tmp/otto-pilot-benchmark/

set -euo pipefail

# Locate the repository root relative to this script's own location.
script_dir="$(cd "$(dirname "$0")" && pwd)"
repo_root="$(cd "$script_dir/.." && pwd)"

PROJECTS_DIR="$repo_root/bench/pressure/projects"
RESULTS_DIR="/tmp/otto-pilot-benchmark"
TIMESTAMP="$(date +%Y-%m-%d-%H%M%S)"
RUN_DIR="$RESULTS_DIR/$TIMESTAMP"

# Resolve the otto binary: an explicit $OTTO_BIN wins, then the repo's
# virtualenv copy, then whatever is on PATH.
if [[ -z "${OTTO_BIN:-}" ]]; then
  if [[ -x "$repo_root/.venv/bin/otto" ]]; then
    OTTO_BIN="$repo_root/.venv/bin/otto"
  else
    OTTO_BIN="otto"
  fi
fi
# Absolutize a path-like OTTO_BIN so it still works after cd into work dirs.
if [[ "$OTTO_BIN" == */* ]]; then
  OTTO_BIN="$(cd "$(dirname "$OTTO_BIN")" && pwd)/$(basename "$OTTO_BIN")"
fi

# Multi-task projects only — the pilot only activates when a batch fails.
default_projects=(
  edge-conflicting-tasks
  multi-blog-engine
  multi-expense-tracker
)

# An optional positional argument narrows the run to a single project.
requested="${1:-}"
if [[ -n "$requested" ]]; then
  PROJECT_NAMES=("$requested")
else
  PROJECT_NAMES=("${default_projects[@]}")
fi

mkdir -p "$RUN_DIR"

echo "============================================"
echo " Gate Pilot A/B Benchmark — $TIMESTAMP"
echo " Projects: ${PROJECT_NAMES[*]}"
echo " Results: $RUN_DIR"
echo "============================================"
echo ""

#######################################
# Run one benchmark project in a fresh throwaway git repo.
# Globals:   PROJECTS_DIR, RUN_DIR, OTTO_BIN (read)
# Arguments:
#   $1 - project name (directory under bench/pressure/projects)
#   $2 - pilot flag written into otto.yaml: "true" or "false"
#   $3 - result label: "pilot" or "baseline"
# Outputs:   progress lines to stdout; logs, artifacts and summary.json
#            under $RUN_DIR/<project>/<label>/
#######################################
run_project() {
  local proj="$1"
  local pilot_flag="$2"   # "true" or "false"
  local label="$3"        # "pilot" or "baseline"
  local proj_dir="$PROJECTS_DIR/$proj"
  local result_dir="$RUN_DIR/$proj/$label"
  local work_dir="/tmp/pb-$proj-$label"

  mkdir -p "$result_dir"
  rm -rf "$work_dir"
  mkdir -p "$work_dir"

  echo " [$label] Setting up..."

  # Project setup runs in a subshell so the cd does not leak out.
  (
    cd "$work_dir"
    git init -q
    git config user.email "benchmark@otto.dev"
    git config user.name "Benchmark"
    bash "$proj_dir/setup.sh"
  ) > "$result_dir/setup.log" 2>&1

  # Write otto.yaml with the pilot flag under test.
  cat > "$work_dir/otto.yaml" << EOF
test_command: auto
max_retries: 2
max_parallel: 1
pilot: $pilot_flag
EOF

  # Add initial commit
  (cd "$work_dir" && git add -A && git commit -q -m "initial setup")

  # Queue every non-empty, non-comment task line.
  # The `|| [[ -n "$line" ]]` keeps a final line without a trailing newline.
  (
    cd "$work_dir"
    while IFS= read -r line || [[ -n "$line" ]]; do
      # Trim surrounding whitespace with parameter expansion — no fork per line.
      line="${line#"${line%%[![:space:]]*}"}"
      line="${line%"${line##*[![:space:]]}"}"
      [[ -z "$line" || "$line" == \#* ]] && continue
      env CLAUDECODE= "$OTTO_BIN" add "$line"
    done < "$proj_dir/tasks.txt"
  ) > "$result_dir/add.log" 2>&1

  # Run otto, tolerating a non-zero exit so the A/B comparison still records it.
  echo " [$label] Running otto (pilot=$pilot_flag)..."
  local start_time end_time duration exit_code
  start_time=$(date +%s)
  set +e
  (
    cd "$work_dir"
    env CLAUDECODE= "$OTTO_BIN" run 2>&1
  ) | tee "$result_dir/output.txt"
  exit_code=${PIPESTATUS[0]}   # status of the otto subshell, not of tee
  set -e
  end_time=$(date +%s)
  duration=$((end_time - start_time))

  # Preserve artifacts before the work dir is deleted.
  cp "$work_dir/tasks.yaml" "$result_dir/" 2>/dev/null || true
  cp "$work_dir/otto.yaml" "$result_dir/" 2>/dev/null || true
  cp -r "$work_dir/otto_logs" "$result_dir/" 2>/dev/null || true

  # Parse pass/fail counts and total cost out of tasks.yaml.
  local passed=0 failed=0 cost="0.00"
  if [[ -f "$work_dir/tasks.yaml" ]]; then
    passed=$(grep -c 'status: passed' "$work_dir/tasks.yaml" 2>/dev/null || true)
    failed=$(grep -c 'status: failed' "$work_dir/tasks.yaml" 2>/dev/null || true)
    # Single awk pass that always prints exactly one number. The previous
    # grep|sed|awk pipeline failed under pipefail when no cost_usd lines
    # existed, so its `|| echo "0.00"` fallback appended a second value and
    # produced "0.000.00" — invalid JSON in summary.json.
    cost=$(awk '/cost_usd:/ { sub(/.*cost_usd:[[:space:]]*/, ""); s += $1 }
                END { printf "%.2f", s }' "$work_dir/tasks.yaml")
    # Defend against unreadable files leaving the counters empty.
    passed=${passed:-0}
    failed=${failed:-0}
    cost=${cost:-0.00}
  fi

  # Independent verification with the project's own checker, if provided.
  local verify="n/a"
  if [[ -f "$proj_dir/verify.sh" && $passed -gt 0 ]]; then
    if (cd "$work_dir" && bash "$proj_dir/verify.sh") > "$result_dir/verify.log" 2>&1; then
      verify="PASS"
    else
      verify="FAIL"
    fi
  fi

  # Machine-readable summary consumed by the comparison table at the end.
  cat > "$result_dir/summary.json" << EOF
{
"project": "$proj",
"mode": "$label",
"pilot": $pilot_flag,
"passed": $passed,
"failed": $failed,
"cost_usd": $cost,
"duration_s": $duration,
"verify": "$verify",
"exit_code": $exit_code
}
EOF

  echo " [$label] Done: ${passed}p/${failed}f, \$$cost, ${duration}s, verify=$verify"

  # Point the reviewer at the pilot decision log when one was produced.
  if [[ "$label" == "pilot" && -f "$work_dir/otto_logs/pilot.log" ]]; then
    echo " [$label] Pilot log available at $result_dir/otto_logs/pilot.log"
  fi

  # Cleanup workdir
  rm -rf "$work_dir"
}

# A/B loop: each project runs twice, baseline (replan fallback) first,
# then with the gate pilot enabled.
for proj in "${PROJECT_NAMES[@]}"; do
  printf '\n'
  echo "────────────────────────────────────────────"
  echo " Project: $proj"
  echo "────────────────────────────────────────────"

  run_project "$proj" "false" "baseline"
  printf '\n'
  run_project "$proj" "true" "pilot"
done

# Summary comparison: print one table row per (project, mode) pair,
# reading the fields back out of each run's summary.json.
echo ""
echo "============================================"
echo " COMPARISON SUMMARY"
echo "============================================"
echo ""
printf "%-30s %-10s %-6s %-6s %-8s %-6s %-8s\n" "Project" "Mode" "Pass" "Fail" "Cost" "Time" "Verify"
printf "%-30s %-10s %-6s %-6s %-8s %-6s %-8s\n" "-------" "----" "----" "----" "----" "----" "------"

for proj in "${PROJECT_NAMES[@]}"; do
  for label in baseline pilot; do
    summary="$RUN_DIR/$proj/$label/summary.json"
    [[ -f "$summary" ]] || continue
    # One python3 invocation per file (the old code forked five) and the
    # path travels via argv, so shell metacharacters in it can never be
    # interpreted as Python source.
    read -r passed failed cost duration verify < <(python3 -c '
import json, sys
d = json.load(open(sys.argv[1]))
print(d["passed"], d["failed"], d["cost_usd"], d["duration_s"], d["verify"])
' "$summary")
    printf "%-30s %-10s %-6s %-6s \$%-7s %-6s %-8s\n" "$proj" "$label" "$passed" "$failed" "$cost" "${duration}s" "$verify"
  done
done

echo ""
echo "Full results at: $RUN_DIR"
echo ""
echo "To review pilot decisions:"
echo " cat $RUN_DIR/*/pilot/otto_logs/pilot.log"
echo ""
echo "To compare orchestrator logs:"
echo " diff $RUN_DIR/<project>/baseline/otto_logs/orchestrator.log \\"
echo " $RUN_DIR/<project>/pilot/otto_logs/orchestrator.log"
39 changes: 39 additions & 0 deletions bench/pressure/projects/pilot-test-api-contract/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Seed fixture for the pilot-test-api-contract benchmark project:
# commits a tiny in-memory key-value store plus its passing unit tests
# as the repo's starting state. The benchmark tasks then layer TTL
# support, a cache, and a REST API on top of it.
set -euo pipefail

# Base module the tasks will extend. Heredoc delimiter is quoted, so the
# Python source is written literally with no shell expansion.
cat > store.py << 'PYEOF'
"""In-memory key-value store — base module."""

class Store:
    def __init__(self):
        self._data = {}

    def set(self, key, value):
        self._data[key] = value

    def get(self, key):
        return self._data.get(key)

    def delete(self, key):
        return self._data.pop(key, None)

    def keys(self):
        return list(self._data.keys())
PYEOF

# Baseline tests — expected to pass before any benchmark task runs.
cat > test_store.py << 'PYEOF'
from store import Store

def test_set_and_get():
    s = Store()
    s.set("a", 1)
    assert s.get("a") == 1

def test_delete():
    s = Store()
    s.set("a", 1)
    s.delete("a")
    assert s.get("a") is None
PYEOF

# Caller (the benchmark harness) has already run `git init` and configured
# user.name/user.email in the work dir.
git add -A && git commit -m "init key-value store"
3 changes: 3 additions & 0 deletions bench/pressure/projects/pilot-test-api-contract/tasks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Enhance Store in store.py: add TTL support. set(key, value, ttl=None) where ttl is seconds. get() returns None for expired keys. Add has(key) that returns False for expired. Add list_active() that returns a list of dicts: [{"key": "k", "value": "v", "expires_at": float_or_none}]. Internally store entries as {"value": v, "expires_at": timestamp_or_none}. Add comprehensive tests.
Build a cache layer in cache.py on top of Store. Create CacheLayer(store, max_size=100, eviction="lru"). It must support: cache.get(key) with LRU tracking, cache.put(key, value, ttl=None), cache.stats() returning {"hits": n, "misses": n, "evictions": n, "size": n}. Eviction removes the least-recently-used entry when max_size is exceeded. NOTE: Store.list_active() returns tuples of (key, value, expires_at), NOT dicts. Respect this API. Add thorough tests.
Build a REST API in api.py using Flask on top of CacheLayer. Endpoints: PUT /cache/:key (body: {"value": ..., "ttl": ...}), GET /cache/:key (returns {"key": k, "value": v, "ttl_remaining": n}), DELETE /cache/:key, GET /cache (returns all active entries), GET /stats. Test with Flask test client.
76 changes: 76 additions & 0 deletions bench/pressure/projects/pilot-test-api-contract/verify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env bash
# Independent verifier for the pilot-test-api-contract project: writes a
# self-contained Python check script, runs it, and exits non-zero if any
# check fails. Run from inside the project work dir.
# NOTE(review): -e is deliberately omitted (only -u and pipefail) so the
# script's exit status comes from the python3 run at the end — confirm.
set -uo pipefail

# Remove the temporary check script on any exit path, preserving the
# exit status of whatever terminated the script.
trap 'rc=$?; rm -f verify_check.py; exit $rc' EXIT

# Quoted delimiter: the Python source below is written verbatim.
cat > verify_check.py << 'PY'
import time
failures = 0

def report(name, fn):
    global failures
    try:
        fn()
        print(f"PASS {name}")
    except Exception as exc:
        failures += 1
        print(f"FAIL {name}: {exc}")

def check_ttl():
    from store import Store
    s = Store()
    s.set("k", "v", ttl=1)
    assert s.get("k") == "v"
    time.sleep(1.1)
    assert s.get("k") is None

def check_list_active():
    from store import Store
    s = Store()
    s.set("a", 1)
    s.set("b", 2, ttl=60)
    active = s.list_active()
    assert isinstance(active, list)
    assert len(active) == 2

def check_cache_basic():
    from store import Store
    from cache import CacheLayer
    s = Store()
    c = CacheLayer(s, max_size=2)
    c.put("a", 1)
    c.put("b", 2)
    assert c.get("a") == 1

def check_cache_eviction():
    from store import Store
    from cache import CacheLayer
    s = Store()
    c = CacheLayer(s, max_size=2)
    c.put("a", 1)
    c.put("b", 2)
    c.put("c", 3)  # should evict LRU
    stats = c.stats()
    assert stats["evictions"] >= 1
    assert stats["size"] <= 2

def check_api():
    from api import app
    client = app.test_client()
    r = client.put("/cache/test", json={"value": "hello"})
    assert r.status_code in (200, 201)
    r = client.get("/cache/test")
    assert r.status_code == 200
    data = r.get_json()
    assert data["value"] == "hello"

report("TTL expires keys", check_ttl)
report("list_active returns all non-expired", check_list_active)
report("cache basic get/put", check_cache_basic)
report("cache eviction works", check_cache_eviction)
report("API put and get", check_api)

raise SystemExit(1 if failures else 0)
PY

# The checker's exit status (0 = all checks passed) propagates through the
# EXIT trap as this script's exit status.
python3 verify_check.py
54 changes: 54 additions & 0 deletions bench/pressure/projects/pilot-test-env-mismatch/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env bash
# Seed fixture for the pilot-test-env-mismatch benchmark project: a small
# Node.js package whose test runner is vitest (NOT jest) — the mismatch
# the benchmark tasks are designed to probe.
set -euo pipefail

# Node.js project that uses vitest (NOT jest)
cat > package.json << 'EOF'
{
  "name": "math-utils",
  "version": "1.0.0",
  "type": "module",
  "scripts": {
    "test": "vitest run"
  },
  "devDependencies": {
    "vitest": "^1.6.0"
  }
}
EOF

# Enable vitest globals so describe/it/expect need no per-file import.
cat > vitest.config.js << 'EOF'
import { defineConfig } from 'vitest/config';
export default defineConfig({
  test: { globals: true }
});
EOF

# Base module the benchmark tasks will extend with more functions.
cat > math.js << 'EOF'
/**
 * Math utility functions.
 */
export function add(a, b) {
  return a + b;
}

export function subtract(a, b) {
  return a - b;
}
EOF

# Baseline test suite — expected to pass before any task runs.
cat > math.test.js << 'EOF'
import { describe, it, expect } from 'vitest';
import { add, subtract } from './math.js';

describe('math', () => {
  it('adds numbers', () => {
    expect(add(2, 3)).toBe(5);
  });
  it('subtracts numbers', () => {
    expect(subtract(5, 3)).toBe(2);
  });
});
EOF

# NOTE(review): npm's stderr is discarded here, but a failed install still
# aborts via set -e since the exit status is not masked.
npm install --silent 2>/dev/null
# Caller (the benchmark harness) has already initialized and configured git.
git add -A && git commit -m "init math-utils with vitest"
3 changes: 3 additions & 0 deletions bench/pressure/projects/pilot-test-env-mismatch/tasks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add a multiply(a, b) function to math.js and add comprehensive jest tests for it in math.test.js. Test edge cases: zero, negative numbers, floating point, very large numbers. Use describe/it/expect pattern.
Add a divide(a, b) function to math.js that throws an error on division by zero. Add tests for it. Also add a modulo(a, b) function with tests. Test edge cases: divide by zero throws, negative modulo, floating point modulo.
Add a power(base, exp) function and a sqrt(n) function to math.js. power should handle negative exponents. sqrt should throw on negative input. Add tests for both including edge cases: power(0,0)=1, negative exponents return fractions, sqrt of perfect squares, sqrt of negative throws.
Loading