From 946a9d273380662f33dc8bd95f3f252a82bc7110 Mon Sep 17 00:00:00 2001
From: Ben Tossell
Date: Wed, 18 Feb 2026 10:04:24 -0500
Subject: [PATCH 1/4] ops: add runtime health checks to doctor.sh

New checks aligned with what the heartbeat monitors at runtime:
- Slack bridge responding (curl localhost:7890)
- Disk usage warning at 80%, fail at 90%
- Stale session sockets (no owning process)
- Orphaned worktrees (>5 triggers warning)
- Session log total size (warn at 500MB)
---
# Review notes (commentary between the `---` line and the diffstat is not
# part of the commit message):
# - Appends a "Runtime health" section to bin/doctor.sh with five checks:
#   Slack bridge probe, disk usage of /, stale session sockets, worktree
#   count, and session log size.
# - Bridge probe: POSTs '{}' to http://127.0.0.1:7890/send and passes only
#   when the reported HTTP status contains "400"; any other outcome warns.
# - Disk usage: takes field 5 of the last line of `df /`, strips '%', then
#   fails at >= 90, warns at >= 80, passes otherwise. An empty value skips
#   the check silently.
# - Stale sockets: a *.sock file in $SOCKET_DIR counts as stale when
#   `fuser` exits non-zero for it. In this patch, when fuser is not
#   installed the counter stays 0 and "no stale session sockets" is still
#   reported (patch 4/4 changes that).
# - The "Orphaned worktrees" block counts every directory under
#   workspace/worktrees; it does not test whether a worktree is orphaned.
# - $AGENT_HOME is used for all three paths; patch 2/4 describes it as
#   undefined and replaces it with $BAUDBOT_HOME.
# - NOTE(review): the curl call sets no --connect-timeout / --max-time, so
#   a bridge that accepts the connection but never replies could stall
#   doctor.sh -- confirm whether a timeout is wanted.
# - NOTE(review): `df -P /` would guarantee the one-row POSIX layout that
#   the tail/awk parsing assumes -- verify on hosts with long device names.
# - NOTE(review): `&>/dev/null 2>&1` on the fuser call is redundant; `&>`
#   already redirects both stdout and stderr.
 bin/doctor.sh | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/bin/doctor.sh b/bin/doctor.sh
index c00882d..c6cfe7c 100755
--- a/bin/doctor.sh
+++ b/bin/doctor.sh
@@ -232,6 +232,77 @@ else
   fi
 fi
 
+# ── Runtime Health ────────────────────────────────────────────────────────────
+
+echo ""
+echo "Runtime health:"
+
+# Slack bridge
+if curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}' 2>/dev/null | grep -q "400"; then
+  pass "slack bridge responding (port 7890)"
+else
+  warn "slack bridge not responding on port 7890"
+fi
+
+# Disk usage
+DISK_PCT=$(df / 2>/dev/null | tail -1 | awk '{print $5}' | tr -d '%')
+if [ -n "$DISK_PCT" ]; then
+  if [ "$DISK_PCT" -ge 90 ]; then
+    fail "disk usage at ${DISK_PCT}% (critical)"
+  elif [ "$DISK_PCT" -ge 80 ]; then
+    warn "disk usage at ${DISK_PCT}%"
+  else
+    pass "disk usage at ${DISK_PCT}%"
+  fi
+fi
+
+# Stale session sockets
+SOCKET_DIR="$AGENT_HOME/.pi/session-control"
+if [ -d "$SOCKET_DIR" ]; then
+  STALE_SOCKS=0
+  if command -v fuser &>/dev/null; then
+    for sock in "$SOCKET_DIR"/*.sock; do
+      [ -e "$sock" ] || continue
+      if ! fuser "$sock" &>/dev/null 2>&1; then
+        STALE_SOCKS=$((STALE_SOCKS + 1))
+      fi
+    done
+  fi
+  if [ "$STALE_SOCKS" -gt 0 ]; then
+    warn "$STALE_SOCKS stale session socket(s) in $SOCKET_DIR"
+  else
+    pass "no stale session sockets"
+  fi
+fi
+
+# Orphaned worktrees
+WORKTREE_DIR="$AGENT_HOME/workspace/worktrees"
+if [ -d "$WORKTREE_DIR" ]; then
+  ORPHANS=0
+  for wt in "$WORKTREE_DIR"/*/; do
+    [ -d "$wt" ] || continue
+    ORPHANS=$((ORPHANS + 1))
+  done
+  if [ "$ORPHANS" -gt 5 ]; then
+    warn "$ORPHANS worktrees in $WORKTREE_DIR (consider cleanup)"
+  elif [ "$ORPHANS" -gt 0 ]; then
+    pass "$ORPHANS active worktree(s)"
+  fi
+fi
+
+# Session log size
+if [ -d "$AGENT_HOME/.pi/agent/sessions" ]; then
+  LOG_SIZE_KB=$(du -sk "$AGENT_HOME/.pi/agent/sessions" 2>/dev/null | cut -f1)
+  if [ -n "$LOG_SIZE_KB" ]; then
+    LOG_SIZE_MB=$((LOG_SIZE_KB / 1024))
+    if [ "$LOG_SIZE_MB" -ge 500 ]; then
+      warn "session logs total ${LOG_SIZE_MB}MB (consider pruning)"
+    else
+      pass "session logs total ${LOG_SIZE_MB}MB"
+    fi
+  fi
+fi
+
 # ── Summary ──────────────────────────────────────────────────────────────────
 
 echo ""

From c86f764ff0894a458779e29442688670a79a3c27 Mon Sep 17 00:00:00 2001
From: Ben Tossell
Date: Wed, 18 Feb 2026 10:17:25 -0500
Subject: [PATCH 2/4] =?UTF-8?q?ops:=20fix=20AGENT=5FHOME=20=E2=86=92=20BAU?=
 =?UTF-8?q?DBOT=5FHOME,=20rename=20ORPHANS=20=E2=86=92=20WORKTREE=5FCOUNT?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- All references to undefined AGENT_HOME replaced with BAUDBOT_HOME
- Renamed ORPHANS variable to WORKTREE_COUNT (counts all worktrees, not
  just orphaned)
---
# Review notes:
# - Rename-only patch: $AGENT_HOME becomes $BAUDBOT_HOME in the socket,
#   worktree, and session-log paths; ORPHANS becomes WORKTREE_COUNT; the
#   section comment "Orphaned worktrees" becomes "Worktree count".
# - Thresholds and the pass/warn message texts are untouched.
 bin/doctor.sh | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/bin/doctor.sh b/bin/doctor.sh
index c6cfe7c..297444a 100755
--- a/bin/doctor.sh
+++ b/bin/doctor.sh
@@ -257,7 +257,7 @@ if [ -n "$DISK_PCT" ]; then
 fi
 
 # Stale session sockets
-SOCKET_DIR="$AGENT_HOME/.pi/session-control"
+SOCKET_DIR="$BAUDBOT_HOME/.pi/session-control"
 if [ -d "$SOCKET_DIR" ]; then
   STALE_SOCKS=0
   if command -v fuser &>/dev/null; then
@@ -275,24 +275,24 @@ if [ -d "$SOCKET_DIR" ]; then
   fi
 fi
 
-# Orphaned worktrees
-WORKTREE_DIR="$AGENT_HOME/workspace/worktrees"
+# Worktree count
+WORKTREE_DIR="$BAUDBOT_HOME/workspace/worktrees"
 if [ -d "$WORKTREE_DIR" ]; then
-  ORPHANS=0
+  WORKTREE_COUNT=0
   for wt in "$WORKTREE_DIR"/*/; do
     [ -d "$wt" ] || continue
-    ORPHANS=$((ORPHANS + 1))
+    WORKTREE_COUNT=$((WORKTREE_COUNT + 1))
   done
-  if [ "$ORPHANS" -gt 5 ]; then
-    warn "$ORPHANS worktrees in $WORKTREE_DIR (consider cleanup)"
-  elif [ "$ORPHANS" -gt 0 ]; then
-    pass "$ORPHANS active worktree(s)"
+  if [ "$WORKTREE_COUNT" -gt 5 ]; then
+    warn "$WORKTREE_COUNT worktrees in $WORKTREE_DIR (consider cleanup)"
+  elif [ "$WORKTREE_COUNT" -gt 0 ]; then
+    pass "$WORKTREE_COUNT active worktree(s)"
   fi
 fi
 
 # Session log size
-if [ -d "$AGENT_HOME/.pi/agent/sessions" ]; then
-  LOG_SIZE_KB=$(du -sk "$AGENT_HOME/.pi/agent/sessions" 2>/dev/null | cut -f1)
+if [ -d "$BAUDBOT_HOME/.pi/agent/sessions" ]; then
+  LOG_SIZE_KB=$(du -sk "$BAUDBOT_HOME/.pi/agent/sessions" 2>/dev/null | cut -f1)
   if [ -n "$LOG_SIZE_KB" ]; then
     LOG_SIZE_MB=$((LOG_SIZE_KB / 1024))
     if [ "$LOG_SIZE_MB" -ge 500 ]; then

From 138519223860d769cdc1663f5f4cf2c18a737f1c Mon Sep 17 00:00:00 2001
From: Ben Tossell
Date: Wed, 18 Feb 2026 10:41:31 -0500
Subject: [PATCH 3/4] ops: remove worktree count and log size filler checks

Keep only the checks that answer 'why isn't the bot responding':
bridge health, disk usage, stale sockets.
---
# Review notes:
# - Deletes the "Worktree count" and "Session log size" blocks (27 lines);
#   the Slack bridge, disk usage, and stale socket checks remain.
# - The blank line that followed the removed blocks is kept as context, so
#   two consecutive blank lines now precede the Summary header.
 bin/doctor.sh | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/bin/doctor.sh b/bin/doctor.sh
index 297444a..7468024 100755
--- a/bin/doctor.sh
+++ b/bin/doctor.sh
@@ -275,33 +275,6 @@ if [ -d "$SOCKET_DIR" ]; then
   fi
 fi
 
-# Worktree count
-WORKTREE_DIR="$BAUDBOT_HOME/workspace/worktrees"
-if [ -d "$WORKTREE_DIR" ]; then
-  WORKTREE_COUNT=0
-  for wt in "$WORKTREE_DIR"/*/; do
-    [ -d "$wt" ] || continue
-    WORKTREE_COUNT=$((WORKTREE_COUNT + 1))
-  done
-  if [ "$WORKTREE_COUNT" -gt 5 ]; then
-    warn "$WORKTREE_COUNT worktrees in $WORKTREE_DIR (consider cleanup)"
-  elif [ "$WORKTREE_COUNT" -gt 0 ]; then
-    pass "$WORKTREE_COUNT active worktree(s)"
-  fi
-fi
-
-# Session log size
-if [ -d "$BAUDBOT_HOME/.pi/agent/sessions" ]; then
-  LOG_SIZE_KB=$(du -sk "$BAUDBOT_HOME/.pi/agent/sessions" 2>/dev/null | cut -f1)
-  if [ -n "$LOG_SIZE_KB" ]; then
-    LOG_SIZE_MB=$((LOG_SIZE_KB / 1024))
-    if [ "$LOG_SIZE_MB" -ge 500 ]; then
-      warn "session logs total ${LOG_SIZE_MB}MB (consider pruning)"
-    else
-      pass "session logs total ${LOG_SIZE_MB}MB"
-    fi
-  fi
-fi
 
 # ── Summary ──────────────────────────────────────────────────────────────────
 

From aaf882e277eb78b93bb2f4cd5ddf5db8b9dcf170 Mon Sep 17 00:00:00 2001
From: Ben Tossell
Date: Wed, 18 Feb 2026 10:51:41 -0500
Subject: [PATCH 4/4] ops: warn when stale socket check is skipped without fuser

---
# Review notes:
# - Moves the stale-socket pass/warn reporting inside the
#   `command -v fuser` branch and adds an else branch that warns
#   "fuser not installed; skipping stale socket check".
# - Before this patch, a host without fuser left STALE_SOCKS at 0 and the
#   script reported "no stale session sockets" without checking anything.
 bin/doctor.sh | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/bin/doctor.sh b/bin/doctor.sh
index 7468024..f9b3338 100755
--- a/bin/doctor.sh
+++ b/bin/doctor.sh
@@ -267,11 +267,13 @@ if [ -d "$SOCKET_DIR" ]; then
         STALE_SOCKS=$((STALE_SOCKS + 1))
       fi
     done
-  fi
-  if [ "$STALE_SOCKS" -gt 0 ]; then
-    warn "$STALE_SOCKS stale session socket(s) in $SOCKET_DIR"
+    if [ "$STALE_SOCKS" -gt 0 ]; then
+      warn "$STALE_SOCKS stale session socket(s) in $SOCKET_DIR"
+    else
+      pass "no stale session sockets"
+    fi
   else
-    pass "no stale session sockets"
+    warn "fuser not installed; skipping stale socket check"
   fi
 fi
 