From b19343c4e9e13017b46f307a03dea2c28760a2ad Mon Sep 17 00:00:00 2001 From: Ben Vinegar Date: Sat, 21 Feb 2026 16:47:09 -0500 Subject: [PATCH 1/4] ops: show broker connection in baudbot status --- README.md | 2 +- bin/baudbot | 74 +++++++++++++++++++++++++++++++++++++++++++++- docs/operations.md | 2 +- 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6697b3b..d474233 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ sudo baudbot deploy # start the service sudo baudbot start -# check health +# check health (includes deployed version + broker connection status) sudo baudbot status sudo baudbot doctor ``` diff --git a/bin/baudbot b/bin/baudbot index f01cb5f..d3c22b0 100755 --- a/bin/baudbot +++ b/bin/baudbot @@ -103,7 +103,7 @@ usage() { echo " start Start the agent (systemd, or --direct for foreground)" echo " stop Stop the agent" echo " restart Restart the agent" - echo " status Show agent status + deployed version" + echo " status Show agent status + deployed version + broker connection" echo " logs Tail agent logs" echo " attach Attach to control-agent by default; supports --pi/--tmux" echo " sessions List agent tmux and pi sessions (name → id)" @@ -326,6 +326,76 @@ print_deployed_version() { echo -e "${BOLD}deployed version:${RESET} $line" } +is_broker_configured() { + local agent_user="${1:-baudbot_agent}" + local env_file="/home/$agent_user/.config/.env" + local required_key="" + local line="" + local value="" + + [ -r "$env_file" ] || return 1 + + for required_key in \ + SLACK_BROKER_URL \ + SLACK_BROKER_WORKSPACE_ID \ + SLACK_BROKER_SERVER_PRIVATE_KEY \ + SLACK_BROKER_SERVER_PUBLIC_KEY \ + SLACK_BROKER_SERVER_SIGNING_PRIVATE_KEY \ + SLACK_BROKER_PUBLIC_KEY \ + SLACK_BROKER_SIGNING_PUBLIC_KEY; do + line="$(grep -E "^${required_key}=" "$env_file" | tail -1 || true)" + [ -n "$line" ] || return 1 + + value="${line#*=}" + value="${value//[[:space:]]/}" + [ -n "$value" ] || return 1 + [ "$value" != "\"\"" ] || return 1 + [ "$value" != "''" ] || return 1 + done + + return 0 +} + +print_broker_connection_status() { + local agent_user="${BAUDBOT_AGENT_USER:-baudbot_agent}" + local pane="" + local latest_signal="" + + if ! is_broker_configured "$agent_user"; then + echo -e "${BOLD}broker connection:${RESET} not configured" + return 0 + fi + + if [ "$(id -u)" -eq 0 ]; then + if ! sudo -u "$agent_user" tmux has-session -t slack-bridge 2>/dev/null; then + echo -e "${BOLD}broker connection:${RESET} disconnected (bridge tmux session not running)" + return 0 + fi + pane="$(sudo -u "$agent_user" tmux capture-pane -p -t slack-bridge -S -400 2>/dev/null || true)" + elif [ "$(id -un)" = "$agent_user" ]; then + if ! tmux has-session -t slack-bridge 2>/dev/null; then + echo -e "${BOLD}broker connection:${RESET} disconnected (bridge tmux session not running)" + return 0 + fi + pane="$(tmux capture-pane -p -t slack-bridge -S -400 2>/dev/null || true)" + else + echo -e "${BOLD}broker connection:${RESET} configured (run with sudo for runtime status)" + return 0 + fi + + latest_signal="$(printf '%s\n' "$pane" | grep -E 'inbox poll failed|backing off|idle|pulled [0-9]+ message\(s\)|Slack broker pull bridge is running' | tail -1 || true)" + + if printf '%s\n' "$latest_signal" | grep -Eq 'inbox poll failed|backing off'; then + echo -e "${BOLD}broker connection:${RESET} reconnecting (recent inbox poll failure)" + elif printf '%s\n' "$latest_signal" | grep -Eq 'idle|pulled [0-9]+ message\(s\)'; then + echo -e "${BOLD}broker connection:${RESET} connected" + elif printf '%s\n' "$latest_signal" | grep -q 'Slack broker pull bridge is running'; then + echo -e "${BOLD}broker connection:${RESET} starting" + else + echo -e "${BOLD}broker connection:${RESET} unknown (bridge running, no recent poll telemetry)" + fi +} + pi_control_dir() { local agent_user="${1:-baudbot_agent}" echo "/home/$agent_user/.pi/session-control" @@ -473,6 +543,7 @@ case "${1:-}" in systemctl status baudbot "$@" || status_rc=$? echo "" print_deployed_version + print_broker_connection_status exit "$status_rc" else # Fallback: check if baudbot_agent has pi running @@ -484,6 +555,7 @@ case "${1:-}" in fi echo "" print_deployed_version + print_broker_connection_status fi ;; diff --git a/docs/operations.md b/docs/operations.md index a565400..68c184b 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -10,7 +10,7 @@ sudo baudbot start sudo baudbot stop sudo baudbot restart -# Status and logs +# Status and logs (status includes deployed version + broker connection state) sudo baudbot status sudo baudbot logs From 6227ee162cd21aaa07d24606351a0f7c8d887c0b Mon Sep 17 00:00:00 2001 From: Ben Vinegar Date: Sat, 21 Feb 2026 16:50:32 -0500 Subject: [PATCH 2/4] ops: tighten broker status logic in baudbot CLI --- bin/baudbot | 69 +++++++++++++++++++---------------------------------- 1 file changed, 25 insertions(+), 44 deletions(-) diff --git a/bin/baudbot b/bin/baudbot index d3c22b0..0216d4c 100755 --- a/bin/baudbot +++ b/bin/baudbot @@ -326,34 +326,11 @@ print_deployed_version() { echo -e "${BOLD}deployed version:${RESET} $line" } -is_broker_configured() { - local agent_user="${1:-baudbot_agent}" - local env_file="/home/$agent_user/.config/.env" - local required_key="" - local line="" - local value="" - +broker_mode_configured() { + local env_file="/home/${1:-baudbot_agent}/.config/.env" [ -r "$env_file" ] || return 1 - - for required_key in \ - SLACK_BROKER_URL \ - SLACK_BROKER_WORKSPACE_ID \ - SLACK_BROKER_SERVER_PRIVATE_KEY \ - SLACK_BROKER_SERVER_PUBLIC_KEY \ - SLACK_BROKER_SERVER_SIGNING_PRIVATE_KEY \ - SLACK_BROKER_PUBLIC_KEY \ - SLACK_BROKER_SIGNING_PUBLIC_KEY; do - line="$(grep -E "^${required_key}=" "$env_file" | tail -1 || true)" - [ -n "$line" ] || return 1 - - value="${line#*=}" - value="${value//[[:space:]]/}" - [ -n "$value" ] || return 1 - [ "$value" != "\"\"" ] || return 1 - [ "$value" != "''" ] || return 1 - done - - return 0 + grep -Eq '^SLACK_BROKER_URL=[^[:space:]].*$' "$env_file" || return 1 + grep -Eq '^SLACK_BROKER_WORKSPACE_ID=[^[:space:]].*$' "$env_file" || return 1 } print_broker_connection_status() { @@ -361,39 +338,43 @@ print_broker_connection_status() { local pane="" local latest_signal="" - if ! is_broker_configured "$agent_user"; then + if ! broker_mode_configured "$agent_user"; then echo -e "${BOLD}broker connection:${RESET} not configured" return 0 fi if [ "$(id -u)" -eq 0 ]; then - if ! sudo -u "$agent_user" tmux has-session -t slack-bridge 2>/dev/null; then + sudo -u "$agent_user" tmux has-session -t slack-bridge 2>/dev/null || { echo -e "${BOLD}broker connection:${RESET} disconnected (bridge tmux session not running)" return 0 - fi - pane="$(sudo -u "$agent_user" tmux capture-pane -p -t slack-bridge -S -400 2>/dev/null || true)" + } + pane="$(sudo -u "$agent_user" tmux capture-pane -p -t slack-bridge -S -200 2>/dev/null || true)" elif [ "$(id -un)" = "$agent_user" ]; then - if ! tmux has-session -t slack-bridge 2>/dev/null; then + tmux has-session -t slack-bridge 2>/dev/null || { echo -e "${BOLD}broker connection:${RESET} disconnected (bridge tmux session not running)" return 0 - fi - pane="$(tmux capture-pane -p -t slack-bridge -S -400 2>/dev/null || true)" + } + pane="$(tmux capture-pane -p -t slack-bridge -S -200 2>/dev/null || true)" else echo -e "${BOLD}broker connection:${RESET} configured (run with sudo for runtime status)" return 0 fi latest_signal="$(printf '%s\n' "$pane" | grep -E 'inbox poll failed|backing off|idle|pulled [0-9]+ message\(s\)|Slack broker pull bridge is running' | tail -1 || true)" - - if printf '%s\n' "$latest_signal" | grep -Eq 'inbox poll failed|backing off'; then - echo -e "${BOLD}broker connection:${RESET} reconnecting (recent inbox poll failure)" - elif printf '%s\n' "$latest_signal" | grep -Eq 'idle|pulled [0-9]+ message\(s\)'; then - echo -e "${BOLD}broker connection:${RESET} connected" - elif printf '%s\n' "$latest_signal" | grep -q 'Slack broker pull bridge is running'; then - echo -e "${BOLD}broker connection:${RESET} starting" - else - echo -e "${BOLD}broker connection:${RESET} unknown (bridge running, no recent poll telemetry)" - fi + case "$latest_signal" in + *"inbox poll failed"*|*"backing off"*) + echo -e "${BOLD}broker connection:${RESET} reconnecting (recent inbox poll failure)" + ;; + *"idle"*|*"pulled "*) + echo -e "${BOLD}broker connection:${RESET} connected" + ;; + *"Slack broker pull bridge is running"*) + echo -e "${BOLD}broker connection:${RESET} starting" + ;; + *) + echo -e "${BOLD}broker connection:${RESET} unknown (bridge running, no recent poll telemetry)" + ;; + esac } pi_control_dir() { From 642ab972ffd89dead17795f04231ca870c824848 Mon Sep 17 00:00:00 2001 From: Ben Vinegar Date: Sat, 21 Feb 2026 16:54:33 -0500 Subject: [PATCH 3/4] ops: add broker poll/inbound/outbound health reporting --- README.md | 2 +- bin/baudbot | 101 +++++++++++++++++--- docs/operations.md | 2 +- slack-bridge/broker-bridge.mjs | 163 ++++++++++++++++++++++++++++++--- 4 files changed, 241 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index d474233..60b1cbe 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ sudo baudbot deploy # start the service sudo baudbot start -# check health (includes deployed version + broker connection status) +# check health (includes deployed version + broker connection/health status) sudo baudbot status sudo baudbot doctor ``` diff --git a/bin/baudbot b/bin/baudbot index 0216d4c..a70512d 100755 --- a/bin/baudbot +++ b/bin/baudbot @@ -335,8 +335,10 @@ broker_mode_configured() { print_broker_connection_status() { local agent_user="${BAUDBOT_AGENT_USER:-baudbot_agent}" - local pane="" - local latest_signal="" + local health_file="/home/$agent_user/.pi/agent/broker-health.json" + local health_summary="" + local connection_state="" + local components_line="" if ! broker_mode_configured "$agent_user"; then echo -e "${BOLD}broker connection:${RESET} not configured" @@ -348,33 +350,108 @@ print_broker_connection_status() { echo -e "${BOLD}broker connection:${RESET} disconnected (bridge tmux session not running)" return 0 } - pane="$(sudo -u "$agent_user" tmux capture-pane -p -t slack-bridge -S -200 2>/dev/null || true)" elif [ "$(id -un)" = "$agent_user" ]; then tmux has-session -t slack-bridge 2>/dev/null || { echo -e "${BOLD}broker connection:${RESET} disconnected (bridge tmux session not running)" return 0 } - pane="$(tmux capture-pane -p -t slack-bridge -S -200 2>/dev/null || true)" else echo -e "${BOLD}broker connection:${RESET} configured (run with sudo for runtime status)" return 0 fi - latest_signal="$(printf '%s\n' "$pane" | grep -E 'inbox poll failed|backing off|idle|pulled [0-9]+ message\(s\)|Slack broker pull bridge is running' | tail -1 || true)" - case "$latest_signal" in - *"inbox poll failed"*|*"backing off"*) - echo -e "${BOLD}broker connection:${RESET} reconnecting (recent inbox poll failure)" - ;; - *"idle"*|*"pulled "*) + if [ ! -r "$health_file" ]; then + echo -e "${BOLD}broker connection:${RESET} starting" + echo -e "${BOLD}broker health:${RESET} unavailable (waiting for bridge health file)" + return 0 + fi + + health_summary="$(python3 - "$health_file" <<'PY' +import json +import sys +from datetime import datetime, timezone + +path = sys.argv[1] +with open(path, 'r', encoding='utf-8') as f: + h = json.load(f) + +def parse_iso(s): + if not s: + return None + try: + if s.endswith('Z'): + s = s[:-1] + '+00:00' + dt = datetime.fromisoformat(s) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except Exception: + return None + +def age_seconds(ts): + dt = parse_iso(ts) + if not dt: + return None + return (datetime.now(timezone.utc) - dt).total_seconds() + +def status(ok_ts, err_ts): + ok_dt = parse_iso(ok_ts) + err_dt = parse_iso(err_ts) + if err_dt and (not ok_dt or err_dt >= ok_dt): + return 'error' + if ok_dt: + return 'ok' + return 'unknown' + +poll = h.get('poll', {}) +inbound = h.get('inbound', {}) +ack = h.get('ack', {}) +outbound = h.get('outbound', {}) + +poll_age = age_seconds(poll.get('last_ok_at')) +poll_failures = int(poll.get('consecutive_failures') or 0) +poll_state = status(poll.get('last_ok_at'), poll.get('last_error_at')) + +if poll_state == 'error' and poll_failures > 0: + connection = 'reconnecting' +elif poll_age is not None and poll_age <= 120: + connection = 'connected' +elif poll_age is not None: + connection = 'stale' +else: + connection = 'starting' + +inbound_state = status(inbound.get('last_process_ok_at'), inbound.get('last_process_error_at')) +ack_state = status(ack.get('last_ok_at'), ack.get('last_error_at')) +outbound_state = status(outbound.get('last_ok_at'), outbound.get('last_error_at')) + +print(connection) +print(f'poll={poll_state} inbound={inbound_state} ack={ack_state} outbound={outbound_state}') +PY + )" + + connection_state="$(printf '%s\n' "$health_summary" | sed -n '1p')" + components_line="$(printf '%s\n' "$health_summary" | sed -n '2p')" + + case "$connection_state" in + connected) echo -e "${BOLD}broker connection:${RESET} connected" ;; - *"Slack broker pull bridge is running"*) + reconnecting) + echo -e "${BOLD}broker connection:${RESET} reconnecting" + ;; + stale) + echo -e "${BOLD}broker connection:${RESET} stale (no recent successful poll)" + ;; + starting) echo -e "${BOLD}broker connection:${RESET} starting" ;; *) - echo -e "${BOLD}broker connection:${RESET} unknown (bridge running, no recent poll telemetry)" + echo -e "${BOLD}broker connection:${RESET} unknown" ;; esac + + [ -n "$components_line" ] && echo -e "${BOLD}broker health:${RESET} $components_line" } pi_control_dir() { diff --git a/docs/operations.md b/docs/operations.md index 68c184b..fd58fb8 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -10,7 +10,7 @@ sudo baudbot start sudo baudbot stop sudo baudbot restart -# Status and logs (status includes deployed version + broker connection state) +# Status and logs (status includes deployed version + broker connection/health state) sudo baudbot status sudo baudbot logs diff --git a/slack-bridge/broker-bridge.mjs b/slack-bridge/broker-bridge.mjs index c362ed9..bc08a35 100755 --- a/slack-bridge/broker-bridge.mjs +++ b/slack-bridge/broker-bridge.mjs @@ -36,6 +36,7 @@ const POLL_INTERVAL_MS = parseInt(process.env.SLACK_BROKER_POLL_INTERVAL_MS || " const MAX_MESSAGES = parseInt(process.env.SLACK_BROKER_MAX_MESSAGES || "10", 10); const DEDUPE_TTL_MS = parseInt(process.env.SLACK_BROKER_DEDUPE_TTL_MS || String(20 * 60 * 1000), 10); const MAX_BACKOFF_MS = 30_000; +const BROKER_HEALTH_PATH = path.join(homedir(), ".pi", "agent", "broker-health.json"); function ts() { return new Date().toISOString(); @@ -95,6 +96,114 @@ let cryptoState = null; const dedupe = new Map(); +const brokerHealth = { + started_at: new Date().toISOString(), + updated_at: new Date().toISOString(), + outbound_mode: outboundMode, + broker_url: brokerBaseUrl, + workspace_id: workspaceId, + poll: { + last_ok_at: null, + last_error_at: null, + consecutive_failures: 0, + last_error: null, + }, + inbound: { + last_decrypt_ok_at: null, + last_decrypt_error_at: null, + last_process_ok_at: null, + last_process_error_at: null, + last_error: null, + }, + ack: { + last_ok_at: null, + last_error_at: null, + last_error: null, + }, + outbound: { + last_ok_at: null, + last_error_at: null, + last_error: null, + }, +}; + +function trimError(err) { + const msg = err instanceof Error ? err.message : String(err || "unknown error"); + return msg.slice(0, 400); +} + +function persistBrokerHealth() { + brokerHealth.updated_at = new Date().toISOString(); + const dir = path.dirname(BROKER_HEALTH_PATH); + const tmp = `${BROKER_HEALTH_PATH}.tmp`; + fs.mkdirSync(dir, { recursive: true }); + fs.writeFileSync(tmp, `${JSON.stringify(brokerHealth, null, 2)}\n`, { mode: 0o600 }); + fs.renameSync(tmp, BROKER_HEALTH_PATH); +} + +function markHealth(section, ok, err = null) { + const now = new Date().toISOString(); + + if (section === "poll") { + if (ok) { + brokerHealth.poll.last_ok_at = now; + brokerHealth.poll.consecutive_failures = 0; + brokerHealth.poll.last_error = null; + } else { + brokerHealth.poll.last_error_at = now; + brokerHealth.poll.consecutive_failures += 1; + brokerHealth.poll.last_error = trimError(err); + } + persistBrokerHealth(); + return; + } + + if (section === "inbound_decrypt") { + if (ok) { + brokerHealth.inbound.last_decrypt_ok_at = now; + } else { + brokerHealth.inbound.last_decrypt_error_at = now; + brokerHealth.inbound.last_error = trimError(err); + } + persistBrokerHealth(); + return; + } + + if (section === "inbound_process") { + if (ok) { + brokerHealth.inbound.last_process_ok_at = now; + } else { + brokerHealth.inbound.last_process_error_at = now; + brokerHealth.inbound.last_error = trimError(err); + } + persistBrokerHealth(); + return; + } + + if (section === "ack") { + if (ok) { + brokerHealth.ack.last_ok_at = now; + brokerHealth.ack.last_error = null; + } else { + brokerHealth.ack.last_error_at = now; + brokerHealth.ack.last_error = trimError(err); + } + persistBrokerHealth(); + return; + } + + if (section === "outbound") { + if (ok) { + brokerHealth.outbound.last_ok_at = now; + brokerHealth.outbound.last_error = null; + } else { + brokerHealth.outbound.last_error_at = now; + brokerHealth.outbound.last_error = trimError(err); + } + persistBrokerHealth(); + } +} + function toBase64(bytes) { return Buffer.from(bytes).toString("base64"); } @@ -329,15 +438,22 @@ async function sendViaBroker({ action, routing, body }) { const sig = sodium.crypto_sign_detached(canonical, cryptoState.serverSignSecretKey); const signature = toBase64(sig); - return brokerFetch("/api/send", { - workspace_id: workspaceId, - action, - routing, - encrypted_body: encryptedBody, - nonce: nonceB64, - timestamp, - signature, - }); + try { + const result = await brokerFetch("/api/send", { + workspace_id: workspaceId, + action, + routing, + encrypted_body: encryptedBody, + nonce: nonceB64, + timestamp, + signature, + }); + markHealth("outbound", true); + return result; + } catch (err) { + markHealth("outbound", false, err); + throw err; + } } /** @@ -386,11 +502,13 @@ async function sendDirectToSlack(apiMethod, params) { const error = data.error || response.statusText; throw new Error(`Slack API ${apiMethod} failed: ${sanitizeError(error)}`); } - + + markHealth("outbound", true); return data; } catch (err) { // Sanitize any error messages to prevent token leakage const sanitizedMessage = sanitizeError(err.message || String(err)); + markHealth("outbound", false, sanitizedMessage); throw new Error(sanitizedMessage); } } @@ -519,7 +637,15 @@ async function processPulledMessage(message) { throw new Error("invalid broker envelope signature"); } - const payload = decryptEnvelope(message); + let payload; + try { + payload = decryptEnvelope(message); + markHealth("inbound_decrypt", true); + } catch (err) { + markHealth("inbound_decrypt", false, err); + throw err; + } + logInfo(`📦 decrypted envelope — type: ${payload?.type || "unknown"}`); if (payload?.type !== "event_callback") { @@ -723,6 +849,7 @@ async function startPollLoop() { pruneDedupe(); const messages = await pullInbox(); + markHealth("poll", true); pollCount++; const ackIds = []; @@ -756,6 +883,7 @@ async function startPollLoop() { logInfo(`📩 processing message ${message.message_id}`); const ok = await processPulledMessage(message); if (ok) { + markHealth("inbound_process", true); dedupe.set(message.message_id, Date.now() + DEDUPE_TTL_MS); ackIds.push(message.message_id); logInfo(`✅ processed & acked message ${message.message_id}`); @@ -763,6 +891,7 @@ async function startPollLoop() { logWarn(`⚠️ message ${message.message_id} returned not-ok, will retry next poll`); } } catch (err) { + markHealth("inbound_process", false, err); const errMsg = err instanceof Error ? err.message : "unknown error"; const errStack = err instanceof Error ? err.stack : ""; logError(`❌ message processing failed (${message.message_id}): ${errMsg}`); @@ -777,13 +906,20 @@ async function startPollLoop() { } if (ackIds.length > 0) { - await ackInbox(ackIds); - logInfo(`📤 acked ${ackIds.length} message(s)`); + try { + await ackInbox(ackIds); + markHealth("ack", true); + logInfo(`📤 acked ${ackIds.length} message(s)`); + } catch (err) { + markHealth("ack", false, err); + throw err; + } } backoffMs = POLL_INTERVAL_MS; await sleep(POLL_INTERVAL_MS); } catch (err) { + markHealth("poll", false, err); const errMsg = err instanceof Error ? err.message : "unknown error"; const errStack = err instanceof Error ? err.stack : ""; logError(`❌ inbox poll failed: ${errMsg}`); @@ -811,6 +947,7 @@ async function startPollLoop() { refreshSocket(); startApiServer(); + persistBrokerHealth(); logInfo("⚡ Slack broker pull bridge is running!"); logInfo(` outbound mode: ${outboundMode} ${outboundMode === "direct" ? "(using SLACK_BOT_TOKEN)" : "(via broker)"}`); logInfo(` broker: ${brokerBaseUrl}`); From 3afaa946009be0cb737d82874e309db07d1419c8 Mon Sep 17 00:00:00 2001 From: Ben Vinegar Date: Sat, 21 Feb 2026 16:59:46 -0500 Subject: [PATCH 4/4] bridge: avoid poll health false negatives on ack failures --- slack-bridge/broker-bridge.mjs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/slack-bridge/broker-bridge.mjs b/slack-bridge/broker-bridge.mjs index bc08a35..bff9f13 100755 --- a/slack-bridge/broker-bridge.mjs +++ b/slack-bridge/broker-bridge.mjs @@ -845,10 +845,12 @@ async function startPollLoop() { const STATUS_LOG_INTERVAL_MS = 60_000; // log a status line every 60s even when idle while (true) { + let pollSucceeded = false; try { pruneDedupe(); const messages = await pullInbox(); + pollSucceeded = true; markHealth("poll", true); pollCount++; const ackIds = []; @@ -919,11 +921,18 @@ async function startPollLoop() { backoffMs = POLL_INTERVAL_MS; await sleep(POLL_INTERVAL_MS); } catch (err) { - markHealth("poll", false, err); - const errMsg = err instanceof Error ? err.message : "unknown error"; - const errStack = err instanceof Error ? err.stack : ""; - logError(`❌ inbox poll failed: ${errMsg}`); - if (errStack) logError(` stack: ${errStack}`); + if (!pollSucceeded) { + markHealth("poll", false, err); + const errMsg = err instanceof Error ? err.message : "unknown error"; + const errStack = err instanceof Error ? err.stack : ""; + logError(`❌ inbox poll failed: ${errMsg}`); + if (errStack) logError(` stack: ${errStack}`); + } else { + const errMsg = err instanceof Error ? err.message : "unknown error"; + const errStack = err instanceof Error ? err.stack : ""; + logError(`❌ broker cycle failed after successful poll: ${errMsg}`); + if (errStack) logError(` stack: ${errStack}`); + } logError(` ↳ backing off ${backoffMs}ms before next attempt`); await sleep(backoffMs); backoffMs = Math.min(MAX_BACKOFF_MS, Math.max(POLL_INTERVAL_MS, backoffMs * 2));