From 566300571b9de441e723d0069aadb7874393945b Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Mon, 1 Jun 2026 03:06:30 +0800 Subject: [PATCH 1/3] fix(cron): auto-disable jobs whose target channel is unreachable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a cron job's send hits a permanent channel-access error (channel_not_found, not_in_channel, is_archived, token_revoked, etc.), the scheduler now disables the job and records the reason. Previously the same broken job would re-fire on every cron tick and capture an identical Sentry event for the lifetime of the daemon — see Sentry ODE-DEAMON-7 (5 events over ~5 weeks from one stuck job). - Add @/shared/delivery/permanent-error with cross-platform detection (Slack error strings, Discord numeric codes, Lark chat_not_found) and intentionally narrow classification: transient failures stay retryable. - Wire it into both the success-path send and the failure-notification send in runCronJob. The check on the notify path uses the *notify* error, not the original turn error, so a transient agent timeout doesn't disable a job when the channel is fine. - Add unit tests covering Slack/Discord/Lark error shapes and ensuring rate-limits / 5xx / ECONNRESET / message_not_found stay retryable. --- packages/core/cron/scheduler.ts | 98 ++++++++++++++++++- .../shared/delivery/permanent-error.test.ts | 67 +++++++++++++ packages/shared/delivery/permanent-error.ts | 94 ++++++++++++++++++ 3 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 packages/shared/delivery/permanent-error.test.ts create mode 100644 packages/shared/delivery/permanent-error.ts diff --git a/packages/core/cron/scheduler.ts b/packages/core/cron/scheduler.ts index 45b81c46..c756dc9a 100644 --- a/packages/core/cron/scheduler.ts +++ b/packages/core/cron/scheduler.ts @@ -15,6 +15,7 @@ import { markCronJobFailed, markCronJobRunning, markCronJobTriggered, + patchCronJob, reconcileInterruptedCronJobs, } from "@/config/local/cron-jobs"; import { @@ -37,6 +38,7 @@ import { buildSessionEnvironment, prepareSessionWorkspace } from "@/core/session import { sendChannelMessage as sendDiscordChannelMessage } from "@/ims/discord/client"; import { sendChannelMessage as sendLarkChannelMessage } from "@/ims/lark/client"; import { sendChannelMessage as sendSlackChannelMessage } from "@/ims/slack/client"; +import { isPermanentChannelError } from "@/shared/delivery/permanent-error"; import { log } from "@/utils"; const CRON_POLL_INTERVAL_MS = 15_000; @@ -265,6 +267,57 @@ async function prepareCronSession(job: CronJobRecord, runId: string): Promise<{ return { session, sessionId, cwd, created }; } +/** + * Auto-disable a cron job whose destination channel is permanently + * unreachable (bot kicked, channel archived, token revoked, etc.). Without + * this, every cron tick would re-attempt the same send and capture an + * identical Sentry event for the lifetime of the daemon — see + * `isPermanentChannelError` for the precise classification. + * + * Best-effort: we never let bookkeeping failures here shadow the original + * delivery error. The function does NOT throw. + */ +async function disableCronJobForPermanentChannelError( + job: CronJobRecord, + error: unknown, + agentResultDetailId: string | null, +): Promise { + const reason = error instanceof Error ? error.message : String(error); + const summary = `auto-disabled: destination channel unreachable (${reason})`; + if (agentResultDetailId) { + try { + failAgentResult({ detailId: agentResultDetailId, errorText: summary }); + } catch (failError) { + log.warn("Failed to mark cron agent_result detail as failed during auto-disable", { + detailId: agentResultDetailId, + error: String(failError), + }); + } + } + try { + patchCronJob(job.id, { enabled: false }); + } catch (disableError) { + log.warn("Failed to disable cron job after permanent channel error", { + cronJobId: job.id, + error: String(disableError), + }); + } + try { + markCronJobFailed(job.id, summary); + } catch (markError) { + log.warn("Failed to mark auto-disabled cron job as failed", { + cronJobId: job.id, + error: String(markError), + }); + } + log.warn("Auto-disabled cron job: destination channel unreachable", { + cronJobId: job.id, + title: job.title, + channelId: job.channelId, + error: reason, + }); +} + async function runCronJob(job: CronJobRecord, minuteStartMs: number): Promise { const agent = createAgentAdapter(); const runId = getCronRunId(minuteStartMs); @@ -350,7 +403,22 @@ async function runCronJob(job: CronJobRecord, minuteStartMs: number): Promise { + test("matches Slack channel_not_found in message", () => { + expect( + isPermanentChannelError( + new Error("An API error occurred: channel_not_found"), + ), + ).toBe(true); + }); + + test("matches Slack SDK error shape with data.error", () => { + const err = Object.assign(new Error("slack_webapi_platform_error"), { + data: { error: "channel_not_found" }, + }); + expect(isPermanentChannelError(err)).toBe(true); + }); + + test("matches Slack not_in_channel / is_archived / account_inactive", () => { + expect(isPermanentChannelError(new Error("not_in_channel"))).toBe(true); + expect(isPermanentChannelError(new Error("is_archived"))).toBe(true); + expect(isPermanentChannelError(new Error("channel_is_archived"))).toBe(true); + expect(isPermanentChannelError(new Error("account_inactive"))).toBe(true); + }); + + test("matches Slack auth-revoked errors", () => { + expect(isPermanentChannelError(new Error("token_revoked"))).toBe(true); + expect(isPermanentChannelError(new Error("invalid_auth"))).toBe(true); + expect(isPermanentChannelError(new Error("missing_scope"))).toBe(true); + }); + + test("matches Discord-style numeric codes (Missing Access, Unknown Channel)", () => { + expect( + isPermanentChannelError(Object.assign(new Error("Missing Access"), { code: 50001 })), + ).toBe(true); + expect( + isPermanentChannelError(Object.assign(new Error("Unknown Channel"), { code: 10003 })), + ).toBe(true); + }); + + test("matches Lark chat_not_found", () => { + expect(isPermanentChannelError(new Error("chat_not_found: chat does not exist"))).toBe(true); + }); + + test("ignores transient / retryable failures", () => { + expect(isPermanentChannelError(new Error("status 429"))).toBe(false); + expect(isPermanentChannelError(new Error("ECONNRESET"))).toBe(false); + expect(isPermanentChannelError(new Error("status 500"))).toBe(false); + expect(isPermanentChannelError(new Error("socket hang up"))).toBe(false); + expect(isPermanentChannelError(new Error("rate_limited"))).toBe(false); + }); + + test("ignores nullish / empty inputs", () => { + expect(isPermanentChannelError(null)).toBe(false); + expect(isPermanentChannelError(undefined)).toBe(false); + expect(isPermanentChannelError("")).toBe(false); + expect(isPermanentChannelError({})).toBe(false); + }); + + test("ignores message_not_found (that's a delete/update race, not permanent)", () => { + // `message_not_found` is already handled as a benign update/delete race + // in @/core/observability/sentry. It is NOT a channel-access problem and + // must remain retryable from this helper's perspective. + expect(isPermanentChannelError(new Error("message_not_found"))).toBe(false); + }); +}); diff --git a/packages/shared/delivery/permanent-error.ts b/packages/shared/delivery/permanent-error.ts new file mode 100644 index 00000000..bce81fb6 --- /dev/null +++ b/packages/shared/delivery/permanent-error.ts @@ -0,0 +1,94 @@ +/** + * Detect "permanent" channel-access delivery failures — cases where retrying + * the same send is guaranteed to fail until a human intervenes (bot removed + * from the channel, channel archived/deleted, workspace token revoked, etc.). + * + * This is used by the cron scheduler to auto-disable a job whose target + * channel is unreachable. Without this guard the scheduler keeps firing the + * job every cron tick, each failure capturing a fresh Sentry event with the + * same `channel_not_found` fingerprint — turning one broken job into a + * permanent noise source (see Sentry ODE-DEAMON-7). + * + * The detection is intentionally narrow: we only return `true` for errors + * whose remediation is *outside the daemon's control*. Transient failures + * (network blips, 5xx, rate limits) must remain retryable and therefore + * MUST NOT be classified as permanent here. + */ + +// Tokens are matched case-insensitively against the stringified error. +// +// Slack: +// - channel_not_found: channel doesn't exist OR bot can't see it +// - not_in_channel: bot needs to be invited to post +// - is_archived / channel_is_archived: channel was archived +// - account_inactive: workspace user/account disabled +// - token_revoked / invalid_auth / not_authed: token no longer works +// - missing_scope: token lacks chat:write or similar — retrying won't help +// +// Discord (discord.js DiscordAPIError exposes `code`): +// - 10003 Unknown Channel +// - 50001 Missing Access +// - 50013 Missing Permissions +// - 50007 Cannot send messages to this user +// +// Lark (open-platform error codes vary; we match the common "permission +// denied" / "chat not exist" message text): +// - chat_not_found / chat does not exist +// - permission denied (generic; only matched when paired with chat context) +const PERMANENT_MESSAGE_TOKENS = [ + // Slack + "channel_not_found", + "not_in_channel", + "is_archived", + "channel_is_archived", + "account_inactive", + "token_revoked", + "invalid_auth", + "not_authed", + "missing_scope", + // Lark + "chat_not_found", + "chat not exist", +]; + +// Discord.js surfaces the raw API code on `err.code` as a number. +const PERMANENT_DISCORD_CODES = new Set([ + 10003, // Unknown Channel + 50001, // Missing Access + 50013, // Missing Permissions + 50007, // Cannot send messages to this user +]); + +function stringifyError(err: unknown): string { + if (err instanceof Error) return err.message; + if (typeof err === "string") return err; + try { + return JSON.stringify(err); + } catch { + return String(err); + } +} + +export function isPermanentChannelError(err: unknown): boolean { + if (!err) return false; + const message = stringifyError(err).toLowerCase(); + for (const token of PERMANENT_MESSAGE_TOKENS) { + if (message.includes(token)) return true; + } + if (typeof err === "object" && err !== null) { + const code = (err as { code?: unknown }).code; + if (typeof code === "number" && PERMANENT_DISCORD_CODES.has(code)) { + return true; + } + // Slack SDK exposes the API error string on `err.data.error`. + const data = (err as { data?: { error?: unknown } }).data; + const apiError = data?.error; + if (typeof apiError === "string") { + const normalized = apiError.toLowerCase(); + for (const token of PERMANENT_MESSAGE_TOKENS) { + if (normalized.includes(token)) return true; + } + } + } + return false; +} From 46fe1725ca56c936dd3c0b7f4a66128877c7834d Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Tue, 2 Jun 2026 03:07:49 +0800 Subject: [PATCH 2/3] fix(cron): bypass channel revalidation when auto-disabling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Codex P2 review on PR #211. The previous patch routed auto-disable through patchCronJob -> updateCronJob -> getChannelSnapshot, which throws "Channel not found in configured workspaces" when the cron row's destination channel has since been removed from ode.json. In that exact case (stale local config, bot kicked, workspace re-onboarded) the auto-disable was silently swallowed and the row stayed enabled — the scheduler would keep firing and capturing the same channel_not_found Sentry event every tick, defeating the point of the PR. Add a direct-SQL disableCronJob(id) helper that flips enabled=0 without re-validating channel config, and use it from both call sites (success-path send and failure-notification send). Tests: new disableCronJob suite covers - happy path (enabled true -> false, returns true) - idempotent re-disable (returns false) - missing row (returns false) - stale-channel case where patchCronJob throws but disableCronJob still succeeds (the regression Codex flagged) bun test: 418 pass, 1 skip, 0 fail. --- packages/config/local/cron-jobs.test.ts | 76 +++++++++++++++++++++++++ packages/config/local/cron-jobs.ts | 25 ++++++++ packages/core/cron/scheduler.ts | 6 +- 3 files changed, 104 insertions(+), 3 deletions(-) diff --git a/packages/config/local/cron-jobs.test.ts b/packages/config/local/cron-jobs.test.ts index 2e47e081..b8a77dbd 100644 --- a/packages/config/local/cron-jobs.test.ts +++ b/packages/config/local/cron-jobs.test.ts @@ -9,10 +9,12 @@ import { clearCronJobsForTests, closeCronJobDatabaseForTests, createCronJob, + disableCronJob, getCronJobById, markCronJobCompleted, markCronJobFailed, markCronJobRunning, + patchCronJob, reconcileInterruptedCronJobs, } from "./cron-jobs"; @@ -202,3 +204,77 @@ describe("reconcileInterruptedCronJobs", () => { expect(getCronJobById(job.id)?.lastRunStatus).toBe("failed"); }); }); + +describe("disableCronJob", () => { + test("flips enabled to false and reports the transition", () => { + const job = createCronJob({ + title: "to-disable", + cronExpression: "*/5 * * * *", + channelId: "C_TEST", + messageText: "hi", + }); + expect(getCronJobById(job.id)?.enabled).toBe(true); + + const changed = disableCronJob(job.id); + expect(changed).toBe(true); + expect(getCronJobById(job.id)?.enabled).toBe(false); + }); + + test("returns false when the row is already disabled (idempotent)", () => { + const job = createCronJob({ + title: "already-off", + cronExpression: "*/5 * * * *", + channelId: "C_TEST", + messageText: "hi", + enabled: false, + }); + const changed = disableCronJob(job.id); + expect(changed).toBe(false); + expect(getCronJobById(job.id)?.enabled).toBe(false); + }); + + test("returns false when the row does not exist", () => { + expect(disableCronJob("does-not-exist")).toBe(false); + }); + + test("succeeds even when the row's channel was removed from local config", () => { + // This is the scenario `disableCronJob` exists to solve: + // a cron job points at a channel that has since been removed from + // ode.json. `patchCronJob` would throw via `getChannelSnapshot`, but + // the direct-SQL `disableCronJob` must still flip `enabled` to false + // so the scheduler stops firing the row. + const job = createCronJob({ + title: "stale-channel", + cronExpression: "*/5 * * * *", + channelId: "C_TEST", + messageText: "hi", + }); + + // Drop the channel from the on-disk config so getChannelSnapshot fails. + const emptyConfig = { + user: {}, + workspaces: [ + { + id: "ws-test", + name: "Test Workspace", + type: "slack", + channelDetails: [], // C_TEST is gone + }, + ], + }; + fs.writeFileSync(ODE_CONFIG_FILE, JSON.stringify(emptyConfig)); + invalidateOdeConfigCache(); + + // Sanity check: patchCronJob blows up because it re-validates the + // channel before writing. This is exactly the bug Codex flagged on + // PR #211 and is the reason we route around it. + expect(() => patchCronJob(job.id, { enabled: false })).toThrow( + /Channel not found/i, + ); + expect(getCronJobById(job.id)?.enabled).toBe(true); + + // disableCronJob bypasses channel resolution entirely. + expect(disableCronJob(job.id)).toBe(true); + expect(getCronJobById(job.id)?.enabled).toBe(false); + }); +}); diff --git a/packages/config/local/cron-jobs.ts b/packages/config/local/cron-jobs.ts index a07d7b4c..42621480 100644 --- a/packages/config/local/cron-jobs.ts +++ b/packages/config/local/cron-jobs.ts @@ -384,6 +384,31 @@ export function patchCronJob(id: string, params: PatchCronJobParams): CronJobRec return updateCronJob(id, merged); } +/** + * Directly flip `enabled` to 0 without re-validating the rest of the cron + * row. Unlike `patchCronJob`, this does NOT re-resolve the row's channel + * via `getChannelSnapshot`, so it is safe to call when the destination + * channel was removed from the local config — which is exactly the + * scenario that motivates auto-disable in the first place (bot kicked, + * workspace re-onboarded, channel id stale, etc.). + * + * Returns `true` if a row actually transitioned from enabled→disabled, so + * callers can avoid duplicate log/notification work if another writer beat + * them to it. A missing row returns `false` as well. + */ +export function disableCronJob(id: string): boolean { + const db = getDatabase(); + const result = db.query(` + UPDATE cron_jobs + SET + enabled = 0, + updated_at = ? + WHERE id = ? + AND enabled = 1 + `).run(Date.now(), id); + return result.changes > 0; +} + export function markCronJobTriggered(id: string, minuteStartMs: number): boolean { const db = getDatabase(); const result = db.query(` diff --git a/packages/core/cron/scheduler.ts b/packages/core/cron/scheduler.ts index c756dc9a..1a5e016a 100644 --- a/packages/core/cron/scheduler.ts +++ b/packages/core/cron/scheduler.ts @@ -9,13 +9,13 @@ import { } from "@/config"; import { type CronJobRecord, + disableCronJob, getCronJobById, listEnabledCronJobs, markCronJobCompleted, markCronJobFailed, markCronJobRunning, markCronJobTriggered, - patchCronJob, reconcileInterruptedCronJobs, } from "@/config/local/cron-jobs"; import { @@ -295,7 +295,7 @@ async function disableCronJobForPermanentChannelError( } } try { - patchCronJob(job.id, { enabled: false }); + disableCronJob(job.id); } catch (disableError) { log.warn("Failed to disable cron job after permanent channel error", { cronJobId: job.id, @@ -485,7 +485,7 @@ async function runCronJob(job: CronJobRecord, minuteStartMs: number): Promise Date: Wed, 3 Jun 2026 03:04:32 +0800 Subject: [PATCH 3/3] fix(discord): forward DiscordAPIError code through resolveTextChannel wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Codex P2 review on PR #211. resolveTextChannel() catches each client.channels.fetch() failure and then rethrows a generic Error("Discord channel ... is not text-based or inaccessible") with no `code` attached. That generic error reaches isPermanentChannelError(), which checks for numeric DiscordAPIErrorcode (10003 Unknown Channel / 50001 Missing Access / etc.) and falls through to false — so a Discord cron job whose bot has been removed from the target channel stays enabled and keeps firing every tick, exactly the noise pattern this PR exists to fix. Capture every Discord API code from each fetch attempt, pick a permanent one (10003 / 50001 / 50013 / 50007) when available, and attach it to the thrown wrapper as `code` (with the full list on `discordErrorCodes` for diagnostics). The wrapper's message is unchanged so callers that match on the string still work. Add a permanent-error.test.ts case asserting the wrapper Error with a forwarded code is classified as permanent. bun test: 419 pass, 1 skip, 0 fail. --- packages/ims/discord/client.ts | 31 ++++++++++++++++++- .../shared/delivery/permanent-error.test.ts | 12 +++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/packages/ims/discord/client.ts b/packages/ims/discord/client.ts index 2d021ffe..16d193cb 100644 --- a/packages/ims/discord/client.ts +++ b/packages/ims/discord/client.ts @@ -116,6 +116,20 @@ async function maybeHandleLauncherCommand(params: { } async function resolveTextChannel(channelId: string, processorId?: string) { const attempts: string[] = []; + // Collect Discord API error codes across every fetch attempt so the + // caller can detect permanent-access failures (Unknown Channel / Missing + // Access / etc.) even though we re-throw a generic wrapper Error below. + // discord.js surfaces these on `err.code` as numbers; we forward them on + // the wrapper as `discordErrorCodes` and also expose the first as `code` + // so `isPermanentChannelError` can pick it up via the `code` shape. + const discordErrorCodes: number[] = []; + const captureCode = (error: unknown) => { + if (typeof error === "object" && error !== null) { + const code = (error as { code?: unknown }).code; + if (typeof code === "number") discordErrorCodes.push(code); + } + }; + if (processorId) { const pinnedClient = discordClientByProcessorId.get(processorId); if (pinnedClient) { @@ -126,6 +140,7 @@ async function resolveTextChannel(channelId: string, processorId?: string) { } attempts.push(`bot=${pinnedClient.user?.id || "unknown"}: channel_not_text_or_missing`); } catch (error) { + captureCode(error); const errorMessage = error instanceof Error ? error.message : String(error); attempts.push(`bot=${pinnedClient.user?.id || "unknown"}: ${errorMessage}`); } @@ -141,6 +156,7 @@ async function resolveTextChannel(channelId: string, processorId?: string) { } attempts.push(`bot=${client.user?.id || "unknown"}: channel_not_text_or_missing`); } catch (error) { + captureCode(error); const errorMessage = error instanceof Error ? error.message : String(error); attempts.push(`bot=${client.user?.id || "unknown"}: ${errorMessage}`); } @@ -154,7 +170,20 @@ async function resolveTextChannel(channelId: string, processorId?: string) { }); } - throw new Error(`Discord channel ${channelId} is not text-based or inaccessible`); + const wrapper = new Error(`Discord channel ${channelId} is not text-based or inaccessible`); + if (discordErrorCodes.length > 0) { + // Prefer the most informative code: prioritise "permanent" classes + // (10003 Unknown Channel, 50001 Missing Access, 50013 Missing Perms, + // 50007 Cannot DM) so isPermanentChannelError can disable the cron row + // even if some bots failed transiently. + const PERMANENT_PRIORITY = new Set([10003, 50001, 50013, 50007]); + const permanent = discordErrorCodes.find((c) => PERMANENT_PRIORITY.has(c)); + (wrapper as Error & { code?: number; discordErrorCodes?: number[] }).code = + permanent ?? discordErrorCodes[0]; + (wrapper as Error & { code?: number; discordErrorCodes?: number[] }).discordErrorCodes = + [...discordErrorCodes]; + } + throw wrapper; } async function buildDiscordContext( diff --git a/packages/shared/delivery/permanent-error.test.ts b/packages/shared/delivery/permanent-error.test.ts index ef37e15c..84a57e74 100644 --- a/packages/shared/delivery/permanent-error.test.ts +++ b/packages/shared/delivery/permanent-error.test.ts @@ -39,6 +39,18 @@ describe("isPermanentChannelError", () => { ).toBe(true); }); + test("matches the Discord resolveTextChannel wrapper when it carries a DiscordAPIError code", () => { + // resolveTextChannel() in packages/ims/discord/client.ts wraps the + // underlying DiscordAPIError in a generic Error("... is not text-based + // or inaccessible"). The wrapper forwards the original numeric `code` + // so this helper can still classify it as permanent. + const wrapper = Object.assign( + new Error("Discord channel 123 is not text-based or inaccessible"), + { code: 10003 }, + ); + expect(isPermanentChannelError(wrapper)).toBe(true); + }); + test("matches Lark chat_not_found", () => { expect(isPermanentChannelError(new Error("chat_not_found: chat does not exist"))).toBe(true); });