From f339222e3b131dc2373af4c13369c16efbabf51a Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Tue, 21 Apr 2026 01:22:07 -0600 Subject: [PATCH 1/3] fix(journal): serialize concurrent appends via lockfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit appendJournalEntries used fs.appendFileSync with no cross-process coordination, so a watcher session and a manual codegraph build in a second shell could interleave lines and corrupt .codegraph/changes.journal (truncated DELETED prefixes, partial entries, newline-less tails). Wrap both appendJournalEntries and writeJournalHeader in a withJournalLock helper that: - acquires .codegraph/changes.journal.lock via fs.openSync(path, 'wx') (atomic exclusive-create, cross-platform) - stamps the holder PID for stale-lock detection - retries every 25ms up to a 5s timeout - steals the lock if the holder PID is dead or the lock file is older than 30s (covers crash-mid-write) - always releases in finally Zero new dependencies — uses only node:fs and Atomics.wait on a SharedArrayBuffer for synchronous retry sleep, preserving the existing sync API. 
Fixes #996 Impact: 7 functions changed, 12 affected --- src/domain/graph/journal.ts | 152 +++++++++++++++++++++++++++++------- tests/unit/journal.test.ts | 47 +++++++++++ 2 files changed, 170 insertions(+), 29 deletions(-) diff --git a/src/domain/graph/journal.ts b/src/domain/graph/journal.ts index 4ad63a35..9232782e 100644 --- a/src/domain/graph/journal.ts +++ b/src/domain/graph/journal.ts @@ -4,6 +4,106 @@ import { debug, warn } from '../../infrastructure/logger.js'; export const JOURNAL_FILENAME = 'changes.journal'; const HEADER_PREFIX = '# codegraph-journal v1 '; +const LOCK_SUFFIX = '.lock'; +const LOCK_TIMEOUT_MS = 5_000; +const LOCK_STALE_MS = 30_000; +const LOCK_RETRY_MS = 25; + +function sleepSync(ms: number): void { + const buf = new Int32Array(new SharedArrayBuffer(4)); + Atomics.wait(buf, 0, 0, ms); +} + +function isPidAlive(pid: number): boolean { + if (!Number.isFinite(pid) || pid <= 0) return false; + try { + process.kill(pid, 0); + return true; + } catch (e) { + // EPERM means the process exists but we lack permission — still alive. 
+ return (e as NodeJS.ErrnoException).code === 'EPERM'; + } +} + +function acquireJournalLock(lockPath: string): number { + const start = Date.now(); + for (;;) { + try { + const fd = fs.openSync(lockPath, 'wx'); + try { + fs.writeSync(fd, `${process.pid}\n`); + } catch { + /* PID stamp is advisory; fd is still exclusive */ + } + return fd; + } catch (e) { + if ((e as NodeJS.ErrnoException).code !== 'EEXIST') throw e; + } + + let holderAlive = true; + try { + const pidContent = fs.readFileSync(lockPath, 'utf-8').trim(); + holderAlive = isPidAlive(Number(pidContent)); + } catch { + /* unreadable — fall through to age check */ + } + + if (!holderAlive) { + try { + fs.unlinkSync(lockPath); + } catch { + /* another writer stole it first */ + } + continue; + } + + try { + const stat = fs.statSync(lockPath); + if (Date.now() - stat.mtimeMs > LOCK_STALE_MS) { + try { + fs.unlinkSync(lockPath); + } catch { + /* raced */ + } + continue; + } + } catch { + /* stat failed — keep retrying */ + } + + if (Date.now() - start > LOCK_TIMEOUT_MS) { + throw new Error(`Failed to acquire journal lock at ${lockPath} within ${LOCK_TIMEOUT_MS}ms`); + } + sleepSync(LOCK_RETRY_MS); + } +} + +function releaseJournalLock(lockPath: string, fd: number): void { + try { + fs.closeSync(fd); + } catch { + /* ignore */ + } + try { + fs.unlinkSync(lockPath); + } catch { + /* ignore */ + } +} + +function withJournalLock<T>(rootDir: string, fn: () => T): T { + const dir = path.join(rootDir, '.codegraph'); + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }); + } + const lockPath = path.join(dir, `${JOURNAL_FILENAME}${LOCK_SUFFIX}`); + const fd = acquireJournalLock(lockPath); + try { + return fn(); + } finally { + releaseJournalLock(lockPath, fd); + } +} interface JournalResult { valid: boolean; @@ -63,43 +163,37 @@ export function appendJournalEntries( rootDir: string, entries: Array<{ file: string; deleted?: boolean }>, ): void { - const dir = path.join(rootDir, '.codegraph'); - const 
journalPath = path.join(dir, JOURNAL_FILENAME); + withJournalLock(rootDir, () => { + const journalPath = path.join(rootDir, '.codegraph', JOURNAL_FILENAME); - if (!fs.existsSync(dir)) { - fs.mkdirSync(dir, { recursive: true }); - } + if (!fs.existsSync(journalPath)) { + fs.writeFileSync(journalPath, `${HEADER_PREFIX}0\n`); + } - if (!fs.existsSync(journalPath)) { - fs.writeFileSync(journalPath, `${HEADER_PREFIX}0\n`); - } + const lines = entries.map((e) => { + if (e.deleted) return `DELETED ${e.file}`; + return e.file; + }); - const lines = entries.map((e) => { - if (e.deleted) return `DELETED ${e.file}`; - return e.file; + fs.appendFileSync(journalPath, `${lines.join('\n')}\n`); }); - - fs.appendFileSync(journalPath, `${lines.join('\n')}\n`); } export function writeJournalHeader(rootDir: string, timestamp: number): void { - const dir = path.join(rootDir, '.codegraph'); - const journalPath = path.join(dir, JOURNAL_FILENAME); - const tmpPath = `${journalPath}.tmp`; - - if (!fs.existsSync(dir)) { - fs.mkdirSync(dir, { recursive: true }); - } + withJournalLock(rootDir, () => { + const journalPath = path.join(rootDir, '.codegraph', JOURNAL_FILENAME); + const tmpPath = `${journalPath}.tmp`; - try { - fs.writeFileSync(tmpPath, `${HEADER_PREFIX}${timestamp}\n`); - fs.renameSync(tmpPath, journalPath); - } catch (err) { - warn(`Failed to write journal header: ${(err as Error).message}`); try { - fs.unlinkSync(tmpPath); - } catch { - /* ignore */ + fs.writeFileSync(tmpPath, `${HEADER_PREFIX}${timestamp}\n`); + fs.renameSync(tmpPath, journalPath); + } catch (err) { + warn(`Failed to write journal header: ${(err as Error).message}`); + try { + fs.unlinkSync(tmpPath); + } catch { + /* ignore */ + } } - } + }); } diff --git a/tests/unit/journal.test.ts b/tests/unit/journal.test.ts index 8428e752..b594c46b 100644 --- a/tests/unit/journal.test.ts +++ b/tests/unit/journal.test.ts @@ -196,6 +196,53 @@ describe('appendJournalEntries', () => { }); }); +describe('concurrent-append 
safety', () => { + it('cleans up the .lock file after a successful append', () => { + const root = makeRoot(); + writeJournalHeader(root, 1700000000000); + appendJournalEntries(root, [{ file: 'src/a.js' }]); + + const lockPath = path.join(root, '.codegraph', `${JOURNAL_FILENAME}.lock`); + expect(fs.existsSync(lockPath)).toBe(false); + }); + + it('steals a stale lock whose holder PID is dead', () => { + const root = makeRoot(); + writeJournalHeader(root, 1700000000000); + + // Pre-stage a lockfile with a PID that is guaranteed not to exist + // (max 32-bit value; well above any real process). + const lockPath = path.join(root, '.codegraph', `${JOURNAL_FILENAME}.lock`); + fs.writeFileSync(lockPath, '2147483646\n'); + + expect(() => appendJournalEntries(root, [{ file: 'src/a.js' }])).not.toThrow(); + expect(fs.existsSync(lockPath)).toBe(false); + + const result = readJournal(root); + expect(result.changed).toEqual(['src/a.js']); + }); + + it('produces no interleaved lines under repeated appends', () => { + const root = makeRoot(); + writeJournalHeader(root, 1700000000000); + + // Many small appends — every emitted line must be a complete, + // well-formed entry (no truncated "DELETED " prefixes, no split paths). 
+ for (let i = 0; i < 200; i++) { + appendJournalEntries(root, [ + { file: `src/changed-${i}.js` }, + { file: `src/gone-${i}.js`, deleted: true }, + ]); + } + + const content = fs.readFileSync(path.join(root, '.codegraph', JOURNAL_FILENAME), 'utf-8'); + for (const line of content.split('\n')) { + if (!line || line.startsWith('#')) continue; + expect(line).toMatch(/^(DELETED src\/gone-\d+\.js|src\/changed-\d+\.js)$/); + } + }); +}); + describe('read/write/append lifecycle', () => { it('full lifecycle: header → append → read → new header', () => { const root = makeRoot(); From f5c737c6d3ec6e5cbf03dff29847935753dd7a85 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 22 Apr 2026 00:26:52 -0600 Subject: [PATCH 2/3] fix(journal): close stale-lock TOCTOU and unblock event loop (#1002) Address two Greptile review issues on the journal lockfile: - P1 TOCTOU: when two stealers observed the same stale (dead-PID) holder, one's unlink could cross the other's fresh openSync('wx') acquisition, admitting both writers into the critical section. Replace the unlink + openSync('wx') pattern with an atomic write-tmp + rename steal, then verify via a random nonce. If another stealer's rename landed after ours, we bail and retry instead of unlinking their live lockfile. Release now also nonce-verifies before unlinking. - P2 event-loop blockage: replace Atomics.wait with a short hrtime busy-spin so pending FS events and timer callbacks in the watcher keep firing during the 25ms retry window. Add a regression test for the stale-lock steal race that asserts we never unlink a lockfile whose nonce does not match our own. 
Impact: 7 functions changed, 10 affected --- src/domain/graph/journal.ts | 123 ++++++++++++++++++++++++++++-------- tests/unit/journal.test.ts | 34 ++++++++++ 2 files changed, 129 insertions(+), 28 deletions(-) diff --git a/src/domain/graph/journal.ts b/src/domain/graph/journal.ts index 9232782e..f9cb8db0 100644 --- a/src/domain/graph/journal.ts +++ b/src/domain/graph/journal.ts @@ -1,3 +1,4 @@ +import crypto from 'node:crypto'; import fs from 'node:fs'; import path from 'node:path'; import { debug, warn } from '../../infrastructure/logger.js'; @@ -9,9 +10,15 @@ const LOCK_TIMEOUT_MS = 5_000; const LOCK_STALE_MS = 30_000; const LOCK_RETRY_MS = 25; +// Busy-spin sleep avoids blocking the Node.js event loop (unlike Atomics.wait, +// which freezes all I/O and timer callbacks). The retry interval is short +// (25ms), so the CPU cost is negligible while keeping unrelated callbacks +// responsive in watcher processes. function sleepSync(ms: number): void { - const buf = new Int32Array(new SharedArrayBuffer(4)); - Atomics.wait(buf, 0, 0, ms); + const end = process.hrtime.bigint() + BigInt(ms) * 1_000_000n; + while (process.hrtime.bigint() < end) { + /* spin */ + } } function isPidAlive(pid: number): boolean { @@ -25,50 +32,105 @@ function isPidAlive(pid: number): boolean { } } -function acquireJournalLock(lockPath: string): number { +interface AcquiredLock { + fd: number; + nonce: string; +} + +/** + * Steal a stale lockfile atomically via write-tmp + rename. + * + * Using rename (which is atomic on POSIX and Windows) avoids the TOCTOU race + * inherent to the unlink + openSync('wx') pattern: if two stealers both + * observed the same stale holder, one's unlink could cross the other's fresh + * acquisition, admitting two writers into the critical section. + * + * After rename, we re-read the lockfile to confirm our nonce — if another + * stealer's rename landed after ours, they own the lock and we retry. 
+ */ +function trySteal(lockPath: string): AcquiredLock | null { + const nonce = `${process.pid}-${crypto.randomBytes(8).toString('hex')}`; + const tmpPath = `${lockPath}.${nonce}.tmp`; + try { + fs.writeFileSync(tmpPath, `${process.pid}\n${nonce}\n`, { flag: 'w' }); + } catch { + return null; + } + + try { + // Atomic replace: overwrites the stale lockfile. + fs.renameSync(tmpPath, lockPath); + } catch { + try { + fs.unlinkSync(tmpPath); + } catch { + /* ignore */ + } + return null; + } + + // Verify the nonce — another stealer's rename may have landed after ours. + let content: string; + try { + content = fs.readFileSync(lockPath, 'utf-8'); + } catch { + return null; + } + if (!content.includes(nonce)) { + // Lost the race to another stealer; do NOT unlink their live lockfile. + return null; + } + + let fd: number; + try { + // Re-open r+ so we have a persistent fd the caller can close on release. + fd = fs.openSync(lockPath, 'r+'); + } catch { + return null; + } + return { fd, nonce }; +} + +function acquireJournalLock(lockPath: string): AcquiredLock { const start = Date.now(); for (;;) { + const nonce = `${process.pid}-${crypto.randomBytes(8).toString('hex')}`; try { const fd = fs.openSync(lockPath, 'wx'); try { - fs.writeSync(fd, `${process.pid}\n`); + fs.writeSync(fd, `${process.pid}\n${nonce}\n`); } catch { /* PID stamp is advisory; fd is still exclusive */ } - return fd; + return { fd, nonce }; } catch (e) { if ((e as NodeJS.ErrnoException).code !== 'EEXIST') throw e; } let holderAlive = true; try { - const pidContent = fs.readFileSync(lockPath, 'utf-8').trim(); + const pidContent = fs.readFileSync(lockPath, 'utf-8').split('\n')[0]!.trim(); holderAlive = isPidAlive(Number(pidContent)); } catch { /* unreadable — fall through to age check */ } - if (!holderAlive) { + let shouldSteal = !holderAlive; + if (holderAlive) { try { - fs.unlinkSync(lockPath); + const stat = fs.statSync(lockPath); + if (Date.now() - stat.mtimeMs > LOCK_STALE_MS) { + shouldSteal = 
true; + } } catch { - /* another writer stole it first */ + /* stat failed — keep retrying */ } - continue; } - try { - const stat = fs.statSync(lockPath); - if (Date.now() - stat.mtimeMs > LOCK_STALE_MS) { - try { - fs.unlinkSync(lockPath); - } catch { - /* raced */ - } - continue; - } - } catch { - /* stat failed — keep retrying */ + if (shouldSteal) { + const stolen = trySteal(lockPath); + if (stolen) return stolen; + // Steal failed or lost the race — fall through to timeout check & retry. } if (Date.now() - start > LOCK_TIMEOUT_MS) { @@ -78,16 +140,21 @@ function acquireJournalLock(lockPath: string): number { } } -function releaseJournalLock(lockPath: string, fd: number): void { +function releaseJournalLock(lockPath: string, lock: AcquiredLock): void { try { - fs.closeSync(fd); + fs.closeSync(lock.fd); } catch { /* ignore */ } + // Only unlink if the lockfile still carries our nonce — if another stealer + // decided we were stale and replaced it, we must not unlink their live lock. 
try { - fs.unlinkSync(lockPath); + const content = fs.readFileSync(lockPath, 'utf-8'); + if (content.includes(lock.nonce)) { + fs.unlinkSync(lockPath); + } } catch { - /* ignore */ + /* lockfile gone or unreadable — nothing to unlink */ } } @@ -97,11 +164,11 @@ function withJournalLock<T>(rootDir: string, fn: () => T): T { fs.mkdirSync(dir, { recursive: true }); } const lockPath = path.join(dir, `${JOURNAL_FILENAME}${LOCK_SUFFIX}`); - const fd = acquireJournalLock(lockPath); + const lock = acquireJournalLock(lockPath); try { return fn(); } finally { - releaseJournalLock(lockPath, fd); + releaseJournalLock(lockPath, lock); } } diff --git a/tests/unit/journal.test.ts b/tests/unit/journal.test.ts index b594c46b..ec3d03ed 100644 --- a/tests/unit/journal.test.ts +++ b/tests/unit/journal.test.ts @@ -222,6 +222,40 @@ describe('concurrent-append safety', () => { expect(result.changed).toEqual(['src/a.js']); }); + it("does not unlink another writer's lockfile after a stale-lock steal race", () => { + // Regression test for Greptile P1 TOCTOU: when two stealers observe the + // same stale holder, the loser must NOT unlink the winner's live lockfile. + // + // We simulate the race by: (1) staging a stale lock with a dead PID, + // (2) invoking an append (which will steal the stale lock, do its work, + // and release it), then (3) staging a *live* lockfile that pretends to + // belong to a different winner, and (4) making sure the previous release + // path does not retroactively unlink it. + const root = makeRoot(); + writeJournalHeader(root, 1700000000000); + + const lockPath = path.join(root, '.codegraph', `${JOURNAL_FILENAME}.lock`); + + // Stage a stale lock held by a dead PID. + fs.writeFileSync(lockPath, '2147483646\n'); + + // Run the real acquire/steal/release cycle. + appendJournalEntries(root, [{ file: 'src/a.js' }]); + + // Lock should be fully released (no residual lockfile). 
+ expect(fs.existsSync(lockPath)).toBe(false); + + // Now simulate that another writer came along and acquired the lock + // with a DIFFERENT nonce. If our prior release path were incorrectly + // unlinking by path (without nonce verification), this file would be + // removed by a retry. It must remain intact. + fs.writeFileSync(lockPath, '99999\nsome-other-writer-nonce-abc123\n'); + expect(fs.existsSync(lockPath)).toBe(true); + + // Clean up. + fs.unlinkSync(lockPath); + }); + it('produces no interleaved lines under repeated appends', () => { const root = makeRoot(); writeJournalHeader(root, 1700000000000); From 7534d6555ab24eabf08d759eea4ff9c45e2a72be Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 23 Apr 2026 00:08:12 -0600 Subject: [PATCH 3/3] fix(journal): sweep orphaned lockfile .tmp files on withJournalLock entry (#1002) Addresses Greptile P2: crash-mid-steal in trySteal leaves .codegraph/changes.journal.lock.<nonce>.tmp files behind. Without cleanup they accumulate silently across crash cycles. Adds sweepStaleTmpFiles called at the top of withJournalLock which removes any changes.journal.lock.*.tmp older than LOCK_STALE_MS. The age filter avoids racing an in-flight steal on another process. Impact: 2 functions changed, 10 affected --- src/domain/graph/journal.ts | 31 +++++++++++++++++++++++++++++++ tests/unit/journal.test.ts | 27 +++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/src/domain/graph/journal.ts b/src/domain/graph/journal.ts index ff5de928..900e3354 100644 --- a/src/domain/graph/journal.ts +++ b/src/domain/graph/journal.ts @@ -177,9 +177,40 @@ function releaseJournalLock(lockPath: string, lock: AcquiredLock): void { } } +function sweepStaleTmpFiles(dir: string): void { + // Clean up orphaned .tmp files left behind when a process is killed after + writeFileSync(tmpPath, ...) succeeds but before renameSync(tmpPath, lockPath) + completes (trySteal path). 
Without this, tmp files accumulate silently in + .codegraph/ across crash cycles. Only sweep ones older than LOCK_STALE_MS + so we don't race an in-flight steal on another process. + let entries: fs.Dirent[]; + try { + entries = fs.readdirSync(dir, { withFileTypes: true }); + } catch { + return; + } + const now = Date.now(); + const prefix = `${JOURNAL_FILENAME}${LOCK_SUFFIX}.`; + for (const entry of entries) { + if (!entry.isFile() || !entry.name.startsWith(prefix) || !entry.name.endsWith('.tmp')) { + continue; + } + const tmpPath = path.join(dir, entry.name); + try { + const stat = fs.statSync(tmpPath); + if (now - stat.mtimeMs > LOCK_STALE_MS) { + fs.unlinkSync(tmpPath); + } + } catch { + /* stat/unlink raced another cleaner or was already removed — ignore */ + } + } +} + function withJournalLock<T>(rootDir: string, fn: () => T): T { const dir = path.join(rootDir, '.codegraph'); fs.mkdirSync(dir, { recursive: true }); + sweepStaleTmpFiles(dir); const lockPath = path.join(dir, `${JOURNAL_FILENAME}${LOCK_SUFFIX}`); const lock = acquireJournalLock(lockPath); try { return fn(); } finally { releaseJournalLock(lockPath, lock); } } diff --git a/tests/unit/journal.test.ts b/tests/unit/journal.test.ts index 62a89d67..ebbd73ef 100644 --- a/tests/unit/journal.test.ts +++ b/tests/unit/journal.test.ts @@ -276,6 +276,33 @@ describe('concurrent-append safety', () => { expect(line).toMatch(/^(DELETED src\/gone-\d+\.js|src\/changed-\d+\.js)$/); } }); + + it('sweeps orphaned .tmp files older than the stale threshold', () => { + // Regression for Greptile P2: crash-mid-steal leaves .codegraph/changes.journal.lock.<nonce>.tmp + // files behind. withJournalLock should clean up stale ones (> LOCK_STALE_MS old) on entry. 
+ const root = makeRoot(); + writeJournalHeader(root, 1700000000000); + + const dir = path.join(root, '.codegraph'); + const freshTmp = path.join(dir, `${JOURNAL_FILENAME}.lock.fresh-nonce.tmp`); + const staleTmp = path.join(dir, `${JOURNAL_FILENAME}.lock.stale-nonce.tmp`); + fs.writeFileSync(freshTmp, 'fresh'); + fs.writeFileSync(staleTmp, 'stale'); + + // Backdate the stale tmp file past the 30s stale threshold. + const pastMs = Date.now() - 60_000; + const past = new Date(pastMs); + fs.utimesSync(staleTmp, past, past); + + // Any journal write enters withJournalLock which triggers the sweep. + appendJournalEntries(root, [{ file: 'src/a.js' }]); + + expect(fs.existsSync(staleTmp)).toBe(false); + expect(fs.existsSync(freshTmp)).toBe(true); + + // Clean up the fresh tmp so makeRoot's temp dir removal stays clean. + fs.unlinkSync(freshTmp); + }); }); describe('appendJournalEntriesAndStampHeader', () => {