diff --git a/src/domain/graph/journal.ts b/src/domain/graph/journal.ts index b77399c9..900e3354 100644 --- a/src/domain/graph/journal.ts +++ b/src/domain/graph/journal.ts @@ -1,9 +1,224 @@ +import crypto from 'node:crypto'; import fs from 'node:fs'; import path from 'node:path'; import { debug, warn } from '../../infrastructure/logger.js'; export const JOURNAL_FILENAME = 'changes.journal'; const HEADER_PREFIX = '# codegraph-journal v1 '; +const LOCK_SUFFIX = '.lock'; +const LOCK_TIMEOUT_MS = 5_000; +const LOCK_STALE_MS = 30_000; +const LOCK_RETRY_MS = 25; + +// Synchronous sleep via busy-spin. NOTE: this blocks the Node.js event loop +// for the full duration (any synchronous wait does, Atomics.wait included) +// and burns CPU while spinning; it is tolerable here only because each lock +// retry pause is short (25ms) and acquisition is bounded by LOCK_TIMEOUT_MS. +function sleepSync(ms: number): void { + const end = process.hrtime.bigint() + BigInt(ms) * 1_000_000n; + while (process.hrtime.bigint() < end) { + /* spin */ + } +} + +function isPidAlive(pid: number): boolean { + if (!Number.isFinite(pid) || pid <= 0) return false; + try { + process.kill(pid, 0); + return true; + } catch (e) { + // EPERM means the process exists but we lack permission — still alive. + return (e as NodeJS.ErrnoException).code === 'EPERM'; + } +} + +interface AcquiredLock { + fd: number; + nonce: string; +} + +/** + * Steal a stale lockfile atomically via write-tmp + rename. + * + * Using rename (which is atomic on POSIX and Windows) avoids the TOCTOU race + * inherent to the unlink + openSync('wx') pattern: if two stealers both + * observed the same stale holder, one's unlink could cross the other's fresh + * acquisition, admitting two writers into the critical section. + * + * After rename, we re-read the lockfile to confirm our nonce — if another + * stealer's rename landed after ours, they own the lock and we retry. 
+ */ +function trySteal(lockPath: string): AcquiredLock | null { + const nonce = `${process.pid}-${crypto.randomBytes(8).toString('hex')}`; + const tmpPath = `${lockPath}.${nonce}.tmp`; + try { + fs.writeFileSync(tmpPath, `${process.pid}\n${nonce}\n`, { flag: 'w' }); + } catch { + return null; + } + + try { + // Atomic replace: overwrites the stale lockfile. + fs.renameSync(tmpPath, lockPath); + } catch { + try { + fs.unlinkSync(tmpPath); + } catch { + /* ignore */ + } + return null; + } + + // Verify the nonce — another stealer's rename may have landed after ours. + let content: string; + try { + content = fs.readFileSync(lockPath, 'utf-8'); + } catch { + return null; + } + if (!content.includes(nonce)) { + // Lost the race to another stealer; do NOT unlink their live lockfile. + return null; + } + + let fd: number; + try { + // Re-open r+ so we have a persistent fd the caller can close on release. + fd = fs.openSync(lockPath, 'r+'); + } catch { + return null; + } + return { fd, nonce }; +} + +function acquireJournalLock(lockPath: string): AcquiredLock { + const start = Date.now(); + for (;;) { + const nonce = `${process.pid}-${crypto.randomBytes(8).toString('hex')}`; + try { + const fd = fs.openSync(lockPath, 'wx'); + try { + fs.writeSync(fd, `${process.pid}\n${nonce}\n`); + } catch { + // Stamp write failed (ENOSPC, I/O error). An empty lockfile would + // look stale to concurrent waiters (Number('') === 0, isPidAlive(0) + // returns false), so they'd steal our live lock. Release and retry. 
+ try { + fs.closeSync(fd); + } catch { + /* ignore */ + } + try { + fs.unlinkSync(lockPath); + } catch { + /* ignore */ + } + if (Date.now() - start > LOCK_TIMEOUT_MS) { + throw new Error( + `Failed to acquire journal lock at ${lockPath} within ${LOCK_TIMEOUT_MS}ms`, + ); + } + sleepSync(LOCK_RETRY_MS); + continue; + } + return { fd, nonce }; + } catch (e) { + if ((e as NodeJS.ErrnoException).code !== 'EEXIST') throw e; + } + + let holderAlive = true; + try { + const pidContent = fs.readFileSync(lockPath, 'utf-8').split('\n')[0]!.trim(); + holderAlive = isPidAlive(Number(pidContent)); + } catch { + /* unreadable — fall through to age check */ + } + + let shouldSteal = !holderAlive; + if (holderAlive) { + try { + const stat = fs.statSync(lockPath); + if (Date.now() - stat.mtimeMs > LOCK_STALE_MS) { + shouldSteal = true; + } + } catch { + /* stat failed — keep retrying */ + } + } + + if (shouldSteal) { + const stolen = trySteal(lockPath); + if (stolen) return stolen; + // Steal failed or lost the race — fall through to timeout check & retry. + } + + if (Date.now() - start > LOCK_TIMEOUT_MS) { + throw new Error(`Failed to acquire journal lock at ${lockPath} within ${LOCK_TIMEOUT_MS}ms`); + } + sleepSync(LOCK_RETRY_MS); + } +} + +function releaseJournalLock(lockPath: string, lock: AcquiredLock): void { + try { + fs.closeSync(lock.fd); + } catch { + /* ignore */ + } + // Only unlink if the lockfile still carries our nonce — if another stealer + // decided we were stale and replaced it, we must not unlink their live lock. + try { + const content = fs.readFileSync(lockPath, 'utf-8'); + if (content.includes(lock.nonce)) { + fs.unlinkSync(lockPath); + } + } catch { + /* lockfile gone or unreadable — nothing to unlink */ + } +} + +function sweepStaleTmpFiles(dir: string): void { + // Clean up orphaned .tmp files left behind when a process is killed after + // writeFileSync(tmpPath, ...) succeeds but before renameSync(tmpPath, lockPath) + // completes (trySteal path). 
Without this, tmp files accumulate silently in + // .codegraph/ across crash cycles. Only sweep ones older than LOCK_STALE_MS + // so we don't race an in-flight steal on another process. + let entries: fs.Dirent[]; + try { + entries = fs.readdirSync(dir, { withFileTypes: true }); + } catch { + return; + } + const now = Date.now(); + const prefix = `${JOURNAL_FILENAME}${LOCK_SUFFIX}.`; + for (const entry of entries) { + if (!entry.isFile() || !entry.name.startsWith(prefix) || !entry.name.endsWith('.tmp')) { + continue; + } + const tmpPath = path.join(dir, entry.name); + try { + const stat = fs.statSync(tmpPath); + if (now - stat.mtimeMs > LOCK_STALE_MS) { + fs.unlinkSync(tmpPath); + } + } catch { + /* stat/unlink raced another cleaner or was already removed — ignore */ + } + } +} + +function withJournalLock<T>(rootDir: string, fn: () => T): T { + const dir = path.join(rootDir, '.codegraph'); + fs.mkdirSync(dir, { recursive: true }); + sweepStaleTmpFiles(dir); + const lockPath = path.join(dir, `${JOURNAL_FILENAME}${LOCK_SUFFIX}`); + const lock = acquireJournalLock(lockPath); + try { + return fn(); + } finally { + releaseJournalLock(lockPath, lock); + } +} interface JournalResult { valid: boolean; @@ -63,41 +278,39 @@ export function appendJournalEntries( rootDir: string, entries: Array<{ file: string; deleted?: boolean }>, ): void { - const dir = path.join(rootDir, '.codegraph'); - const journalPath = path.join(dir, JOURNAL_FILENAME); + withJournalLock(rootDir, () => { + const journalPath = path.join(rootDir, '.codegraph', JOURNAL_FILENAME); - fs.mkdirSync(dir, { recursive: true }); + if (!fs.existsSync(journalPath)) { + fs.writeFileSync(journalPath, `${HEADER_PREFIX}0\n`); + } - if (!fs.existsSync(journalPath)) { - fs.writeFileSync(journalPath, `${HEADER_PREFIX}0\n`); - } + const lines = entries.map((e) => { + if (e.deleted) return `DELETED ${e.file}`; + return e.file; + }); - const lines = entries.map((e) => { - if (e.deleted) return `DELETED ${e.file}`; - return 
e.file; + fs.appendFileSync(journalPath, `${lines.join('\n')}\n`); }); - - fs.appendFileSync(journalPath, `${lines.join('\n')}\n`); } export function writeJournalHeader(rootDir: string, timestamp: number): void { - const dir = path.join(rootDir, '.codegraph'); - const journalPath = path.join(dir, JOURNAL_FILENAME); - const tmpPath = `${journalPath}.tmp`; - - fs.mkdirSync(dir, { recursive: true }); + withJournalLock(rootDir, () => { + const journalPath = path.join(rootDir, '.codegraph', JOURNAL_FILENAME); + const tmpPath = `${journalPath}.tmp`; - try { - fs.writeFileSync(tmpPath, `${HEADER_PREFIX}${timestamp}\n`); - fs.renameSync(tmpPath, journalPath); - } catch (err) { - warn(`Failed to write journal header: ${(err as Error).message}`); try { - fs.unlinkSync(tmpPath); - } catch { - /* ignore */ + fs.writeFileSync(tmpPath, `${HEADER_PREFIX}${timestamp}\n`); + fs.renameSync(tmpPath, journalPath); + } catch (err) { + warn(`Failed to write journal header: ${(err as Error).message}`); + try { + fs.unlinkSync(tmpPath); + } catch { + /* ignore */ + } } - } + }); } /** @@ -116,35 +329,34 @@ export function appendJournalEntriesAndStampHeader( entries: Array<{ file: string; deleted?: boolean }>, timestamp: number, ): void { - const dir = path.join(rootDir, '.codegraph'); - const journalPath = path.join(dir, JOURNAL_FILENAME); - const tmpPath = `${journalPath}.tmp`; - - fs.mkdirSync(dir, { recursive: true }); + withJournalLock(rootDir, () => { + const journalPath = path.join(rootDir, '.codegraph', JOURNAL_FILENAME); + const tmpPath = `${journalPath}.tmp`; - let existingBody = ''; - try { - const content = fs.readFileSync(journalPath, 'utf-8'); - const newlineIdx = content.indexOf('\n'); - if (newlineIdx >= 0) existingBody = content.slice(newlineIdx + 1); - } catch { - /* no existing journal — fall through to write header + new entries */ - } - if (existingBody && !existingBody.endsWith('\n')) existingBody = `${existingBody}\n`; + let existingBody = ''; + try { + const content 
= fs.readFileSync(journalPath, 'utf-8'); + const newlineIdx = content.indexOf('\n'); + if (newlineIdx >= 0) existingBody = content.slice(newlineIdx + 1); + } catch { + /* no existing journal — fall through to write header + new entries */ + } + if (existingBody && !existingBody.endsWith('\n')) existingBody = `${existingBody}\n`; - const newLines = entries.map((e) => (e.deleted ? `DELETED ${e.file}` : e.file)); - const appended = newLines.length > 0 ? `${newLines.join('\n')}\n` : ''; - const content = `${HEADER_PREFIX}${timestamp}\n${existingBody}${appended}`; + const newLines = entries.map((e) => (e.deleted ? `DELETED ${e.file}` : e.file)); + const appended = newLines.length > 0 ? `${newLines.join('\n')}\n` : ''; + const content = `${HEADER_PREFIX}${timestamp}\n${existingBody}${appended}`; - try { - fs.writeFileSync(tmpPath, content); - fs.renameSync(tmpPath, journalPath); - } catch (err) { - warn(`Failed to update journal: ${(err as Error).message}`); try { - fs.unlinkSync(tmpPath); - } catch { - /* ignore */ + fs.writeFileSync(tmpPath, content); + fs.renameSync(tmpPath, journalPath); + } catch (err) { + warn(`Failed to update journal: ${(err as Error).message}`); + try { + fs.unlinkSync(tmpPath); + } catch { + /* ignore */ + } } - } + }); } diff --git a/tests/unit/journal.test.ts b/tests/unit/journal.test.ts index a0dbfa78..ebbd73ef 100644 --- a/tests/unit/journal.test.ts +++ b/tests/unit/journal.test.ts @@ -197,6 +197,114 @@ describe('appendJournalEntries', () => { }); }); +describe('concurrent-append safety', () => { + it('cleans up the .lock file after a successful append', () => { + const root = makeRoot(); + writeJournalHeader(root, 1700000000000); + appendJournalEntries(root, [{ file: 'src/a.js' }]); + + const lockPath = path.join(root, '.codegraph', `${JOURNAL_FILENAME}.lock`); + expect(fs.existsSync(lockPath)).toBe(false); + }); + + it('steals a stale lock whose holder PID is dead', () => { + const root = makeRoot(); + writeJournalHeader(root, 
1700000000000); + + // Pre-stage a lockfile with a PID that is guaranteed not to exist + // (max 32-bit value; well above any real process). + const lockPath = path.join(root, '.codegraph', `${JOURNAL_FILENAME}.lock`); + fs.writeFileSync(lockPath, '2147483646\n'); + + expect(() => appendJournalEntries(root, [{ file: 'src/a.js' }])).not.toThrow(); + expect(fs.existsSync(lockPath)).toBe(false); + + const result = readJournal(root); + expect(result.changed).toEqual(['src/a.js']); + }); + + it("does not unlink another writer's lockfile after a stale-lock steal race", () => { + // Regression test for Greptile P1 TOCTOU: when two stealers observe the + // same stale holder, the loser must NOT unlink the winner's live lockfile. + // + // We simulate the race by: (1) staging a stale lock with a dead PID, + // (2) invoking an append (which will steal the stale lock, do its work, + // and release it), then (3) staging a *live* lockfile that pretends to + // belong to a different winner, and (4) making sure the previous release + // path does not retroactively unlink it. + const root = makeRoot(); + writeJournalHeader(root, 1700000000000); + + const lockPath = path.join(root, '.codegraph', `${JOURNAL_FILENAME}.lock`); + + // Stage a stale lock held by a dead PID. + fs.writeFileSync(lockPath, '2147483646\n'); + + // Run the real acquire/steal/release cycle. + appendJournalEntries(root, [{ file: 'src/a.js' }]); + + // Lock should be fully released (no residual lockfile). + expect(fs.existsSync(lockPath)).toBe(false); + + // Now simulate that another writer came along and acquired the lock + // with a DIFFERENT nonce. If our prior release path were incorrectly + // unlinking by path (without nonce verification), this file would be + // removed by a retry. It must remain intact. + fs.writeFileSync(lockPath, '99999\nsome-other-writer-nonce-abc123\n'); + expect(fs.existsSync(lockPath)).toBe(true); + + // Clean up. 
+ fs.unlinkSync(lockPath); + }); + + it('produces no interleaved lines under repeated appends', () => { + const root = makeRoot(); + writeJournalHeader(root, 1700000000000); + + // Many small appends — every emitted line must be a complete, + // well-formed entry (no truncated "DELETED " prefixes, no split paths). + for (let i = 0; i < 200; i++) { + appendJournalEntries(root, [ + { file: `src/changed-${i}.js` }, + { file: `src/gone-${i}.js`, deleted: true }, + ]); + } + + const content = fs.readFileSync(path.join(root, '.codegraph', JOURNAL_FILENAME), 'utf-8'); + for (const line of content.split('\n')) { + if (!line || line.startsWith('#')) continue; + expect(line).toMatch(/^(DELETED src\/gone-\d+\.js|src\/changed-\d+\.js)$/); + } + }); + + it('sweeps orphaned .tmp files older than the stale threshold', () => { + // Regression for Greptile P2: crash-mid-steal leaves .codegraph/changes.journal.lock..tmp + // files behind. withJournalLock should clean up stale ones (> LOCK_STALE_MS old) on entry. + const root = makeRoot(); + writeJournalHeader(root, 1700000000000); + + const dir = path.join(root, '.codegraph'); + const freshTmp = path.join(dir, `${JOURNAL_FILENAME}.lock.fresh-nonce.tmp`); + const staleTmp = path.join(dir, `${JOURNAL_FILENAME}.lock.stale-nonce.tmp`); + fs.writeFileSync(freshTmp, 'fresh'); + fs.writeFileSync(staleTmp, 'stale'); + + // Backdate the stale tmp file past the 30s stale threshold. + const pastMs = Date.now() - 60_000; + const past = new Date(pastMs); + fs.utimesSync(staleTmp, past, past); + + // Any journal write enters withJournalLock which triggers the sweep. + appendJournalEntries(root, [{ file: 'src/a.js' }]); + + expect(fs.existsSync(staleTmp)).toBe(false); + expect(fs.existsSync(freshTmp)).toBe(true); + + // Clean up the fresh tmp so makeRoot's temp dir removal stays clean. 
+ fs.unlinkSync(freshTmp); + }); +}); + describe('appendJournalEntriesAndStampHeader', () => { it('creates journal with header + entries when none exists', () => { const root = makeRoot();