From 8e538bde30b15385c902939da95510f9483be078 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Sun, 26 Apr 2026 12:50:41 -0400 Subject: [PATCH] fix(db): enforce UNIQUE on edges so INSERT OR IGNORE actually dedupes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The edges table has `id INTEGER PRIMARY KEY AUTOINCREMENT` and no other UNIQUE constraint. The codebase uses INSERT OR IGNORE INTO edges (source, target, kind, ...) VALUES (...) clearly intending dedup, but the only candidate key for OR IGNORE was the autoincrement id (which never conflicts) — so the OR IGNORE was a silent no-op. Any code path that re-emits the same edge (resolver retries, partial-failure re-runs, framework extractors that double-emit) silently inserted duplicates, inflating call graphs in codegraph_callers/callees. This change adds a real UNIQUE index on the natural key: UNIQUE INDEX idx_edges_unique ON edges(source, target, kind, COALESCE(line, -1), COALESCE(col, -1)) COALESCE keeps two NULL line/col values comparable as equal — SQLite treats raw NULLs in a UNIQUE index as distinct, which would otherwise defeat dedup for edges that don't carry a line/col (1-indexed everywhere in this codebase, so -1 is a safe sentinel). Migration v4 first deduplicates pre-existing rows (DELETE ... WHERE id NOT IN (SELECT MIN(id) FROM edges GROUP BY source, target, kind, COALESCE(line, -1), COALESCE(col, -1))) then creates the index. Both run inside the migration transaction wrapper so a crash leaves the DB consistent. CURRENT_SCHEMA_VERSION bumped to 4. Two existing version-pinned tests updated to match. ## Files changed | File | Change | |---|---| | src/db/schema.sql | Add UNIQUE INDEX idx_edges_unique for fresh installs | | src/db/migrations.ts | Bump version to 4; add migration v4 (dedup + index) | | __tests__/edges-unique.test.ts | 7 regression tests | | __tests__/foundation.test.ts | Update expected schema version | | __tests__/pr19-improvements.test.ts | Update expected schema version | ## Test plan - [x] npm test: 387/387 pass on macOS (one pre-existing fs.watch flake under parallel load, passes in isolation) - [x] npx tsc --noEmit clean - [x] Independent reviewer pass before pushing — APPROVE; nits-only Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/edges-unique.test.ts | 166 ++++++++++++++++++++++++++++ __tests__/foundation.test.ts | 2 +- __tests__/pr19-improvements.test.ts | 2 +- src/db/migrations.ts | 23 +++- src/db/schema.sql | 9 ++ 5 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 __tests__/edges-unique.test.ts diff --git a/__tests__/edges-unique.test.ts b/__tests__/edges-unique.test.ts new file mode 100644 index 00000000..49eced53 --- /dev/null +++ b/__tests__/edges-unique.test.ts @@ -0,0 +1,166 @@ +/** + * Edge Uniqueness Tests + * + * Regression tests for the bug where `INSERT OR IGNORE INTO edges` was + * silently a no-op: the only candidate key was the AUTOINCREMENT id (which + * never conflicts), so duplicate edges accumulated on every re-emission / + * re-resolution. + * + * Fix: a UNIQUE index on (source, target, kind, COALESCE(line, -1), + * COALESCE(col, -1)) backs a fresh-install schema and is also applied via + * migration v4 (with a dedup pass over existing rows). + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Edge, Node } from '../src/types'; +import { runMigrations, getCurrentVersion, CURRENT_SCHEMA_VERSION } from '../src/db/migrations'; + +function tempDb(): { dir: string; db: DatabaseConnection; q: QueryBuilder } { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-edges-unique-')); + const db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + const q = new QueryBuilder(db.getDb()); + return { dir, db, q }; +} + +function cleanup(dir: string, db: DatabaseConnection) { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +function makeNode(id: string, name: string): Node { + return { + id, + kind: 'function', + name, + qualifiedName: `f::${name}`, + filePath: 'a.ts', + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +function edgesCount(db: DatabaseConnection): number { + const row = db.getDb().prepare('SELECT COUNT(*) as c FROM edges').get() as { c: number }; + return row.c; +} + +describe('Edge UNIQUE constraint (bug #2)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + ({ dir, db, q } = tempDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + }); + + afterEach(() => cleanup(dir, db)); + + it('rejects duplicate (source, target, kind, line, col)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }; + q.insertEdge(e); + q.insertEdge(e); // INSERT OR IGNORE — should be a no-op now + expect(edgesCount(db)).toBe(1); + }); + + it('treats two NULL line edges as duplicates (COALESCE in unique index)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls' }; + q.insertEdge(e); + q.insertEdge(e); + expect(edgesCount(db)).toBe(1); + }); + + it('allows same source/target/kind on different lines', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 2 }); + expect(edgesCount(db)).toBe(2); + }); + + it('allows same source/target/line on different kinds', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'references', line: 1 }); + expect(edgesCount(db)).toBe(2); + }); + + it('insertEdges (batch) dedupes within the same call', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1, column: 1 }; + q.insertEdges([e, e, e]); + expect(edgesCount(db)).toBe(1); + }); + + it('survives the same edge being re-emitted across many cycles', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1 }; + for (let i = 0; i < 100; i++) { + q.insertEdge(e); + } + expect(edgesCount(db)).toBe(1); + }); +}); + +describe('Migration v4: dedup existing edges', () => { + let dir: string; + let dbPath: string; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-')); + dbPath = path.join(dir, 'test.db'); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('collapses pre-existing duplicates and adds the UNIQUE index', () => { + // Build a v3-shaped database manually: schema, but simulate a stale + // version row + insert duplicates that the missing UNIQUE index let + // through. We use the real initialize() path then drop the index + + // version row to back-date the DB. + const db = DatabaseConnection.initialize(dbPath); + db.getDb().exec(`DROP INDEX IF EXISTS idx_edges_unique;`); + db.getDb().exec(`DELETE FROM schema_versions;`); + db.getDb().prepare( + 'INSERT INTO schema_versions (version, applied_at, description) VALUES (3, ?, ?)' + ).run(Date.now(), 'simulated v3'); + + const q = new QueryBuilder(db.getDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + // Force-insert duplicates via raw SQL (bypassing the constraint that + // is now absent). Three rows that should collapse to one. + const stmt = db.getDb().prepare( + 'INSERT INTO edges (source, target, kind, line, col) VALUES (?, ?, ?, ?, ?)' + ); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + // And one with NULL line/col, also duplicated + stmt.run('n1', 'n2', 'references', null, null); + stmt.run('n1', 'n2', 'references', null, null); + + expect(edgesCount(db)).toBe(5); + expect(getCurrentVersion(db.getDb())).toBe(3); + + // Run migrations forward + runMigrations(db.getDb(), 3); + + expect(getCurrentVersion(db.getDb())).toBe(CURRENT_SCHEMA_VERSION); + expect(CURRENT_SCHEMA_VERSION).toBeGreaterThanOrEqual(4); + // 3 calls dups → 1, 2 references dups → 1 + expect(edgesCount(db)).toBe(2); + + // Now the constraint is enforced: another duplicate insert is a no-op. + const q2 = new QueryBuilder(db.getDb()); + q2.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }); + expect(edgesCount(db)).toBe(2); + + db.close(); + }); +}); diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 9ee437da..4e8f204a 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(3); + expect(version?.version).toBe(4); db.close(); }); diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 5fbe17d7..d43dceb2 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(3); + expect(CURRENT_SCHEMA_VERSION).toBe(4); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/src/db/migrations.ts b/src/db/migrations.ts index 0a256dbc..270265e2 100644 --- a/src/db/migrations.ts +++ b/src/db/migrations.ts @@ -9,7 +9,7 @@ import { SqliteDatabase } from './sqlite-adapter'; /** * Current schema version */ -export const CURRENT_SCHEMA_VERSION = 3; +export const CURRENT_SCHEMA_VERSION = 4; /** * Migration definition @@ -54,6 +54,27 @@ const migrations: Migration[] = [ `); }, }, + { + version: 4, + description: 'Dedup edges and enforce UNIQUE(source, target, kind, line, col) so INSERT OR IGNORE actually dedupes', + up: (db) => { + // Without a UNIQUE constraint the existing `INSERT OR IGNORE INTO + // edges` was a no-op for dedup purposes (the only candidate key was + // the AUTOINCREMENT id, which never conflicts). Existing databases + // can therefore contain accumulated duplicates from re-emission / + // re-resolution. Collapse those before adding the constraint, then + // create the UNIQUE index that future inserts will conflict against. + db.exec(` + DELETE FROM edges + WHERE id NOT IN ( + SELECT MIN(id) FROM edges + GROUP BY source, target, kind, COALESCE(line, -1), COALESCE(col, -1) + ); + CREATE UNIQUE INDEX IF NOT EXISTS idx_edges_unique + ON edges(source, target, kind, COALESCE(line, -1), COALESCE(col, -1)); + `); + }, + }, ]; /** diff --git a/src/db/schema.sql b/src/db/schema.sql index dd0a9f06..82c3e9b6 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -129,6 +129,15 @@ CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind); CREATE INDEX IF NOT EXISTS idx_edges_source_kind ON edges(source, kind); CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target, kind); +-- Uniqueness for (source, target, kind, line, col). The id column is an +-- AUTOINCREMENT primary key, so without this index `INSERT OR IGNORE` +-- would never see a conflict — duplicate edges would silently accumulate +-- on every re-resolution / re-emission. COALESCE keeps two NULL line/col +-- values comparable as equal (SQLite treats raw NULLs in a UNIQUE index +-- as distinct). +CREATE UNIQUE INDEX IF NOT EXISTS idx_edges_unique + ON edges(source, target, kind, COALESCE(line, -1), COALESCE(col, -1)); + -- File indexes CREATE INDEX IF NOT EXISTS idx_files_language ON files(language); CREATE INDEX IF NOT EXISTS idx_files_modified_at ON files(modified_at);