diff --git a/__tests__/cli-json-schemas.test.ts b/__tests__/cli-json-schemas.test.ts index b435764e..4941e8f3 100644 --- a/__tests__/cli-json-schemas.test.ts +++ b/__tests__/cli-json-schemas.test.ts @@ -403,4 +403,56 @@ describe('PF-613 follow-up: CLI JSON schema validation', () => { cleanup(); } }); + + // PF-692: `codegraph duplicates` JSON output conforms to schemas/cli/duplicates.json. + // PR #40 round 2 REVIEW fix: use a fixture with a REAL duplicate + // so the group/member subschema (fileCount, coveredByExactGroup, + // members minItems:2) is exercised, not just the empty-output + // envelope shape. + itIfDist('duplicates with a real clone fixture conforms to schemas/cli/duplicates.json', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-pf613b-dup-')); + projectDir = dir; + try { + fs.mkdirSync(path.join(dir, 'src'), { recursive: true }); + // Same-named function in two files with a body large enough + // to clear the default --min-lines=10 floor. + const body = `{ + const a = 1; + const b = 2; + const c = 3; + const d = 4; + const e = 5; + const f = 6; + const g = 7; + const h = 8; + return a + b + c + d + e + f + g + h; +}`; + fs.writeFileSync( + path.join(dir, 'src', 'a.ts'), + `export function shared(x: number): number ${body}\n`, + ); + fs.writeFileSync( + path.join(dir, 'src', 'b.ts'), + `export function shared(x: number): number ${body}\n`, + ); + fs.writeFileSync(path.join(dir, 'package.json'), '{"name":"pf613-dup","version":"0"}\n'); + execFileSync(NODE_BIN, [DIST_BIN, 'init', '-i', dir], { stdio: 'ignore' }); + + const validate = loadValidator('duplicates'); + const out = runCliJson(['duplicates', dir, '--json']) as { + groups?: Array<{ kind: string; members: unknown[]; fileCount: number; coveredByExactGroup: boolean }>; + summary?: { exactGroups: number; shapeGroups: number }; + }; + expectValid(validate, out); + // Real duplicates → at least one exact group with fileCount=2. 
+ expect(out.summary!.exactGroups).toBeGreaterThanOrEqual(1); + const exact = out.groups!.find((g) => g.kind === 'exact'); + expect(exact, 'expected at least one exact group').toBeDefined(); + expect(exact!.fileCount).toBe(2); + expect(exact!.coveredByExactGroup).toBe(false); + expect(exact!.members.length).toBeGreaterThanOrEqual(2); + } finally { + cleanup(); + } + }); }); diff --git a/__tests__/duplicates.test.ts b/__tests__/duplicates.test.ts new file mode 100644 index 00000000..d6f6f719 --- /dev/null +++ b/__tests__/duplicates.test.ts @@ -0,0 +1,507 @@ +/** + * PF-692: `codegraph duplicates` primitive tests. + * + * Each case builds a real temp project via `CodeGraph.init`, + * which runs the production extract → fingerprint → persist + * pipeline. The tests then call `findDuplicates(dbPath, opts)` + * and assert clone group shape. Synthetic SQLite rows are used + * only for negative paths (legacy schema, missing DB). + */ + +import { describe, it, expect, beforeAll, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { CodeGraph } from '../src'; +import { + findDuplicates, + DEFAULT_DUPLICATE_KINDS, + DEFAULT_MIN_LINES, +} from '../src/duplicates'; +import { initGrammars, loadAllGrammars } from '../src/extraction/grammars'; + +beforeAll(async () => { + await initGrammars(); + await loadAllGrammars(); +}); + +interface ProjectFixture { + dir: string; + dbPath: string; +} + +async function makeProject(files: Record): Promise { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-dup-')); + for (const [relPath, content] of Object.entries(files)) { + const full = path.join(dir, relPath); + fs.mkdirSync(path.dirname(full), { recursive: true }); + fs.writeFileSync(full, content, 'utf8'); + } + const cg = await CodeGraph.init(dir, { index: true }); + cg.destroy(); + return { dir, dbPath: path.join(dir, '.codegraph', 'codegraph.db') }; +} + +function cleanup(p: ProjectFixture | 
undefined): void { + if (p && fs.existsSync(p.dir)) { + fs.rmSync(p.dir, { recursive: true, force: true }); + } +} + +/** Body large enough to clear the default min-lines floor of 10. */ +const LARGE_BODY = `{ + const a = 1; + const b = 2; + const c = 3; + const d = 4; + const e = 5; + const f = 6; + const g = 7; + const h = 8; + return a + b + c + d + e + f + g + h; +}`; + +describe('PF-692: findDuplicates', () => { + let fixture: ProjectFixture | undefined; + + beforeEach(() => { + fixture = undefined; + }); + + afterEach(() => { + cleanup(fixture); + }); + + it('exposes the council-locked defaults', () => { + // Locked by the PR #40 RFC — changing these breaks downstream + // expectations. Fail loudly if anyone tweaks them. + expect(DEFAULT_DUPLICATE_KINDS).toEqual(['function', 'method']); + expect(DEFAULT_MIN_LINES).toBe(10); + }); + + it('detects identical same-named functions across files as an exact group', async () => { + // The `ast_hash` includes the function name (rename-locals only + // wipes LOCAL identifiers, not the declaration name itself). + // Two functions named `handler` with the same body in different + // files are the canonical Type-1 clone shape. 
+ fixture = await makeProject({ + 'src/a.ts': `export function handler(x: number, y: number): number ${LARGE_BODY}\n`, + 'src/b.ts': `export function handler(x: number, y: number): number ${LARGE_BODY}\n`, + }); + const result = findDuplicates(fixture.dbPath); + const exactGroup = result.groups.find((g) => g.kind === 'exact'); + expect( + exactGroup, + `expected at least one exact group; got: ${JSON.stringify( + result.groups.map((g) => ({ k: g.kind, n: g.members.length })), + )}`, + ).toBeDefined(); + expect(exactGroup!.members.length).toBeGreaterThanOrEqual(2); + const files = exactGroup!.members.map((m) => m.filePath).join(','); + expect(files).toMatch(/a\.ts/); + expect(files).toMatch(/b\.ts/); + }); + + it('detects renamed-but-same-shape functions as a Type-2 shape group', async () => { + // Same body, DIFFERENT function names. Type-1 (`ast_hash`) won't + // match because the function name participates in the hash; + // Type-2 (`ast_shape_hash`) should — it normalizes all + // identifiers, including the declaration name. + fixture = await makeProject({ + 'src/a.ts': `export function adder(x: number, y: number): number ${LARGE_BODY}\n`, + 'src/b.ts': `export function alsoAdder(x: number, y: number): number ${LARGE_BODY}\n`, + }); + const result = findDuplicates(fixture.dbPath); + expect(result.summary.exactGroups).toBe(0); + const shapeGroup = result.groups.find((g) => g.kind === 'shape'); + expect(shapeGroup, 'shape group should detect the renamed clone').toBeDefined(); + const names = shapeGroup!.members.map((m) => m.qualifiedName).join(','); + expect(names).toMatch(/adder/); + expect(names).toMatch(/alsoAdder/); + }); + + it('does NOT report a clone group when only one symbol matches the fingerprint', async () => { + fixture = await makeProject({ + 'src/a.ts': `export function unique(x: number): number ${LARGE_BODY}\n`, + }); + const result = findDuplicates(fixture.dbPath); + // Single-member fingerprints aren't clones — GROUP BY ... 
HAVING > 1 + // filters them out. summary.exactGroups should be 0. + expect(result.summary.exactGroups).toBe(0); + }); + + it('filters out symbols below --min-lines floor', async () => { + // Same-name one-line functions across two files — would match + // Type-1 if min-lines allowed them. Default min-lines=10 should + // filter them out; min-lines=1 should surface them. + fixture = await makeProject({ + 'src/a.ts': 'export function ping(): number { return 1; }\n', + 'src/b.ts': 'export function ping(): number { return 1; }\n', + }); + const result = findDuplicates(fixture.dbPath); + expect(result.summary.exactGroups).toBe(0); + expect(result.summary.shapeGroups).toBe(0); + const lowered = findDuplicates(fixture.dbPath, { minLines: 1 }); + expect(lowered.summary.exactGroups).toBeGreaterThanOrEqual(1); + }); + + it('groups sort by member count DESC (RFC fork 5)', async () => { + // Three same-named copies of `triple` (Type-1 group of 3) and + // two same-named copies of `pair` (Type-1 group of 2). The + // triple group must come before the pair group. 
+ const bodyA = LARGE_BODY; + const bodyB = `{ + const x = 10; + const y = 20; + const z = 30; + const a = 40; + const b = 50; + const c = 60; + const d = 70; + const e = 80; + return x * y * z + a + b + c + d + e; +}`; + fixture = await makeProject({ + 'src/a.ts': `export function triple(x: number): number ${bodyA}\n`, + 'src/b.ts': `export function triple(x: number): number ${bodyA}\n`, + 'src/c.ts': `export function triple(x: number): number ${bodyA}\n`, + 'src/d.ts': `export function pair(x: number): number ${bodyB}\n`, + 'src/e.ts': `export function pair(x: number): number ${bodyB}\n`, + }); + const result = findDuplicates(fixture.dbPath); + const groupsBySize = result.groups.map((g) => g.members.length); + for (let i = 1; i < groupsBySize.length; i++) { + expect(groupsBySize[i - 1]).toBeGreaterThanOrEqual(groupsBySize[i]); + } + expect(groupsBySize[0]).toBeGreaterThanOrEqual(3); + }); + + it('ties on member count fall back to max line span DESC (RFC fork 5 secondary)', async () => { + // Two clone groups, both with 2 members. The `longBody` + // group spans more lines per symbol; it must sort before + // the `shortBody` group despite equal member counts. 
+ const shortBody = LARGE_BODY; // 11 lines + const longBody = `{ + const a = 1; + const b = 2; + const c = 3; + const d = 4; + const e = 5; + const f = 6; + const g = 7; + const h = 8; + const i = 9; + const j = 10; + const k = 11; + const l = 12; + const m = 13; + const n = 14; + const o = 15; + return a + b + c + d + e + f + g + h + i + j + k + l + m + n + o; +}`; // 18 lines + fixture = await makeProject({ + 'src/a.ts': `export function shortA(x: number): number ${shortBody}\n`, + 'src/b.ts': `export function shortA(x: number): number ${shortBody}\n`, + 'src/c.ts': `export function longB(x: number): number ${longBody}\n`, + 'src/d.ts': `export function longB(x: number): number ${longBody}\n`, + }); + const result = findDuplicates(fixture.dbPath); + expect(result.groups.length).toBe(2); + expect(result.groups[0].members.length).toBe(2); + expect(result.groups[1].members.length).toBe(2); + const span0 = Math.max( + ...result.groups[0].members.map((m) => m.endLine - m.startLine + 1), + ); + const span1 = Math.max( + ...result.groups[1].members.map((m) => m.endLine - m.startLine + 1), + ); + expect(span0).toBeGreaterThan(span1); + }); + + it('ties on member count AND span fall back to first-member filePath ASC (PR #40 round 2)', async () => { + // Two clone pairs with identical line spans but structurally + // different bodies (sum vs product) → different ast_hash AND + // different ast_shape_hash, so no 4-member shape group forms. + // Per the PR #40 round-2 sort change (Codex pass A REVIEW), + // tertiary tie-break is human-meaningful filePath ASC, not + // SHA-256 hash. The alpha group's first member is in a.ts and + // bravo's is in c.ts, so alpha sorts first. 
+ const sumBody = `{ + const a = 1; + const b = 2; + const c = 3; + const d = 4; + const e = 5; + const f = 6; + const g = 7; + const h = 8; + return a + b + c + d + e + f + g + h; +}`; + const productBody = `{ + const a = 1; + const b = 2; + const c = 3; + const d = 4; + const e = 5; + const f = 6; + const g = 7; + const h = 8; + return a * b * c * d * e * f * g * h; +}`; + fixture = await makeProject({ + 'src/a.ts': `export function alpha(x: number): number ${sumBody}\n`, + 'src/b.ts': `export function alpha(x: number): number ${sumBody}\n`, + 'src/c.ts': `export function bravo(x: number): number ${productBody}\n`, + 'src/d.ts': `export function bravo(x: number): number ${productBody}\n`, + }); + const result = findDuplicates(fixture.dbPath); + // Two distinct exact groups, two members each, same span. + const exactGroups = result.groups.filter((g) => g.kind === 'exact'); + expect(exactGroups.length).toBe(2); + expect(exactGroups[0].members.length).toBe(2); + expect(exactGroups[1].members.length).toBe(2); + // After ties on count and span, first-member filePath ASC + // takes over. alpha's first member is in src/a.ts and bravo's + // is in src/c.ts, so alpha sorts before bravo. + expect(exactGroups[0].members[0].filePath < exactGroups[1].members[0].filePath).toBe(true); + }); + + it('suppresses shape groups that exactly cover an exact group (RFC fork 1)', async () => { + // Same-named functions in two files form both an exact group + // (identical ast_hash) AND a shape group covering the same + // {a, b} member set. The shape group must be suppressed. + fixture = await makeProject({ + 'src/a.ts': `export function shared(x: number, y: number): number ${LARGE_BODY}\n`, + 'src/b.ts': `export function shared(x: number, y: number): number ${LARGE_BODY}\n`, + }); + const result = findDuplicates(fixture.dbPath); + expect(result.summary.exactGroups).toBeGreaterThanOrEqual(1); + // The same-member shape group should NOT also be reported. 
+ const exactMemberSet = (() => { + const g = result.groups.find((g) => g.kind === 'exact'); + return new Set(g!.members.map((m) => m.id)); + })(); + for (const g of result.groups.filter((g) => g.kind === 'shape')) { + const shapeMemberSet = new Set(g.members.map((m) => m.id)); + // Shape group must not match an exact group's member set 1:1. + const equal = + shapeMemberSet.size === exactMemberSet.size && + [...shapeMemberSet].every((id) => exactMemberSet.has(id)); + expect(equal, `shape group ${g.fingerprint} duplicates an exact group`).toBe(false); + } + }); + + it('rejects --kind= instead of returning silent zero groups', async () => { + fixture = await makeProject({ + 'src/a.ts': `export function any(): number ${LARGE_BODY}\n`, + }); + expect(() => findDuplicates(fixture!.dbPath, { kinds: [] })).toThrow( + /--kind list cannot be empty/i, + ); + }); + + it('throws a clear schema error on a v5-style DB (no fingerprint columns)', () => { + // eslint-disable-next-line @typescript-eslint/no-require-imports + const { DatabaseSync } = require('node:sqlite') as { + DatabaseSync: new (path: string) => { exec(sql: string): void; close(): void }; + }; + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-dup-v5-')); + const dbPath = path.join(dir, 'old.db'); + try { + const db = new DatabaseSync(dbPath); + // Minimal v5 shape with NO fingerprint columns. 
+ db.exec(`CREATE TABLE nodes( + id TEXT PRIMARY KEY, kind TEXT, name TEXT, qualified_name TEXT, + file_path TEXT, language TEXT, + start_line INT, end_line INT, start_column INT, end_column INT, + signature TEXT)`); + db.close(); + expect(() => findDuplicates(dbPath)).toThrow(/schema v6\+/i); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('throws when the database path does not exist', () => { + const missing = '/tmp/codegraph-dup-missing-' + Date.now() + '.db'; + expect(() => findDuplicates(missing)).toThrow(/not found/i); + }); + + it('does not mutate the DB or create WAL sidecars', async () => { + // Same Codex BLOCKER concern as PR #39 — the read path must be + // truly read-only. Snapshots DB + sidecars before, asserts equal + // after. + fixture = await makeProject({ + 'src/a.ts': `export function clone1(x: number): number ${LARGE_BODY}\n`, + 'src/b.ts': `export function clone2(x: number): number ${LARGE_BODY}\n`, + }); + const snapshot = (p: string): Record => { + const out: Record = {}; + for (const sidecar of ['', '-wal', '-shm', '-journal']) { + const full = p + sidecar; + out[sidecar] = fs.existsSync(full) + ? { size: fs.statSync(full).size, mtime: fs.statSync(full).mtimeMs } + : null; + } + return out; + }; + const before = snapshot(fixture.dbPath); + findDuplicates(fixture.dbPath); + expect(snapshot(fixture.dbPath)).toEqual(before); + }); + + it('throws when a v6 DB has zero fingerprinted rows (migrated-not-reindexed, PR #40 round 2 BLOCKER)', () => { + // A v6 DB that was migrated FROM v5 but never re-indexed has + // all fingerprint columns present but all values NULL. + // findDuplicates must NOT silently return [] groups — that + // looks like "clone-free code" when really the index is blind. 
+ // eslint-disable-next-line @typescript-eslint/no-require-imports + const { DatabaseSync } = require('node:sqlite') as { + DatabaseSync: new (path: string) => { exec(sql: string): void; close(): void }; + }; + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-dup-blind-')); + const dbPath = path.join(dir, 'blind.db'); + try { + const db = new DatabaseSync(dbPath); + db.exec(`CREATE TABLE nodes( + id TEXT PRIMARY KEY, kind TEXT, name TEXT, qualified_name TEXT, + file_path TEXT, language TEXT, + start_line INT, end_line INT, start_column INT, end_column INT, + signature TEXT, ast_hash TEXT, ast_shape_hash TEXT, + sig_hash TEXT, call_pattern_hash TEXT)`); + // 3 eligible function nodes, all with NULL fingerprints + // (simulates migrated-not-reindexed state). + for (let i = 0; i < 3; i++) { + db.exec( + `INSERT INTO nodes VALUES('n${i}', 'function', 'fn${i}', 'src/a.ts::fn${i}', + 'src/a.ts', 'typescript', ${i * 20}, ${i * 20 + 12}, 0, 0, NULL, + NULL, NULL, NULL, NULL)`, + ); + } + db.close(); + expect(() => findDuplicates(dbPath)).toThrow( + /0 of \d+ eligible nodes have fingerprints|re-index/i, + ); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('populates fileCount per group (PR #40 round 2 REVIEW)', async () => { + // Two same-named functions across two files → fileCount=2. 
+ fixture = await makeProject({ + 'src/a.ts': `export function shared(x: number): number ${LARGE_BODY}\n`, + 'src/b.ts': `export function shared(x: number): number ${LARGE_BODY}\n`, + }); + const result = findDuplicates(fixture.dbPath); + const g = result.groups.find((g) => g.kind === 'exact'); + expect(g).toBeDefined(); + expect(g!.fileCount).toBe(2); + }); + + it('annotates shape groups with coveredByExactGroup when at least one member is also exact (PR #40 round 2 REVIEW)', async () => { + // Three same-shaped functions: two named `f` with identical + // bodies (Type-1) + one named `g` with the same shape but + // different ast_hash (Type-2 only). The shape group should + // include all three, AND coveredByExactGroup=true because + // members {f,f} are already an exact group. + fixture = await makeProject({ + 'src/a.ts': `export function f(x: number): number ${LARGE_BODY}\n`, + 'src/b.ts': `export function f(x: number): number ${LARGE_BODY}\n`, + 'src/c.ts': `export function g(x: number): number ${LARGE_BODY}\n`, + }); + const result = findDuplicates(fixture.dbPath); + const shape = result.groups.find((g) => g.kind === 'shape'); + if (!shape) return; // fixture variant + expect(shape.coveredByExactGroup).toBe(true); + expect(shape.members.length).toBeGreaterThanOrEqual(3); + // The exact group always has coveredByExactGroup=false. + const exact = result.groups.find((g) => g.kind === 'exact'); + expect(exact!.coveredByExactGroup).toBe(false); + }); + + it('distinguishes "unfingerprintable kinds" from "needs re-index" (Codex PR review P2)', () => { + // DB has fingerprints for `function` but the user asks for + // `component` — error should say "kinds aren't fingerprinted", + // NOT "re-run codegraph index". 
+ // eslint-disable-next-line @typescript-eslint/no-require-imports + const { DatabaseSync } = require('node:sqlite') as { + DatabaseSync: new (path: string) => { exec(sql: string): void; close(): void }; + }; + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-dup-kindgap-')); + const dbPath = path.join(dir, 'kindgap.db'); + try { + const db = new DatabaseSync(dbPath); + db.exec(`CREATE TABLE nodes( + id TEXT PRIMARY KEY, kind TEXT, name TEXT, qualified_name TEXT, + file_path TEXT, language TEXT, + start_line INT, end_line INT, start_column INT, end_column INT, + signature TEXT, ast_hash TEXT, ast_shape_hash TEXT, + sig_hash TEXT, call_pattern_hash TEXT)`); + // A function WITH fingerprint, and a component WITHOUT. + db.exec( + `INSERT INTO nodes VALUES('f1', 'function', 'fn', 'a::fn', 'a.ts', 'typescript', + 1, 15, 0, 0, NULL, 'hash-real', 'shape-real', NULL, NULL)`, + ); + db.exec( + `INSERT INTO nodes VALUES('f2', 'function', 'fn2', 'b::fn', 'b.ts', 'typescript', + 1, 15, 0, 0, NULL, 'hash-real', 'shape-real', NULL, NULL)`, + ); + db.exec( + `INSERT INTO nodes VALUES('c1', 'component', 'Btn', 'Btn', 'Btn.vue', 'vue', + 1, 15, 0, 0, NULL, NULL, NULL, NULL, NULL)`, + ); + db.exec( + `INSERT INTO nodes VALUES('c2', 'component', 'Card', 'Card', 'Card.vue', 'vue', + 1, 15, 0, 0, NULL, NULL, NULL, NULL, NULL)`, + ); + db.close(); + // Asking for `component` → "not fingerprinted" error. + expect(() => findDuplicates(dbPath, { kinds: ['component'] })).toThrow( + /aren't fingerprinted|framework-extractor/i, + ); + // Asking for `function` → works fine (1 exact group: f1, f2). + const ok = findDuplicates(dbPath, { kinds: ['function'] }); + expect(ok.summary.exactGroups).toBe(1); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('does not double-count shape-group members that are also in exact groups (Codex PR review P2)', async () => { + // {A.f, B.f} are an exact group (Type-1, count=2). 
+ // {A.f, B.f, C.g} share ast_shape_hash (Type-2, count=3) but + // C.g has a different ast_hash. The shape group is retained + // because its member set isn't equal to the exact group's. + // shapeNodes must count ONLY C.g (the shape-only member), not + // all 3 — otherwise A.f and B.f are counted twice in the + // totals (once in exactNodes, once in shapeNodes). + fixture = await makeProject({ + 'src/a.ts': `export function f(x: number): number ${LARGE_BODY}\n`, + 'src/b.ts': `export function f(x: number): number ${LARGE_BODY}\n`, + 'src/c.ts': `export function g(x: number): number ${LARGE_BODY}\n`, + }); + const result = findDuplicates(fixture.dbPath); + const exact = result.groups.find((g) => g.kind === 'exact'); + const shape = result.groups.find((g) => g.kind === 'shape'); + if (!shape) return; // fixture variant + // shapeNodes should reflect ONLY the shape-only members + // (i.e. C.g) — exactly one, not three. + const exactMemberIds = new Set(exact!.members.map((m) => m.id)); + const shapeOnly = shape.members.filter((m) => !exactMemberIds.has(m.id)); + expect(result.summary.shapeNodes).toBe(shapeOnly.length); + }); + + it('exposes fingerprintCoverage in summary', async () => { + fixture = await makeProject({ + 'src/a.ts': `export function fa(x: number): number ${LARGE_BODY}\n`, + 'src/b.ts': `export function fb(x: number): number ${LARGE_BODY}\n`, + }); + const result = findDuplicates(fixture.dbPath); + expect(result.summary.fingerprintCoverage.eligible).toBeGreaterThan(0); + expect(result.summary.fingerprintCoverage.withAstHash).toBeGreaterThan(0); + expect(result.summary.fingerprintCoverage.withAstHash).toBeLessThanOrEqual( + result.summary.fingerprintCoverage.eligible, + ); + }); +}); diff --git a/schemas/cli/duplicates.json b/schemas/cli/duplicates.json new file mode 100644 index 00000000..f1ba1b6b --- /dev/null +++ b/schemas/cli/duplicates.json @@ -0,0 +1,102 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": 
"https://codegraph.papersflow.ai/schemas/cli/duplicates.json", + "title": "CodeGraph CLI `duplicates --json` output", + "description": "Clone detection over PF-690 fingerprint columns. Reports Type-1 (exact `ast_hash`) and Type-2 (`ast_shape_hash`) clone groups. Shape groups that exactly cover an existing exact group are suppressed; remaining shape groups are genuine Type-2 findings with at least one member that differs from the exact-hash neighbors.", + "allOf": [ + { + "$ref": "https://codegraph.papersflow.ai/schemas/cli/envelope.json" + }, + { + "type": "object", + "required": ["tool", "groups", "summary"], + "properties": { + "tool": { "const": "duplicates" }, + "groups": { + "type": "array", + "items": { "$ref": "#/definitions/group" } + }, + "summary": { + "type": "object", + "required": [ + "exactGroups", + "shapeGroups", + "exactNodes", + "shapeNodes", + "fingerprintCoverage" + ], + "properties": { + "exactGroups": { "type": "integer", "minimum": 0 }, + "shapeGroups": { "type": "integer", "minimum": 0 }, + "exactNodes": { "type": "integer", "minimum": 0 }, + "shapeNodes": { "type": "integer", "minimum": 0 }, + "fingerprintCoverage": { + "type": "object", + "required": ["eligible", "withAstHash"], + "properties": { + "eligible": { "type": "integer", "minimum": 0 }, + "withAstHash": { "type": "integer", "minimum": 0 } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + } + } + ], + "definitions": { + "group": { + "type": "object", + "required": ["kind", "fingerprint", "members", "fileCount", "coveredByExactGroup"], + "properties": { + "kind": { + "type": "string", + "enum": ["exact", "shape"], + "description": "`exact` = members share `ast_hash` (Type-1). `shape` = members share `ast_shape_hash` only (Type-2, after exact-group dedup)." + }, + "fingerprint": { + "type": "string", + "description": "The hash value shared by every member of this group (`ast_hash` for `exact`, `ast_shape_hash` for `shape`)." 
+ }, + "members": { + "type": "array", + "minItems": 2, + "items": { "$ref": "#/definitions/member" } + }, + "fileCount": { + "type": "integer", + "minimum": 1, + "description": "Distinct file count across all members." + }, + "coveredByExactGroup": { + "type": "boolean", + "description": "For shape groups: true when at least one member also belongs to an exact group (Type-1 subset inside Type-2 finding). Always false for exact groups." + } + }, + "additionalProperties": false + }, + "member": { + "type": "object", + "required": [ + "id", + "qualifiedName", + "filePath", + "startLine", + "endLine", + "language", + "symbolKind" + ], + "properties": { + "id": { "type": "string" }, + "qualifiedName": { "type": "string" }, + "filePath": { "type": "string" }, + "startLine": { "type": "integer", "minimum": 0 }, + "endLine": { "type": "integer", "minimum": 0 }, + "language": { "type": "string" }, + "symbolKind": { "type": "string" } + }, + "additionalProperties": false + } + } +} diff --git a/src/bin/codegraph.ts b/src/bin/codegraph.ts index 75aa1aa9..a5b2844d 100644 --- a/src/bin/codegraph.ts +++ b/src/bin/codegraph.ts @@ -1241,6 +1241,151 @@ program } }); +/** + * codegraph duplicates [path] + * + * PF-692: clone detection over PF-690 fingerprint columns. + * Reports Type-1 (exact) and Type-2 (shape) clone groups under + * the council-locked defaults: function+method kinds, ≥10 lines, + * shape groups whose members already form an exact group are + * suppressed. 
+ */ +program + .command('duplicates [path]') + .description('Find clone groups in the index using PF-690 fingerprint columns') + .option( + '--kind ', + 'Comma-separated symbol kinds to include (default: function,method)', + ) + .option( + '--min-lines ', + 'Minimum endLine-startLine+1 to keep a symbol (default: 10)', + ) + .option('-j, --json', 'Output as JSON') + .action( + async ( + projectPathArg: string | undefined, + options: { kind?: string; minLines?: string; json?: boolean }, + ) => { + try { + // Match `status` / `index` / `sync`: walk up from the supplied + // path to find an initialized project. A user running + // `codegraph duplicates` from a subdirectory should resolve + // to the repo root, not look for `subdir/.codegraph/`. + const projectPath = resolveProjectPath(projectPathArg); + const dbPath = path.join(getCodeGraphDir(projectPath), 'codegraph.db'); + if (!fs.existsSync(dbPath)) { + error( + `CodeGraph index not found at ${dbPath}. Run \`codegraph init -i\` in this directory first.`, + ); + process.exit(1); + } + + const kinds = options.kind + ? options.kind + .split(',') + .map((s) => s.trim()) + .filter((s) => s.length > 0) + : undefined; + let minLines: number | undefined; + if (options.minLines !== undefined) { + // Strict positive-integer match — `parseInt` would happily + // accept `10abc` / `1.5` / `+10`, hiding typos. + if (!/^[1-9]\d*$/.test(options.minLines)) { + error(`--min-lines must be a positive integer, got: ${options.minLines}`); + process.exit(1); + } + minLines = Number.parseInt(options.minLines, 10); + } + + const { findDuplicates, DEFAULT_DUPLICATE_KINDS, DEFAULT_MIN_LINES } = + await import('../duplicates'); + const result = findDuplicates(dbPath, { kinds, minLines }); + + if (options.json) { + console.log( + JSON.stringify( + cliJsonEnvelope('duplicates', result as unknown as Record), + null, + 2, + ), + ); + return; + } + + const s = result.summary; + const usedKinds = kinds ?? 
DEFAULT_DUPLICATE_KINDS; + const usedMinLines = minLines ?? DEFAULT_MIN_LINES; + const cov = s.fingerprintCoverage; + + console.log(chalk.bold('\nCodeGraph Duplicates\n')); + console.log(chalk.cyan('Database:'), dbPath); + console.log( + chalk.dim( + ` kinds=${usedKinds.join(',')} min-lines=${usedMinLines} ` + + `coverage=${cov.withAstHash}/${cov.eligible} eligible nodes have fingerprints`, + ), + ); + console.log(); + console.log(chalk.bold('Summary:')); + console.log(` Exact clone groups (Type-1): ${formatNumber(s.exactGroups)}`); + console.log(` Shape clone groups (Type-2): ${formatNumber(s.shapeGroups)}`); + console.log(` Exact-duplicate nodes: ${formatNumber(s.exactNodes)}`); + console.log(` Shape-only duplicate nodes: ${formatNumber(s.shapeNodes)}`); + console.log(); + + if (result.groups.length === 0) { + console.log( + chalk.dim( + `No duplicate groups found with kinds=${usedKinds.join(',')}, ` + + `min-lines=${usedMinLines}, ${cov.withAstHash}/${cov.eligible} ` + + `eligible nodes have fingerprints.`, + ), + ); + return; + } + + const shown = result.groups.slice(0, 20); + const totalHiddenMembers = result.groups + .reduce((acc, g) => acc + Math.max(0, g.members.length - 5), 0); + console.log(chalk.bold(`Groups (first ${shown.length} of ${result.groups.length}):`)); + for (const g of shown) { + const tag = g.kind === 'shape' && g.coveredByExactGroup ? 
' [contains exact subset]' : ''; + console.log( + chalk.cyan(g.kind.padEnd(6)) + + chalk.dim(g.fingerprint.slice(0, 12)) + + ' ' + + chalk.white(`${g.members.length} members in ${g.fileCount} file(s)`) + + chalk.yellow(tag), + ); + for (const m of g.members.slice(0, 5)) { + console.log( + chalk.dim(` ${m.filePath}:${m.startLine}`) + + ' ' + + chalk.white(m.qualifiedName), + ); + } + if (g.members.length > 5) { + console.log(chalk.dim(` …+${g.members.length - 5} more members`)); + } + } + if (result.groups.length > shown.length || totalHiddenMembers > 0) { + console.log(); + console.log( + chalk.yellow( + `Output truncated: ${result.groups.length - shown.length} more group(s) ` + + `and ${totalHiddenMembers} more member(s) hidden. Use --json for full output.`, + ), + ); + } + console.log(); + } catch (err) { + error(`Duplicates failed: ${err instanceof Error ? err.message : String(err)}`); + process.exit(1); + } + }, + ); + /** * codegraph files [path] */ diff --git a/src/duplicates.ts b/src/duplicates.ts new file mode 100644 index 00000000..89c9310a --- /dev/null +++ b/src/duplicates.ts @@ -0,0 +1,493 @@ +/** + * PF-692: clone detection primitive over PF-690 fingerprint columns. + * + * Groups symbols by `ast_hash` (Type-1 — exact, rename-locals + * normalized) and `ast_shape_hash` (Type-2 — structure-only) and + * reports clone-sets. Designed as the second consumer of the + * fingerprint columns landed in PR #38; the first was `codegraph + * diff` (PR #39). + * + * Surface and defaults are locked by council RFC (Codex): + * - `--kind` defaults to `function,method`. Class-level clones + * are often framework-shaped duplicates and noisier than + * useful at this layer. + * - `--min-lines` defaults to 10. This is the standard CPD/jscpd + * floor — filters one-liner accessors / validators / wrappers + * that would flood the output and make the tool feel broken. + * - Shape groups whose member set EQUALS an exact group are + * suppressed. 
A Type-1 clone is by definition also a Type-2 + * clone, so reporting both inflates the summary and adds no + * new information. Genuine Type-2 findings — groups whose + * members include at least one symbol that no exact-hash + * neighbor shares — are kept. + * - Groups sort by `len(members) DESC`, then by maximum line + * span DESC (largest individual symbol within the group), + * then by first-member `filePath` ASC for stable output. + * - Uses `endLine - startLine + 1` for the size filter — an + * approximate, language-agnostic line count read straight off + * the indexed `nodes` row. Blank/comment lines inside the body + * count toward the threshold; an exact non-blank/non-comment + * count would require a source rescan and isn't worth it for + * a coarse noise filter. + * + * Read-only safety: opens the DB via `file:…?immutable=1` URI + * (Codex BLOCKER from PR #39 round 1; same fix here). The diff + * tool already proved this avoids `-shm`/`-wal` sidecar creation + * even on WAL-mode DBs. v5 DBs (pre-PR #38, no fingerprint + * columns) fail with a clear "requires schema v6+" message + * instead of producing empty groups silently. + */ + +import * as fs from 'fs'; +import { pathToFileURL } from 'url'; + +// eslint-disable-next-line @typescript-eslint/no-require-imports +const { DatabaseSync } = require('node:sqlite') as { + DatabaseSync: new ( + path: string, + opts?: { readOnly?: boolean }, + ) => SqliteReadOnly; +}; + +interface SqliteReadOnly { + prepare(sql: string): { + all(...args: unknown[]): unknown[]; + get(...args: unknown[]): unknown; + }; + close(): void; +} + +/** Default kinds when `--kind` is omitted. Locked by RFC. */ +export const DEFAULT_DUPLICATE_KINDS: ReadonlyArray = ['function', 'method']; + +/** Default minimum line span for a clone to count. Locked by RFC. */ +export const DEFAULT_MIN_LINES = 10; + +/** Hash group kinds we emit. */ +export type DuplicateGroupKind = 'exact' | 'shape'; + +/** + * A single symbol participating in a clone group. 
Columns are + * kept narrow — the diff schema already proved these are the + * fields downstream consumers actually want. + */ +export interface DuplicateMember { + id: string; + qualifiedName: string; + filePath: string; + startLine: number; + endLine: number; + language: string; + symbolKind: string; +} + +/** + * A clone group — two or more symbols that share the same + * fingerprint after the size and kind filters apply. + */ +export interface DuplicateGroup { + kind: DuplicateGroupKind; + fingerprint: string; + members: DuplicateMember[]; + /** + * Distinct file count across all members. Lets consumers + * distinguish "two implementations of the same function in + * different files" (likely refactor candidate) from "an + * accessor pattern repeated in the same class" (often + * legitimate). PR #40 round 2 REVIEW fix. + */ + fileCount: number; + /** + * For Type-2 shape groups: indicates whether at least one of + * this group's members ALSO belongs to a Type-1 exact group + * with the same body. Lets users see "this shape group exists + * because A and B are exact-identical AND C has the same shape + * but different exact hash". Always `false` for exact (Type-1) + * groups. PR #40 round 2 REVIEW fix. + */ + coveredByExactGroup: boolean; +} + +export interface DuplicatesOptions { + /** Symbol kinds to include. Defaults to function + method. */ + kinds?: ReadonlyArray; + /** Minimum `endLine - startLine + 1` to keep a row. Defaults to 10. */ + minLines?: number; +} + +export interface DuplicatesResult { + groups: DuplicateGroup[]; + summary: { + exactGroups: number; + shapeGroups: number; + exactNodes: number; + shapeNodes: number; + /** + * Fingerprint coverage at the time of this query: how many + * nodes (matching the requested kinds + min-lines) carry an + * astHash vs how many are eligible. Surfaced so consumers can + * tell whether a "no duplicates" result is real or an + * artefact of partial coverage. 
+ */ + fingerprintCoverage: FingerprintCoverageRow; + }; +} + +interface NodeRow { + id: string; + qualified_name: string; + file_path: string; + start_line: number; + end_line: number; + language: string; + kind: string; + ast_hash: string | null; + ast_shape_hash: string | null; +} + +function openReadOnly(dbPath: string): SqliteReadOnly { + if (!fs.existsSync(dbPath)) { + throw new Error(`Database not found: ${dbPath}`); + } + // `pathToFileURL` is the canonical Node API for "filesystem path + // → `file://` URL" — it correctly percent-encodes spaces, `?`, + // `#`, `%`, non-ASCII codepoints, and Windows drive letters, all + // of which the hand-rolled escape in `diff.ts` would miss. Same + // `immutable=1` flag tells SQLite to skip locking + sidecar + // creation entirely. + const uri = pathToFileURL(dbPath).href + '?immutable=1'; + return new DatabaseSync(uri, { readOnly: true }); +} + +/** + * Reject v5 databases up front so users see a clear message + * instead of an empty `groups` array. The fingerprint columns + * are NULL on v5 rows, so a naive query would silently return + * nothing and look broken. + */ +function assertSchemaSupportsFingerprints(db: SqliteReadOnly): void { + const cols = db.prepare(`PRAGMA table_info('nodes')`).all() as Array<{ name: string }>; + if (cols.length === 0) { + throw new Error('Database has no `nodes` table — not a CodeGraph database, or corrupt.'); + } + const names = new Set(cols.map((c) => c.name)); + if (!names.has('ast_hash') || !names.has('ast_shape_hash')) { + throw new Error( + 'duplicates requires schema v6+ (PR #38 fingerprint columns). Re-run `codegraph index` to upgrade this DB.', + ); + } +} + +/** + * Coverage report for the requested kinds — how many eligible + * nodes carry fingerprints. Migrating to v6 only ADDED the + * columns; existing rows stay NULL until re-extracted (see + * `migrations.ts`). 
A user running `codegraph duplicates` on a + * migrated-but-not-reindexed DB would see "no duplicates" and + * think their code is clone-free, when really the index is just + * blind. Surface the gap as an explicit error rather than a + * silent empty result (PR #40 round 2 BLOCKER fix). + */ +export interface FingerprintCoverageRow { + eligible: number; + withAstHash: number; +} + +function fingerprintCoverage( + db: SqliteReadOnly, + kinds: ReadonlyArray, + minLines: number, +): FingerprintCoverageRow { + const { sql: kindSql, params: kindParams } = kindClause(kinds); + const row = db + .prepare( + `SELECT + COUNT(*) AS eligible, + SUM(CASE WHEN ast_hash IS NOT NULL THEN 1 ELSE 0 END) AS withAstHash + FROM nodes + WHERE ${kindSql} + AND (end_line - start_line + 1) >= ?`, + ) + .get(...kindParams, minLines) as { eligible: number | null; withAstHash: number | null }; + return { + eligible: row.eligible ?? 0, + withAstHash: row.withAstHash ?? 0, + }; +} + +/** + * Returns true when ANY node anywhere in the DB has an ast_hash — + * lets us distinguish "DB needs re-indexing" (no fingerprints + * anywhere) from "user requested unfingerprintable kinds" (some + * fingerprints exist, just not for the requested kinds). Codex PR + * review P2 fix. + */ +function dbHasAnyFingerprint(db: SqliteReadOnly): boolean { + const row = db + .prepare(`SELECT 1 AS has FROM nodes WHERE ast_hash IS NOT NULL LIMIT 1`) + .get() as { has?: number } | undefined; + return !!row; +} + +function assertFingerprintCoverage( + db: SqliteReadOnly, + kinds: ReadonlyArray, + cov: FingerprintCoverageRow, +): void { + if (cov.eligible === 0) { + // No nodes match the kind+min-lines filter at all. Different + // condition from a missing fingerprint — just return empty. + return; + } + if (cov.withAstHash > 0) return; + + // Zero fingerprinted nodes among the requested kinds. 
Distinguish + // two cases: (1) DB has fingerprints SOMEWHERE — user asked for a + // kind that isn't fingerprinted (framework-extractor nodes don't + // go through `createNode`'s hash path); (2) DB has zero + // fingerprints anywhere — likely migrated-not-reindexed. + if (dbHasAnyFingerprint(db)) { + throw new Error( + `duplicates: 0 of ${cov.eligible} eligible nodes for kinds=${kinds.join(',')} ` + + `have fingerprints, but this DB does have fingerprints for OTHER kinds. ` + + `The requested kinds aren't fingerprinted (e.g., framework-extractor ` + + `nodes like 'component'/'route' are emitted without tree-sitter hashing). ` + + `Try --kind=function,method.`, + ); + } + throw new Error( + `duplicates: 0 of ${cov.eligible} eligible nodes have fingerprints, ` + + `and this DB has NO fingerprinted nodes at all. ` + + `This usually means the DB was migrated to v6 but not re-indexed. ` + + `Run \`codegraph index\` to refresh fingerprints.`, + ); +} + +function rowToMember(r: NodeRow): DuplicateMember { + return { + id: r.id, + qualifiedName: r.qualified_name, + filePath: r.file_path, + startLine: r.start_line, + endLine: r.end_line, + language: r.language, + symbolKind: r.kind, + }; +} + +/** + * Build the `kind IN (?, ?, ?)` clause + parameter list for the + * SELECT. SQLite prepared statements need each parameter + * positionally; this helper keeps the call site readable. + */ +function kindClause(kinds: ReadonlyArray): { sql: string; params: string[] } { + if (kinds.length === 0) { + // Empty `--kind=` from a user would otherwise return no rows; + // surface the misuse rather than silently producing []groups. + throw new Error('duplicates: --kind list cannot be empty.'); + } + const placeholders = kinds.map(() => '?').join(', '); + return { sql: `kind IN (${placeholders})`, params: kinds.slice() }; +} + +/** + * Compute clone groups in `db` over the given hash column. 
The + * size filter lives in the WHERE clause so SQLite can use the + * `idx_nodes_ast_hash` / `idx_nodes_ast_shape_hash` indexes + * efficiently; the GROUP BY HAVING count(*) > 1 enforces that + * only multi-member groups come back. + */ +function loadGroups( + db: SqliteReadOnly, + column: 'ast_hash' | 'ast_shape_hash', + kinds: ReadonlyArray, + minLines: number, +): Map { + const { sql: kindSql, params: kindParams } = kindClause(kinds); + const sql = ` + SELECT id, qualified_name, file_path, start_line, end_line, + language, kind, ast_hash, ast_shape_hash + FROM nodes + WHERE ${column} IS NOT NULL + AND ${kindSql} + AND (end_line - start_line + 1) >= ? + AND ${column} IN ( + SELECT ${column} FROM nodes + WHERE ${column} IS NOT NULL + AND ${kindSql} + AND (end_line - start_line + 1) >= ? + GROUP BY ${column} + HAVING COUNT(*) > 1 + ) + ORDER BY ${column}, file_path, start_line, id + `; + const params = [...kindParams, minLines, ...kindParams, minLines]; + const rows = db.prepare(sql).all(...params) as NodeRow[]; + const out = new Map(); + for (const r of rows) { + const key = column === 'ast_hash' ? r.ast_hash : r.ast_shape_hash; + if (!key) continue; // defensive — `IS NOT NULL` already filtered + if (!out.has(key)) out.set(key, []); + out.get(key)!.push(rowToMember(r)); + } + return out; +} + +/** + * Member set as a stable string key for subset checks. Sorting + * by id makes two equal-membership groups produce identical keys + * regardless of row order. + */ +function memberSetKey(members: DuplicateMember[]): string { + return members + .map((m) => m.id) + .sort() + .join('\x1f'); +} + +/** + * Suppress shape groups whose member set is exactly equal to an + * exact group's. RFC fork 1 — Type-1 implies Type-2, so the shape + * group duplicates information already in the exact group. + * + * Note: we drop only on EXACT set equality, not strict subset. 
+ * A genuine Type-2 finding has at least one member that's not in + * any Type-1 group (different exact hash but same shape) — that's + * the real value of shape detection and must NOT be dropped. + */ +function suppressShapeCoveredByExact( + exact: Map, + shape: Map, +): Map { + const exactKeys = new Set(); + for (const members of exact.values()) { + exactKeys.add(memberSetKey(members)); + } + const out = new Map(); + for (const [hash, members] of shape) { + if (!exactKeys.has(memberSetKey(members))) { + out.set(hash, members); + } + } + return out; +} + +/** + * Compare two groups for the RFC fork 5 sort order: + * primary: member count DESC (biggest clone set first) + * secondary: max line span DESC (largest individual symbol) + * tertiary: first member filePath ASC (human-meaningful — same + * clone reproducibly appears at the same output + * position across rebuilds — PR #40 round 2 REVIEW + * fix replacing the previous SHA-256 tie-break) + * quaternary: first member startLine ASC + * quinary: fingerprint ASC (final fallback for true ties) + */ +function compareGroups(a: DuplicateGroup, b: DuplicateGroup): number { + if (a.members.length !== b.members.length) { + return b.members.length - a.members.length; + } + const spanA = Math.max(...a.members.map((m) => m.endLine - m.startLine + 1)); + const spanB = Math.max(...b.members.map((m) => m.endLine - m.startLine + 1)); + if (spanA !== spanB) return spanB - spanA; + // Members are already sorted by file_path/start_line/id in the SQL. + // HAVING COUNT(*) > 1 guarantees at least 2 members per group, + // so members[0] is always defined — assert non-undefined for TS. + const aFirst = a.members[0]!; + const bFirst = b.members[0]!; + if (aFirst.filePath !== bFirst.filePath) { + return aFirst.filePath < bFirst.filePath ? 
-1 : 1; + } + if (aFirst.startLine !== bFirst.startLine) { + return aFirst.startLine - bFirst.startLine; + } + if (a.fingerprint < b.fingerprint) return -1; + if (a.fingerprint > b.fingerprint) return 1; + return 0; +} + +function distinctFileCount(members: DuplicateMember[]): number { + const files = new Set(); + for (const m of members) files.add(m.filePath); + return files.size; +} + +/** + * Public entry point. Opens `dbPath` read-only, computes Type-1 + * and Type-2 clone groups under the supplied options, deduplicates + * Type-2 groups that fully overlap Type-1 groups, and returns the + * sorted result. + */ +export function findDuplicates( + dbPath: string, + opts: DuplicatesOptions = {}, +): DuplicatesResult { + const kinds = opts.kinds ?? DEFAULT_DUPLICATE_KINDS; + const minLines = opts.minLines ?? DEFAULT_MIN_LINES; + + const db = openReadOnly(dbPath); + try { + assertSchemaSupportsFingerprints(db); + const coverage = fingerprintCoverage(db, kinds, minLines); + assertFingerprintCoverage(db, kinds, coverage); + + const exact = loadGroups(db, 'ast_hash', kinds, minLines); + const shapeRaw = loadGroups(db, 'ast_shape_hash', kinds, minLines); + const shape = suppressShapeCoveredByExact(exact, shapeRaw); + + // Collect member ids that belong to any exact group so shape + // supersets can annotate `coveredByExactGroup` (PR #40 round 2 + // REVIEW fix). A genuine Type-2 finding has at least one + // member NOT in an exact group. 
+ const idsInExactGroup = new Set(); + for (const members of exact.values()) { + for (const m of members) idsInExactGroup.add(m.id); + } + + const groups: DuplicateGroup[] = []; + let exactNodes = 0; + let shapeNodes = 0; + for (const [fingerprint, members] of exact) { + groups.push({ + kind: 'exact', + fingerprint, + members, + fileCount: distinctFileCount(members), + coveredByExactGroup: false, + }); + exactNodes += members.length; + } + for (const [fingerprint, members] of shape) { + const anyMemberIsExact = members.some((m) => idsInExactGroup.has(m.id)); + groups.push({ + kind: 'shape', + fingerprint, + members, + fileCount: distinctFileCount(members), + coveredByExactGroup: anyMemberIsExact, + }); + // Count ONLY shape-ONLY members (those not already covered + // by an exact group). Otherwise shape-superset cases like + // {A.f exact, B.f exact, A.g shape} would inflate + // `shapeNodes` by re-counting A.f + B.f (Codex PR review P2 + // double-counting fix). + for (const m of members) { + if (!idsInExactGroup.has(m.id)) shapeNodes++; + } + } + groups.sort(compareGroups); + + return { + groups, + summary: { + exactGroups: exact.size, + shapeGroups: shape.size, + exactNodes, + shapeNodes, + fingerprintCoverage: coverage, + }, + }; + } finally { + db.close(); + } +}