diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts
index 2a395f60..1c745016 100644
--- a/src/domain/graph/builder/pipeline.ts
+++ b/src/domain/graph/builder/pipeline.ts
@@ -32,7 +32,13 @@ import type {
   ExtractorOutput,
   SqliteStatement,
 } from '../../../types.js';
-import { getActiveEngine, getInstalledWasmExtensions, parseFilesAuto } from '../../parser.js';
+import {
+  classifyNativeDrops,
+  formatDropExtensionSummary,
+  getActiveEngine,
+  getInstalledWasmExtensions,
+  parseFilesAuto,
+} from '../../parser.js';
 import { setWorkspaces } from '../resolve.js';
 import { PipelineContext } from './context.js';
 import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
@@ -761,18 +767,32 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
   // minimal installs) can't be parsed by either engine, so they're not a
   // native regression — excluding them keeps the warn count meaningful.
   const installedExts = getInstalledWasmExtensions();
+  const missingRel: string[] = [];
   const missingAbs: string[] = [];
   for (const rel of expected) {
     if (existing.has(rel)) continue;
     const ext = path.extname(rel).toLowerCase();
     if (!installedExts.has(ext)) continue;
+    missingRel.push(rel);
     missingAbs.push(path.join(ctx.rootDir, rel));
   }
   if (missingAbs.length === 0) return;
 
-  warn(
-    `Native orchestrator dropped ${missingAbs.length} file(s); backfilling via WASM for engine parity`,
-  );
+  // Classify drops so users see per-extension reasons instead of just a count
+  // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
+  // extractor); `native-extractor-failure` indicates a real native bug since
+  // the language IS supported by the addon yet the file was dropped anyway.
+  const { byReason, totals } = classifyNativeDrops(missingRel);
+  if (totals['unsupported-by-native'] > 0) {
+    info(
+      `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) in languages without a Rust extractor; backfilling via WASM: ${formatDropExtensionSummary(byReason['unsupported-by-native'])}`,
+    );
+  }
+  if (totals['native-extractor-failure'] > 0) {
+    warn(
+      `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM: ${formatDropExtensionSummary(byReason['native-extractor-failure'])}`,
+    );
+  }
 
   const wasmResults = await parseFilesAuto(missingAbs, ctx.rootDir, { engine: 'wasm' });
   const rows: unknown[][] = [];
diff --git a/src/domain/parser.ts b/src/domain/parser.ts
index 67e6ef93..6aa19c3c 100644
--- a/src/domain/parser.ts
+++ b/src/domain/parser.ts
@@ -412,6 +412,128 @@ export function getInstalledWasmExtensions(): Set<string> {
   return exts;
 }
 
+/**
+ * Lowercase file extensions covered by the native Rust addon.
+ *
+ * Mirrors `LanguageKind::from_extension` in
+ * `crates/codegraph-core/src/parser_registry.rs`. Used to classify why the
+ * native orchestrator dropped a file: extensions outside this set are a
+ * legitimate parser limit (no Rust extractor exists), while extensions inside
+ * it indicate a real native bug (parse/read/extract failure).
+ *
+ * Keep this list in sync with the Rust enum — the native addon is a separate
+ * npm package, so JS has no runtime way to discover its language coverage.
+ */
+export const NATIVE_SUPPORTED_EXTENSIONS: ReadonlySet<string> = new Set([
+  '.js',
+  '.jsx',
+  '.mjs',
+  '.cjs',
+  '.ts',
+  '.tsx',
+  '.py',
+  '.pyi',
+  '.tf',
+  '.hcl',
+  '.go',
+  '.rs',
+  '.java',
+  '.cs',
+  '.rb',
+  '.rake',
+  '.gemspec',
+  '.php',
+  '.phtml',
+  '.c',
+  '.h',
+  '.cpp',
+  '.cc',
+  '.cxx',
+  '.hpp',
+  '.kt',
+  '.kts',
+  '.swift',
+  '.scala',
+  '.sh',
+  '.bash',
+  '.ex',
+  '.exs',
+  '.lua',
+  '.dart',
+  '.zig',
+  '.hs',
+  '.ml',
+  '.mli',
+]);
+
+/**
+ * Classification for a file the native orchestrator dropped.
+ * - `unsupported-by-native`: extension has no Rust extractor (legitimate parser limit).
+ * - `native-extractor-failure`: extension is supported by native but the file was
+ *   still dropped — points at a real bug (read error, parse failure, extractor crash).
+ */
+export type NativeDropReason = 'unsupported-by-native' | 'native-extractor-failure';
+
+export interface NativeDropClassification {
+  /** Per-reason → per-extension → list of relative paths that hit that bucket. */
+  byReason: Record<NativeDropReason, Map<string, string[]>>;
+  /** Total file count per reason. */
+  totals: Record<NativeDropReason, number>;
+}
+
+/**
+ * Group the missing files (relative paths) by drop reason and extension so the
+ * caller can log per-extension counts and a sample path. Pure function — no
+ * I/O, safe to unit-test independently of the build pipeline.
+ */
+export function classifyNativeDrops(relPaths: Iterable<string>): NativeDropClassification {
+  const byReason: Record<NativeDropReason, Map<string, string[]>> = {
+    'unsupported-by-native': new Map(),
+    'native-extractor-failure': new Map(),
+  };
+  const totals: Record<NativeDropReason, number> = {
+    'unsupported-by-native': 0,
+    'native-extractor-failure': 0,
+  };
+  for (const rel of relPaths) {
+    const ext = path.extname(rel).toLowerCase();
+    const reason: NativeDropReason = NATIVE_SUPPORTED_EXTENSIONS.has(ext)
+      ? 'native-extractor-failure'
+      : 'unsupported-by-native';
+    const bucket = byReason[reason];
+    let list = bucket.get(ext);
+    if (!list) {
+      list = [];
+      bucket.set(ext, list);
+    }
+    list.push(rel);
+    totals[reason]++;
+  }
+  return { byReason, totals };
+}
+
+/**
+ * Render `{ ext → paths[] }` as `ext (n: sample.ext, ...)` slices for log lines.
+ * Caps at 3 sample paths per extension and 6 extensions total to keep warnings
+ * readable when many languages are dropped at once. Extensions are sorted by
+ * descending file count so the loudest offender shows up first; ties keep
+ * insertion order. Pure function — safe to unit-test independently.
+ */
+export function formatDropExtensionSummary(buckets: Map<string, string[]>): string {
+  const MAX_EXTS = 6;
+  const MAX_SAMPLES = 3;
+  const entries = Array.from(buckets.entries()).sort((a, b) => b[1].length - a[1].length);
+  const shown = entries.slice(0, MAX_EXTS).map(([ext, paths]) => {
+    const sample = paths.slice(0, MAX_SAMPLES).join(', ');
+    const more = paths.length > MAX_SAMPLES ? `, +${paths.length - MAX_SAMPLES} more` : '';
+    return `${ext} (${paths.length}: ${sample}${more})`;
+  });
+  if (entries.length > MAX_EXTS) {
+    shown.push(`+${entries.length - MAX_EXTS} more extension(s)`);
+  }
+  return shown.join('; ');
+}
+
 // ── Unified API ──────────────────────────────────────────────────────────────
 
 function resolveEngine(opts: ParseEngineOpts = {}): ResolvedEngine {
diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts
new file mode 100644
index 00000000..24aee1d5
--- /dev/null
+++ b/tests/parsers/native-drop-classification.test.ts
@@ -0,0 +1,204 @@
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { describe, expect, it } from 'vitest';
+import {
+  classifyNativeDrops,
+  formatDropExtensionSummary,
+  NATIVE_SUPPORTED_EXTENSIONS,
+} from '../../src/domain/parser.js';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const REPO_ROOT = path.resolve(__dirname, '..', '..');
+
+describe('classifyNativeDrops', () => {
+  it('groups WASM-only languages under unsupported-by-native', () => {
+    const { byReason, totals } = classifyNativeDrops([
+      'src/a.fs',
+      'src/b.gleam',
+      'src/c.clj',
+      'src/d.jl',
+      'src/e.R',
+      'src/f.erl',
+      'src/g.sol',
+      'src/h.cu',
+      'src/i.groovy',
+      'src/j.v',
+      'src/k.m',
+    ]);
+    expect(totals['unsupported-by-native']).toBe(11);
+    expect(totals['native-extractor-failure']).toBe(0);
+    expect(byReason['unsupported-by-native'].get('.fs')).toEqual(['src/a.fs']);
+    expect(byReason['unsupported-by-native'].get('.gleam')).toEqual(['src/b.gleam']);
+    expect(byReason['unsupported-by-native'].get('.r')).toEqual(['src/e.R']);
+  });
+
+  it('flags natively-supported extensions as native-extractor-failure', () => {
+    const { byReason, totals } = classifyNativeDrops([
+      'src/a.ts',
+      'src/b.py',
+      'src/c.go',
+      'src/d.rs',
+    ]);
+    expect(totals['native-extractor-failure']).toBe(4);
+    expect(totals['unsupported-by-native']).toBe(0);
+    expect(byReason['native-extractor-failure'].get('.ts')).toEqual(['src/a.ts']);
+    expect(byReason['native-extractor-failure'].get('.py')).toEqual(['src/b.py']);
+  });
+
+  it('handles a mix of supported and unsupported extensions', () => {
+    const { byReason, totals } = classifyNativeDrops([
+      'src/a.ts',
+      'src/b.fs',
+      'src/c.fs',
+      'src/d.gleam',
+    ]);
+    expect(totals['native-extractor-failure']).toBe(1);
+    expect(totals['unsupported-by-native']).toBe(3);
+    expect(byReason['unsupported-by-native'].get('.fs')).toEqual(['src/b.fs', 'src/c.fs']);
+    expect(byReason['unsupported-by-native'].get('.gleam')).toEqual(['src/d.gleam']);
+  });
+
+  it('lowercases extensions so .R and .r share a bucket', () => {
+    const { byReason, totals } = classifyNativeDrops(['scripts/a.R', 'scripts/b.r']);
+    expect(totals['unsupported-by-native']).toBe(2);
+    expect(byReason['unsupported-by-native'].get('.r')).toEqual(['scripts/a.R', 'scripts/b.r']);
+  });
+
+  it('returns empty buckets when no files are passed', () => {
+    const { byReason, totals } = classifyNativeDrops([]);
+    expect(totals['native-extractor-failure']).toBe(0);
+    expect(totals['unsupported-by-native']).toBe(0);
+    expect(byReason['native-extractor-failure'].size).toBe(0);
+    expect(byReason['unsupported-by-native'].size).toBe(0);
+  });
+
+  it('exposes the native-supported extension set for callers', () => {
+    expect(NATIVE_SUPPORTED_EXTENSIONS.has('.ts')).toBe(true);
+    expect(NATIVE_SUPPORTED_EXTENSIONS.has('.py')).toBe(true);
+    expect(NATIVE_SUPPORTED_EXTENSIONS.has('.fs')).toBe(false);
+    expect(NATIVE_SUPPORTED_EXTENSIONS.has('.gleam')).toBe(false);
+  });
+});
+
+describe('formatDropExtensionSummary', () => {
+  it('returns an empty string when no buckets are present', () => {
+    expect(formatDropExtensionSummary(new Map())).toBe('');
+  });
+
+  it('lists every extension when under the cap', () => {
+    const buckets = new Map([
+      ['.ts', ['a.ts', 'b.ts']],
+      ['.py', ['c.py']],
+    ]);
+    expect(formatDropExtensionSummary(buckets)).toBe('.ts (2: a.ts, b.ts); .py (1: c.py)');
+  });
+
+  it('caps samples per extension at 3 and renders +N more', () => {
+    const buckets = new Map([['.ts', ['a.ts', 'b.ts', 'c.ts', 'd.ts', 'e.ts']]]);
+    expect(formatDropExtensionSummary(buckets)).toBe('.ts (5: a.ts, b.ts, c.ts, +2 more)');
+  });
+
+  it('shows exactly MAX_SAMPLES samples without a +N suffix when count equals the cap', () => {
+    const buckets = new Map([['.ts', ['a.ts', 'b.ts', 'c.ts']]]);
+    expect(formatDropExtensionSummary(buckets)).toBe('.ts (3: a.ts, b.ts, c.ts)');
+  });
+
+  it('caps extensions at 6 and renders +N more extension(s)', () => {
+    // 8 extensions, all with 1 file — sorted by count is a stable tie so insertion
+    // order wins, and the first 6 are shown.
+    const buckets = new Map([
+      ['.a', ['1.a']],
+      ['.b', ['1.b']],
+      ['.c', ['1.c']],
+      ['.d', ['1.d']],
+      ['.e', ['1.e']],
+      ['.f', ['1.f']],
+      ['.g', ['1.g']],
+      ['.h', ['1.h']],
+    ]);
+    const out = formatDropExtensionSummary(buckets);
+    expect(out.endsWith('; +2 more extension(s)')).toBe(true);
+    // First 6 extensions are present, the last 2 (.g, .h) are not.
+    expect(out).toContain('.a (1: 1.a)');
+    expect(out).toContain('.f (1: 1.f)');
+    expect(out).not.toContain('.g (');
+    expect(out).not.toContain('.h (');
+  });
+
+  it('sorts by descending file count so the loudest offender is first', () => {
+    const buckets = new Map([
+      ['.small', ['x']],
+      ['.huge', ['a', 'b', 'c', 'd']],
+      ['.medium', ['m', 'n']],
+    ]);
+    const out = formatDropExtensionSummary(buckets);
+    const positions = ['.huge', '.medium', '.small'].map((ext) => out.indexOf(ext));
+    expect(positions[0]).toBeLessThan(positions[1]);
+    expect(positions[1]).toBeLessThan(positions[2]);
+  });
+});
+
+/**
+ * Drift guard for `NATIVE_SUPPORTED_EXTENSIONS`.
+ *
+ * Greptile flagged that this set is keyed to one snapshot of
+ * `LanguageKind::from_extension` in the Rust addon, and silent drift between
+ * the JS and Rust sides would mis-classify drops (real native failures shown
+ * as info, parser-limit gaps shown as warn). The native addon doesn't expose
+ * its own metadata, so we parse the Rust source instead and assert the two
+ * lists agree at build time. If `parser_registry.rs` is ever refactored, this
+ * test fails loudly so the maintainer notices.
+ */
+describe('NATIVE_SUPPORTED_EXTENSIONS drift guard', () => {
+  it('matches the extension set in crates/codegraph-core/src/parser_registry.rs', () => {
+    const registryPath = path.join(
+      REPO_ROOT,
+      'crates',
+      'codegraph-core',
+      'src',
+      'parser_registry.rs',
+    );
+    const src = fs.readFileSync(registryPath, 'utf8');
+    const fromExtStart = src.indexOf('pub fn from_extension');
+    expect(fromExtStart, 'from_extension not found in parser_registry.rs').toBeGreaterThan(-1);
+    // Slice from `pub fn from_extension` to the next `pub fn` (boundary of
+    // the next method) so we don't accidentally pick up extensions from
+    // unrelated functions like `from_lang_id` (which contains lang_id
+    // strings that look extension-like, e.g. "javascript", "python").
+    const tail = src.slice(fromExtStart);
+    const nextFnRel = tail.slice(1).search(/\n\s*\/\/\/|\n\s*pub fn /);
+    const body = nextFnRel === -1 ? tail : tail.slice(0, nextFnRel + 1);
+    const rustExts = new Set<string>();
+    // Match string literals like "ts", "py", "tsx", "d.ts" etc.
+    for (const m of body.matchAll(/"([A-Za-z0-9.]+)"/g)) {
+      rustExts.add(m[1]);
+    }
+    // Normalize Rust forms to the JS `.ext` form. The function mixes:
+    //   - prefix branches with leading dot: ".tsx", ".d.ts"
+    //   - `match ext` arms without dot: "ts", "py", "rb", ...
+    // `.d.ts` is a TypeScript declaration file alias mapped to TypeScript;
+    // JS treats those files via `.ts` so the alias is not in the JS set.
+    const normalized = new Set<string>();
+    for (const e of rustExts) {
+      // `.d.ts` declaration files are mapped to TypeScript via a special
+      // prefix branch — JS handles those via the `.ts` entry, so skip the
+      // alias in either matched form.
+      if (e === 'd.ts' || e === '.d.ts') continue;
+      const withDot = e.startsWith('.') ? e : `.${e}`;
+      normalized.add(withDot.toLowerCase());
+    }
+    const jsExts = new Set(NATIVE_SUPPORTED_EXTENSIONS);
+    const onlyInRust = [...normalized].filter((e) => !jsExts.has(e));
+    const onlyInJs = [...jsExts].filter((e) => !normalized.has(e));
+    expect(
+      onlyInRust,
+      `Extensions in parser_registry.rs but missing from NATIVE_SUPPORTED_EXTENSIONS: ${onlyInRust.join(', ')}`,
+    ).toEqual([]);
+    expect(
+      onlyInJs,
+      `Extensions in NATIVE_SUPPORTED_EXTENSIONS but missing from parser_registry.rs: ${onlyInJs.join(', ')}`,
+    ).toEqual([]);
+  });
+});