Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions src/domain/graph/builder/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,13 @@ import type {
ExtractorOutput,
SqliteStatement,
} from '../../../types.js';
import { getActiveEngine, getInstalledWasmExtensions, parseFilesAuto } from '../../parser.js';
import {
classifyNativeDrops,
formatDropExtensionSummary,
getActiveEngine,
getInstalledWasmExtensions,
parseFilesAuto,
} from '../../parser.js';
import { setWorkspaces } from '../resolve.js';
import { PipelineContext } from './context.js';
import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
Expand Down Expand Up @@ -761,18 +767,32 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
// minimal installs) can't be parsed by either engine, so they're not a
// native regression — excluding them keeps the warn count meaningful.
const installedExts = getInstalledWasmExtensions();
const missingRel: string[] = [];
const missingAbs: string[] = [];
for (const rel of expected) {
if (existing.has(rel)) continue;
const ext = path.extname(rel).toLowerCase();
if (!installedExts.has(ext)) continue;
missingRel.push(rel);
missingAbs.push(path.join(ctx.rootDir, rel));
}
if (missingAbs.length === 0) return;

warn(
`Native orchestrator dropped ${missingAbs.length} file(s); backfilling via WASM for engine parity`,
);
// Classify drops so users see per-extension reasons instead of just a count
// (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
// extractor); `native-extractor-failure` indicates a real native bug since
// the language IS supported by the addon yet the file was dropped anyway.
const { byReason, totals } = classifyNativeDrops(missingRel);
if (totals['unsupported-by-native'] > 0) {
info(
`Native orchestrator skipped ${totals['unsupported-by-native']} file(s) in languages without a Rust extractor; backfilling via WASM: ${formatDropExtensionSummary(byReason['unsupported-by-native'])}`,
);
}
if (totals['native-extractor-failure'] > 0) {
warn(
`Native orchestrator dropped ${totals['native-extractor-failure']} file(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM: ${formatDropExtensionSummary(byReason['native-extractor-failure'])}`,
);
}
const wasmResults = await parseFilesAuto(missingAbs, ctx.rootDir, { engine: 'wasm' });

const rows: unknown[][] = [];
Expand Down
122 changes: 122 additions & 0 deletions src/domain/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,128 @@ export function getInstalledWasmExtensions(): Set<string> {
return exts;
}

/**
 * File extensions (lowercase, with the leading dot) that the native Rust
 * addon covers.
 *
 * Hand-maintained mirror of `LanguageKind::from_extension` in
 * `crates/codegraph-core/src/parser_registry.rs`. Drop classification is
 * driven by membership in this set:
 * - extension NOT in the set → a legitimate parser limit (there is no Rust
 *   extractor for it);
 * - extension in the set → a real native bug (parse/read/extract failure).
 *
 * The native addon is a separate npm package, so JS cannot discover its
 * language coverage at runtime — whenever the Rust enum changes, this list
 * has to be updated to match.
 */
export const NATIVE_SUPPORTED_EXTENSIONS: ReadonlySet<string> = new Set<string>([
  // Related extensions share a row; insertion order is left to right, top to bottom.
  '.js', '.jsx', '.mjs', '.cjs',
  '.ts', '.tsx',
  '.py', '.pyi',
  '.tf', '.hcl',
  '.go',
  '.rs',
  '.java',
  '.cs',
  '.rb', '.rake', '.gemspec',
  '.php', '.phtml',
  '.c', '.h',
  '.cpp', '.cc', '.cxx', '.hpp',
  '.kt', '.kts',
  '.swift',
  '.scala',
  '.sh', '.bash',
  '.ex', '.exs',
  '.lua',
  '.dart',
  '.zig',
  '.hs',
  '.ml', '.mli',
]);
Comment on lines +415 to +467
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Silent mis-classification risk on native addon version drift

NATIVE_SUPPORTED_EXTENSIONS is keyed to one specific snapshot of LanguageKind::from_extension. If the Rust addon gains a new language (or drops one) between addon releases without a matching JS update, drops will be silently mis-classified: a real native failure shows up as unsupported-by-native (info, quiet) instead of native-extractor-failure (warn, loud). The inverse case — removed support — would spam false native-extractor-failure warnings. There's no runtime assertion that the two lists agree, so the drift won't be caught until a user notices wrong log levels. Consider adding a CI step or a startup assertion that cross-checks the set against the native addon's own exported metadata if the addon exposes it; if it doesn't, at minimum add an integration test that verifies the current addon version is the one this set was generated from.

Fix in Claude Code

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in cbbc9ae — added a drift guard test that parses crates/codegraph-core/src/parser_registry.rs and asserts that NATIVE_SUPPORTED_EXTENSIONS agrees with the Rust LanguageKind::from_extension arms. The native addon doesn't expose its own metadata, so a source-level cross-check in CI is the cheapest way to catch drift before users see mis-classified log levels. If parser_registry.rs adds or removes an extension, the test fails loudly with a list of mismatches.


/**
 * Why the native orchestrator dropped a file.
 *
 * - `unsupported-by-native` — the extension has no Rust extractor; this is a
 *   legitimate parser limit.
 * - `native-extractor-failure` — the extension is supported by native, yet
 *   the file was dropped anyway, which points at a real bug (read error,
 *   parse failure, extractor crash).
 */
export type NativeDropReason =
  | 'unsupported-by-native'
  | 'native-extractor-failure';

/**
 * Dropped files grouped by drop reason, as produced by `classifyNativeDrops`.
 */
export interface NativeDropClassification {
  /** Number of dropped files counted under each reason. */
  totals: Record<NativeDropReason, number>;
  /**
   * For each reason, a map from file extension to the relative paths that
   * landed in that bucket.
   */
  byReason: Record<NativeDropReason, Map<string, string[]>>;
}

/**
 * Bucket dropped files by drop reason and then by lowercased extension, so a
 * caller can log per-extension counts together with sample paths.
 *
 * A path whose extension is in `NATIVE_SUPPORTED_EXTENSIONS` is counted as
 * `native-extractor-failure`; every other path is counted as
 * `unsupported-by-native`. Paths keep their input order inside each bucket.
 * Pure function — no I/O, so it can be unit-tested apart from the build
 * pipeline.
 *
 * @param relPaths - Relative paths of the missing (dropped) files.
 * @returns Per-reason extension buckets plus the file count for each reason.
 */
export function classifyNativeDrops(relPaths: Iterable<string>): NativeDropClassification {
  const classification: NativeDropClassification = {
    byReason: {
      'unsupported-by-native': new Map<string, string[]>(),
      'native-extractor-failure': new Map<string, string[]>(),
    },
    totals: {
      'unsupported-by-native': 0,
      'native-extractor-failure': 0,
    },
  };

  for (const relPath of relPaths) {
    const extension = path.extname(relPath).toLowerCase();

    let reason: NativeDropReason = 'unsupported-by-native';
    if (NATIVE_SUPPORTED_EXTENSIONS.has(extension)) {
      reason = 'native-extractor-failure';
    }

    // Start a new bucket on the first sighting of an extension, otherwise
    // append to the existing one.
    const perExtension = classification.byReason[reason];
    const known = perExtension.get(extension);
    if (known) {
      known.push(relPath);
    } else {
      perExtension.set(extension, [relPath]);
    }
    classification.totals[reason] += 1;
  }

  return classification;
}

/**
 * Turn an `ext → paths[]` map into a one-line summary for log messages, with
 * one `ext (count: sample, sample, ...)` slice per extension joined by `; `.
 *
 * To keep warnings readable when many languages are dropped at once, at most
 * 6 extensions are rendered and at most 3 sample paths per extension; whatever
 * is cut off is reported as `+N more` / `+N more extension(s)`. Extensions are
 * ordered by descending file count so the loudest offender comes first, and
 * ties keep the map's insertion order. Pure function — safe to unit-test on
 * its own.
 *
 * @param buckets - Paths grouped by file extension.
 * @returns The rendered summary; an empty string for an empty map.
 */
export function formatDropExtensionSummary(buckets: Map<string, string[]>): string {
  const extensionCap = 6;
  const sampleCap = 3;

  // Sort a copy of the entries so the caller's map is left untouched.
  const ranked = [...buckets].sort(([, left], [, right]) => right.length - left.length);

  const slices: string[] = [];
  for (const [ext, paths] of ranked.slice(0, extensionCap)) {
    const pieces = paths.slice(0, sampleCap);
    const hidden = paths.length - sampleCap;
    if (hidden > 0) {
      pieces.push(`+${hidden} more`);
    }
    slices.push(`${ext} (${paths.length}: ${pieces.join(', ')})`);
  }

  const omitted = ranked.length - extensionCap;
  if (omitted > 0) {
    slices.push(`+${omitted} more extension(s)`);
  }
  return slices.join('; ');
}

// ── Unified API ──────────────────────────────────────────────────────────────

function resolveEngine(opts: ParseEngineOpts = {}): ResolvedEngine {
Expand Down
204 changes: 204 additions & 0 deletions tests/parsers/native-drop-classification.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { describe, expect, it } from 'vitest';
import {
classifyNativeDrops,
formatDropExtensionSummary,
NATIVE_SUPPORTED_EXTENSIONS,
} from '../../src/domain/parser.js';

// Derive this module's file path and directory from `import.meta.url`, then
// resolve the repository root relative to that directory.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Two levels above this test file's directory; the drift guard below joins
// REPO_ROOT with the path to parser_registry.rs.
const REPO_ROOT = path.resolve(__dirname, '..', '..');

describe('classifyNativeDrops', () => {
  it('groups WASM-only languages under unsupported-by-native', () => {
    // None of these extensions appear in NATIVE_SUPPORTED_EXTENSIONS.
    const dropped = [
      'src/a.fs',
      'src/b.gleam',
      'src/c.clj',
      'src/d.jl',
      'src/e.R',
      'src/f.erl',
      'src/g.sol',
      'src/h.cu',
      'src/i.groovy',
      'src/j.v',
      'src/k.m',
    ];
    const result = classifyNativeDrops(dropped);
    const unsupported = result.byReason['unsupported-by-native'];

    expect(result.totals['unsupported-by-native']).toBe(11);
    expect(result.totals['native-extractor-failure']).toBe(0);
    expect(unsupported.get('.fs')).toEqual(['src/a.fs']);
    expect(unsupported.get('.gleam')).toEqual(['src/b.gleam']);
    // The bucket key is lowercased even though the path keeps its `.R` suffix.
    expect(unsupported.get('.r')).toEqual(['src/e.R']);
  });

  it('flags natively-supported extensions as native-extractor-failure', () => {
    const dropped = ['src/a.ts', 'src/b.py', 'src/c.go', 'src/d.rs'];
    const result = classifyNativeDrops(dropped);
    const failures = result.byReason['native-extractor-failure'];

    expect(result.totals['native-extractor-failure']).toBe(4);
    expect(result.totals['unsupported-by-native']).toBe(0);
    expect(failures.get('.ts')).toEqual(['src/a.ts']);
    expect(failures.get('.py')).toEqual(['src/b.py']);
  });

  it('handles a mix of supported and unsupported extensions', () => {
    const dropped = ['src/a.ts', 'src/b.fs', 'src/c.fs', 'src/d.gleam'];
    const result = classifyNativeDrops(dropped);
    const unsupported = result.byReason['unsupported-by-native'];

    expect(result.totals['native-extractor-failure']).toBe(1);
    expect(result.totals['unsupported-by-native']).toBe(3);
    expect(unsupported.get('.fs')).toEqual(['src/b.fs', 'src/c.fs']);
    expect(unsupported.get('.gleam')).toEqual(['src/d.gleam']);
  });

  it('lowercases extensions so .R and .r share a bucket', () => {
    const result = classifyNativeDrops(['scripts/a.R', 'scripts/b.r']);

    expect(result.totals['unsupported-by-native']).toBe(2);
    expect(result.byReason['unsupported-by-native'].get('.r')).toEqual([
      'scripts/a.R',
      'scripts/b.r',
    ]);
  });

  it('returns empty buckets when no files are passed', () => {
    const result = classifyNativeDrops([]);

    expect(result.totals['native-extractor-failure']).toBe(0);
    expect(result.totals['unsupported-by-native']).toBe(0);
    expect(result.byReason['native-extractor-failure'].size).toBe(0);
    expect(result.byReason['unsupported-by-native'].size).toBe(0);
  });

  it('exposes the native-supported extension set for callers', () => {
    for (const supported of ['.ts', '.py']) {
      expect(NATIVE_SUPPORTED_EXTENSIONS.has(supported)).toBe(true);
    }
    for (const unsupported of ['.fs', '.gleam']) {
      expect(NATIVE_SUPPORTED_EXTENSIONS.has(unsupported)).toBe(false);
    }
  });
});
Comment on lines +4 to +83
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 formatDropExtensionSummary cap logic is not unit-tested

The non-trivial formatting function (MAX_EXTS = 6, MAX_SAMPLES = 3, +N more extension(s) display) lives in pipeline.ts but has no dedicated unit tests. The current test suite only covers classifyNativeDrops. A regression in the cap logic (e.g., off-by-one in the +N more calculation, or the sort being reversed) would produce silently truncated log lines without any test failure. Adding a small test or extracting and testing formatDropExtensionSummary directly would cover that path.

Fix in Claude Code

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in cbbc9ae — moved formatDropExtensionSummary from pipeline.ts to parser.ts (next to classifyNativeDrops) and exported it. Added 6 unit tests covering: empty buckets, under-cap rendering, the +N more per-extension sample cap, the exact-cap boundary, the +N more extension(s) suffix when the extension cap is hit, and descending-count ordering so a future regression in the sort or off-by-one in the +N math fails loudly.


describe('formatDropExtensionSummary', () => {
  it('returns an empty string when no buckets are present', () => {
    const rendered = formatDropExtensionSummary(new Map());
    expect(rendered).toBe('');
  });

  it('lists every extension when under the cap', () => {
    const input = new Map<string, string[]>();
    input.set('.ts', ['a.ts', 'b.ts']);
    input.set('.py', ['c.py']);

    expect(formatDropExtensionSummary(input)).toBe('.ts (2: a.ts, b.ts); .py (1: c.py)');
  });

  it('caps samples per extension at 3 and renders +N more', () => {
    const input = new Map<string, string[]>();
    input.set('.ts', ['a.ts', 'b.ts', 'c.ts', 'd.ts', 'e.ts']);

    expect(formatDropExtensionSummary(input)).toBe('.ts (5: a.ts, b.ts, c.ts, +2 more)');
  });

  it('shows exactly MAX_SAMPLES samples without a +N suffix when count equals the cap', () => {
    const input = new Map<string, string[]>();
    input.set('.ts', ['a.ts', 'b.ts', 'c.ts']);

    expect(formatDropExtensionSummary(input)).toBe('.ts (3: a.ts, b.ts, c.ts)');
  });

  it('caps extensions at 6 and renders +N more extension(s)', () => {
    // Eight single-file extensions: every count ties, so the stable sort keeps
    // insertion order and only the first six are rendered.
    const entries: Array<[string, string[]]> = [
      ['.a', ['1.a']],
      ['.b', ['1.b']],
      ['.c', ['1.c']],
      ['.d', ['1.d']],
      ['.e', ['1.e']],
      ['.f', ['1.f']],
      ['.g', ['1.g']],
      ['.h', ['1.h']],
    ];
    const rendered = formatDropExtensionSummary(new Map<string, string[]>(entries));

    expect(rendered.endsWith('; +2 more extension(s)')).toBe(true);
    // The first and sixth extensions are rendered; the seventh and eighth
    // (.g, .h) are folded into the suffix.
    expect(rendered).toContain('.a (1: 1.a)');
    expect(rendered).toContain('.f (1: 1.f)');
    expect(rendered).not.toContain('.g (');
    expect(rendered).not.toContain('.h (');
  });

  it('sorts by descending file count so the loudest offender is first', () => {
    const input = new Map<string, string[]>();
    input.set('.small', ['x']);
    input.set('.huge', ['a', 'b', 'c', 'd']);
    input.set('.medium', ['m', 'n']);

    const rendered = formatDropExtensionSummary(input);
    const hugeAt = rendered.indexOf('.huge');
    const mediumAt = rendered.indexOf('.medium');
    const smallAt = rendered.indexOf('.small');

    expect(hugeAt).toBeLessThan(mediumAt);
    expect(mediumAt).toBeLessThan(smallAt);
  });
});

/**
 * Drift guard for `NATIVE_SUPPORTED_EXTENSIONS`.
 *
 * Greptile flagged that this set is keyed to one snapshot of
 * `LanguageKind::from_extension` in the Rust addon, and silent drift between
 * the JS and Rust sides would mis-classify drops (real native failures shown
 * as info, parser-limit gaps shown as warn). The native addon doesn't expose
 * its own metadata, so this test reads the Rust source file instead, collects
 * the string literals inside `from_extension`, and asserts that the resulting
 * extension set equals the JS set. If `parser_registry.rs` is ever refactored,
 * this test fails loudly so the maintainer notices.
 */
describe('NATIVE_SUPPORTED_EXTENSIONS drift guard', () => {
it('matches the extension set in crates/codegraph-core/src/parser_registry.rs', () => {
const registryPath = path.join(
REPO_ROOT,
'crates',
'codegraph-core',
'src',
'parser_registry.rs',
);
const src = fs.readFileSync(registryPath, 'utf8');
const fromExtStart = src.indexOf('pub fn from_extension');
expect(fromExtStart, 'from_extension not found in parser_registry.rs').toBeGreaterThan(-1);
// Slice from `pub fn from_extension` up to the next line that starts a `///`
// doc comment or a `pub fn`, whichever comes first, so we don't accidentally
// pick up extensions from unrelated functions like `from_lang_id` (which
// contains lang_id strings that look extension-like, e.g. "javascript",
// "python"). If no such line is found, the rest of the file is used.
const tail = src.slice(fromExtStart);
// The search runs on `tail.slice(1)`, so its result is one less than the
// matching index in `tail`; the `+ 1` below converts it back.
const nextFnRel = tail.slice(1).search(/\n\s*\/\/\/|\n\s*pub fn /);
const body = nextFnRel === -1 ? tail : tail.slice(0, nextFnRel + 1);
const rustExts = new Set<string>();
// Collect every double-quoted literal made only of letters, digits and dots,
// e.g. "ts", "py", ".tsx", ".d.ts".
for (const m of body.matchAll(/"([A-Za-z0-9.]+)"/g)) {
rustExts.add(m[1]);
}
// Normalize Rust forms to the JS `.ext` form. The function mixes:
// - prefix branches with leading dot: ".tsx", ".d.ts"
// - `match ext` arms without dot: "ts", "py", "rb", ...
// `.d.ts` is a TypeScript declaration file alias mapped to TypeScript;
// JS treats those files via `.ts` so the alias is not in the JS set.
const normalized = new Set<string>();
for (const e of rustExts) {
// Skip the `.d.ts` alias in either matched form (with or without the
// leading dot); everything else gets a leading dot and is lowercased.
if (e === 'd.ts' || e === '.d.ts') continue;
const withDot = e.startsWith('.') ? e : `.${e}`;
normalized.add(withDot.toLowerCase());
}
// Compare in both directions so the failure message names exactly which
// extensions are missing on which side.
const jsExts = new Set(NATIVE_SUPPORTED_EXTENSIONS);
const onlyInRust = [...normalized].filter((e) => !jsExts.has(e));
const onlyInJs = [...jsExts].filter((e) => !normalized.has(e));
expect(
onlyInRust,
`Extensions in parser_registry.rs but missing from NATIVE_SUPPORTED_EXTENSIONS: ${onlyInRust.join(', ')}`,
).toEqual([]);
expect(
onlyInJs,
`Extensions in NATIVE_SUPPORTED_EXTENSIONS but missing from parser_registry.rs: ${onlyInJs.join(', ')}`,
).toEqual([]);
});
});
Loading