optave · carlos-alm · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026 · greptile-apps
diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts
@@ -32,7 +32,13 @@ import type {
   ExtractorOutput,
   SqliteStatement,
 } from '../../../types.js';
-import { getActiveEngine, getInstalledWasmExtensions, parseFilesAuto } from '../../parser.js';
+import {
+  classifyNativeDrops,
+  formatDropExtensionSummary,
+  getActiveEngine,
+  getInstalledWasmExtensions,
+  parseFilesAuto,
+} from '../../parser.js';
 import { setWorkspaces } from '../resolve.js';
 import { PipelineContext } from './context.js';
 import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
@@ -761,18 +767,32 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
   // minimal installs) can't be parsed by either engine, so they're not a
   // native regression — excluding them keeps the warn count meaningful.
   const installedExts = getInstalledWasmExtensions();
+  const missingRel: string[] = [];
   const missingAbs: string[] = [];
   for (const rel of expected) {
     if (existing.has(rel)) continue;
     const ext = path.extname(rel).toLowerCase();
     if (!installedExts.has(ext)) continue;
+    missingRel.push(rel);
     missingAbs.push(path.join(ctx.rootDir, rel));
   }
   if (missingAbs.length === 0) return;
 
-  warn(
-    `Native orchestrator dropped ${missingAbs.length} file(s); backfilling via WASM for engine parity`,
-  );
+  // Classify drops so users see per-extension reasons instead of just a count
+  // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
+  // extractor); `native-extractor-failure` indicates a real native bug since
+  // the language IS supported by the addon yet the file was dropped anyway.
+  const { byReason, totals } = classifyNativeDrops(missingRel);
+  if (totals['unsupported-by-native'] > 0) {
+    info(
+      `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) in languages without a Rust extractor; backfilling via WASM: ${formatDropExtensionSummary(byReason['unsupported-by-native'])}`,
+    );
+  }
+  if (totals['native-extractor-failure'] > 0) {
+    warn(
+      `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM: ${formatDropExtensionSummary(byReason['native-extractor-failure'])}`,
+    );
+  }
   const wasmResults = await parseFilesAuto(missingAbs, ctx.rootDir, { engine: 'wasm' });
 
   const rows: unknown[][] = [];

diff --git a/src/domain/parser.ts b/src/domain/parser.ts
@@ -412,6 +412,128 @@ export function getInstalledWasmExtensions(): Set<string> {
   return exts;
 }
 
+/**
+ * Lowercase file extensions covered by the native Rust addon.
+ *
+ * Mirrors `LanguageKind::from_extension` in
+ * `crates/codegraph-core/src/parser_registry.rs`. Used to classify why the
+ * native orchestrator dropped a file: extensions outside this set are a
+ * legitimate parser limit (no Rust extractor exists), while extensions inside
+ * it indicate a real native bug (parse/read/extract failure).
+ *
+ * Every entry is lowercase and carries its leading dot, so look-ups must use
+ * the same form — `classifyNativeDrops` passes
+ * `path.extname(rel).toLowerCase()`.
+ *
+ * Keep this list in sync with the Rust enum — the native addon is a separate
+ * npm package, so JS has no runtime way to discover its language coverage.
+ * `tests/parsers/native-drop-classification.test.ts` contains a drift guard
+ * that compares this set against the Rust source.
+ */
+export const NATIVE_SUPPORTED_EXTENSIONS: ReadonlySet<string> = new Set([
+  '.js',
+  '.jsx',
+  '.mjs',
+  '.cjs',
+  '.ts',
+  '.tsx',
+  '.py',
+  '.pyi',
+  '.tf',
+  '.hcl',
+  '.go',
+  '.rs',
+  '.java',
+  '.cs',
+  '.rb',
+  '.rake',
+  '.gemspec',
+  '.php',
+  '.phtml',
+  '.c',
+  '.h',
+  '.cpp',
+  '.cc',
+  '.cxx',
+  '.hpp',
+  '.kt',
+  '.kts',
+  '.swift',
+  '.scala',
+  '.sh',
+  '.bash',
+  '.ex',
+  '.exs',
+  '.lua',
+  '.dart',
+  '.zig',
+  '.hs',
+  '.ml',
+  '.mli',
+]);
+
+/**
+ * Classification for a file the native orchestrator dropped.
+ * - `unsupported-by-native`: extension has no Rust extractor (legitimate parser limit).
+ * - `native-extractor-failure`: extension is supported by native but the file was
+ *   still dropped — points at a real bug (read error, parse failure, extractor crash).
+ *
+ * `classifyNativeDrops` picks between the two purely from the file extension:
+ * membership in `NATIVE_SUPPORTED_EXTENSIONS` means `native-extractor-failure`.
+ */
+export type NativeDropReason = 'unsupported-by-native' | 'native-extractor-failure';
+
+/** Result of `classifyNativeDrops`: dropped files grouped by reason and extension. */
+export interface NativeDropClassification {
+  /**
+   * Per-reason → per-extension → list of relative paths that hit that bucket.
+   * Inner keys are the lowercased `path.extname` of each path.
+   */
+  byReason: Record<NativeDropReason, Map<string, string[]>>;
+  /** Total file count per reason (the sum of the list lengths under that reason). */
+  totals: Record<NativeDropReason, number>;
+}
+
+/**
+ * Bucket the files the native orchestrator dropped by reason, then by
+ * lowercased extension, so callers can log per-extension counts and samples.
+ *
+ * A path whose extension is listed in `NATIVE_SUPPORTED_EXTENSIONS` lands in
+ * `native-extractor-failure`; every other path lands in
+ * `unsupported-by-native`. Paths keep their input order inside each bucket.
+ * Only the path strings are inspected, which keeps the function easy to
+ * unit-test apart from the build pipeline.
+ *
+ * @param relPaths Relative paths of the files missing from the native output.
+ * @returns Per-reason extension buckets plus the file count for each reason.
+ */
+export function classifyNativeDrops(relPaths: Iterable<string>): NativeDropClassification {
+  const result: NativeDropClassification = {
+    byReason: {
+      'unsupported-by-native': new Map<string, string[]>(),
+      'native-extractor-failure': new Map<string, string[]>(),
+    },
+    totals: {
+      'unsupported-by-native': 0,
+      'native-extractor-failure': 0,
+    },
+  };
+  for (const relPath of relPaths) {
+    const extension = path.extname(relPath).toLowerCase();
+    let reason: NativeDropReason = 'unsupported-by-native';
+    if (NATIVE_SUPPORTED_EXTENSIONS.has(extension)) {
+      reason = 'native-extractor-failure';
+    }
+    const perExtension = result.byReason[reason];
+    const known = perExtension.get(extension);
+    if (known === undefined) {
+      // First file seen for this extension: start its list.
+      perExtension.set(extension, [relPath]);
+    } else {
+      known.push(relPath);
+    }
+    result.totals[reason] += 1;
+  }
+  return result;
+}
+
+/**
+ * Turn `{ ext → paths[] }` buckets into one log-friendly string made of
+ * `ext (n: sample, ...)` slices joined by `; `.
+ *
+ * At most 6 extensions are listed, each with at most 3 sample paths; anything
+ * beyond either cap is summarised as `+N more` / `+N more extension(s)`.
+ * Extensions with the most files come first, and equal counts keep the Map's
+ * insertion order. An empty Map yields an empty string.
+ *
+ * @param buckets Paths grouped by extension; not modified.
+ * @returns The rendered summary.
+ */
+export function formatDropExtensionSummary(buckets: Map<string, string[]>): string {
+  const extensionCap = 6;
+  const sampleCap = 3;
+  // Sort a fresh array of the entries; the sort is stable, so ties stay in
+  // insertion order.
+  const ranked = [...buckets].sort(([, left], [, right]) => right.length - left.length);
+  const parts: string[] = [];
+  for (const [ext, paths] of ranked) {
+    if (parts.length === extensionCap) break;
+    const samples = paths.slice(0, sampleCap);
+    const hidden = paths.length - sampleCap;
+    if (hidden > 0) samples.push(`+${hidden} more`);
+    parts.push(`${ext} (${paths.length}: ${samples.join(', ')})`);
+  }
+  const omitted = ranked.length - extensionCap;
+  if (omitted > 0) parts.push(`+${omitted} more extension(s)`);
+  return parts.join('; ');
+}
+
 // ── Unified API ──────────────────────────────────────────────────────────────
 
 function resolveEngine(opts: ParseEngineOpts = {}): ResolvedEngine {

diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts
@@ -0,0 +1,204 @@
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { describe, expect, it } from 'vitest';
+import {
+  classifyNativeDrops,
+  formatDropExtensionSummary,
+  NATIVE_SUPPORTED_EXTENSIONS,
+} from '../../src/domain/parser.js';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const REPO_ROOT = path.resolve(__dirname, '..', '..');
+
+/**
+ * Unit tests for `classifyNativeDrops` and the exported
+ * `NATIVE_SUPPORTED_EXTENSIONS` set it consults.
+ */
+describe('classifyNativeDrops', () => {
+  it('groups WASM-only languages under unsupported-by-native', () => {
+    const { byReason, totals } = classifyNativeDrops([
+      'src/a.fs',
+      'src/b.gleam',
+      'src/c.clj',
+      'src/d.jl',
+      'src/e.R',
+      'src/f.erl',
+      'src/g.sol',
+      'src/h.cu',
+      'src/i.groovy',
+      'src/j.v',
+      'src/k.m',
+    ]);
+    expect(totals['unsupported-by-native']).toBe(11);
+    expect(totals['native-extractor-failure']).toBe(0);
+    expect(byReason['unsupported-by-native'].get('.fs')).toEqual(['src/a.fs']);
+    expect(byReason['unsupported-by-native'].get('.gleam')).toEqual(['src/b.gleam']);
+    // Bucket keys are lowercased, so the upper-case `.R` path is stored under `.r`.
+    expect(byReason['unsupported-by-native'].get('.r')).toEqual(['src/e.R']);
+  });
+
+  it('flags natively-supported extensions as native-extractor-failure', () => {
+    const { byReason, totals } = classifyNativeDrops([
+      'src/a.ts',
+      'src/b.py',
+      'src/c.go',
+      'src/d.rs',
+    ]);
+    expect(totals['native-extractor-failure']).toBe(4);
+    expect(totals['unsupported-by-native']).toBe(0);
+    expect(byReason['native-extractor-failure'].get('.ts')).toEqual(['src/a.ts']);
+    expect(byReason['native-extractor-failure'].get('.py')).toEqual(['src/b.py']);
+  });
+
+  it('handles a mix of supported and unsupported extensions', () => {
+    const { byReason, totals } = classifyNativeDrops([
+      'src/a.ts',
+      'src/b.fs',
+      'src/c.fs',
+      'src/d.gleam',
+    ]);
+    expect(totals['native-extractor-failure']).toBe(1);
+    expect(totals['unsupported-by-native']).toBe(3);
+    expect(byReason['unsupported-by-native'].get('.fs')).toEqual(['src/b.fs', 'src/c.fs']);
+    expect(byReason['unsupported-by-native'].get('.gleam')).toEqual(['src/d.gleam']);
+  });
+
+  it('lowercases extensions so .R and .r share a bucket', () => {
+    const { byReason, totals } = classifyNativeDrops(['scripts/a.R', 'scripts/b.r']);
+    expect(totals['unsupported-by-native']).toBe(2);
+    expect(byReason['unsupported-by-native'].get('.r')).toEqual(['scripts/a.R', 'scripts/b.r']);
+  });
+
+  it('returns empty buckets when no files are passed', () => {
+    const { byReason, totals } = classifyNativeDrops([]);
+    expect(totals['native-extractor-failure']).toBe(0);
+    expect(totals['unsupported-by-native']).toBe(0);
+    expect(byReason['native-extractor-failure'].size).toBe(0);
+    expect(byReason['unsupported-by-native'].size).toBe(0);
+  });
+
+  it('exposes the native-supported extension set for callers', () => {
+    expect(NATIVE_SUPPORTED_EXTENSIONS.has('.ts')).toBe(true);
+    expect(NATIVE_SUPPORTED_EXTENSIONS.has('.py')).toBe(true);
+    expect(NATIVE_SUPPORTED_EXTENSIONS.has('.fs')).toBe(false);
+    expect(NATIVE_SUPPORTED_EXTENSIONS.has('.gleam')).toBe(false);
+  });
+});
+
+/**
+ * Unit tests for `formatDropExtensionSummary`: exact output strings, the
+ * per-extension sample cap, the extension cap, and the count-based ordering.
+ */
+describe('formatDropExtensionSummary', () => {
+  it('returns an empty string when no buckets are present', () => {
+    expect(formatDropExtensionSummary(new Map())).toBe('');
+  });
+
+  it('lists every extension when under the cap', () => {
+    const buckets = new Map<string, string[]>([
+      ['.ts', ['a.ts', 'b.ts']],
+      ['.py', ['c.py']],
+    ]);
+    expect(formatDropExtensionSummary(buckets)).toBe('.ts (2: a.ts, b.ts); .py (1: c.py)');
+  });
+
+  it('caps samples per extension at 3 and renders +N more', () => {
+    const buckets = new Map<string, string[]>([['.ts', ['a.ts', 'b.ts', 'c.ts', 'd.ts', 'e.ts']]]);
+    expect(formatDropExtensionSummary(buckets)).toBe('.ts (5: a.ts, b.ts, c.ts, +2 more)');
+  });
+
+  it('shows exactly MAX_SAMPLES samples without a +N suffix when count equals the cap', () => {
+    const buckets = new Map<string, string[]>([['.ts', ['a.ts', 'b.ts', 'c.ts']]]);
+    expect(formatDropExtensionSummary(buckets)).toBe('.ts (3: a.ts, b.ts, c.ts)');
+  });
+
+  it('caps extensions at 6 and renders +N more extension(s)', () => {
+    // 8 extensions, all with 1 file — every count ties and the sort is stable,
+    // so insertion order wins and the first 6 are shown.
+    const buckets = new Map<string, string[]>([
+      ['.a', ['1.a']],
+      ['.b', ['1.b']],
+      ['.c', ['1.c']],
+      ['.d', ['1.d']],
+      ['.e', ['1.e']],
+      ['.f', ['1.f']],
+      ['.g', ['1.g']],
+      ['.h', ['1.h']],
+    ]);
+    const out = formatDropExtensionSummary(buckets);
+    expect(out.endsWith('; +2 more extension(s)')).toBe(true);
+    // First 6 extensions are present, the last 2 (.g, .h) are not.
+    expect(out).toContain('.a (1: 1.a)');
+    expect(out).toContain('.f (1: 1.f)');
+    expect(out).not.toContain('.g (');
+    expect(out).not.toContain('.h (');
+  });
+
+  it('sorts by descending file count so the loudest offender is first', () => {
+    const buckets = new Map<string, string[]>([
+      ['.small', ['x']],
+      ['.huge', ['a', 'b', 'c', 'd']],
+      ['.medium', ['m', 'n']],
+    ]);
+    const out = formatDropExtensionSummary(buckets);
+    const positions = ['.huge', '.medium', '.small'].map((ext) => out.indexOf(ext));
+    expect(positions[0]).toBeLessThan(positions[1]);
+    expect(positions[1]).toBeLessThan(positions[2]);
+  });
+});
+
+/**
+ * Drift guard for `NATIVE_SUPPORTED_EXTENSIONS`.
+ *
+ * Greptile flagged that this set is keyed to one snapshot of
+ * `LanguageKind::from_extension` in the Rust addon, and silent drift between
+ * the JS and Rust sides would mis-classify drops (real native failures shown
+ * as info, parser-limit gaps shown as warn). The native addon doesn't expose
+ * its own metadata, so we parse the Rust source instead and assert the two
+ * lists agree whenever this suite runs. If `parser_registry.rs` is ever
+ * refactored so the extraction below no longer finds the same extensions,
+ * the assertions fail and the maintainer has to revisit this guard.
+ */
+describe('NATIVE_SUPPORTED_EXTENSIONS drift guard', () => {
+  it('matches the extension set in crates/codegraph-core/src/parser_registry.rs', () => {
+    const registryPath = path.join(
+      REPO_ROOT,
+      'crates',
+      'codegraph-core',
+      'src',
+      'parser_registry.rs',
+    );
+    const src = fs.readFileSync(registryPath, 'utf8');
+    const fromExtStart = src.indexOf('pub fn from_extension');
+    expect(fromExtStart, 'from_extension not found in parser_registry.rs').toBeGreaterThan(-1);
+    // Slice from `pub fn from_extension` to the next `///` doc-comment line or
+    // `pub fn` line, whichever comes first, so we don't pick up extensions
+    // from unrelated functions like `from_lang_id` (which contains lang_id
+    // strings that look extension-like, e.g. "javascript", "python").
+    const tail = src.slice(fromExtStart);
+    const nextFnRel = tail.slice(1).search(/\n\s*\/\/\/|\n\s*pub fn /);
+    const body = nextFnRel === -1 ? tail : tail.slice(0, nextFnRel + 1);
+    const rustExts = new Set<string>();
+    // Collect every double-quoted literal made only of letters, digits and dots ("ts", ".tsx", "d.ts", ...).
+    for (const m of body.matchAll(/"([A-Za-z0-9.]+)"/g)) {
+      rustExts.add(m[1]);
+    }
+    // Normalize Rust forms to the JS `.ext` form. The function mixes:
+    //   - prefix branches with leading dot: ".tsx", ".d.ts"
+    //   - `match ext` arms without dot: "ts", "py", "rb", ...
+    // so prepend the dot when it is missing and lowercase the result before
+    // comparing it against the JS set.
+    const normalized = new Set<string>();
+    for (const e of rustExts) {
+      // `.d.ts` declaration files are mapped to TypeScript via a special
+      // prefix branch — JS handles those via the `.ts` entry, so skip the
+      // alias in either matched form.
+      if (e === 'd.ts' || e === '.d.ts') continue;
+      const withDot = e.startsWith('.') ? e : `.${e}`;
+      normalized.add(withDot.toLowerCase());
+    }
+    const jsExts = new Set(NATIVE_SUPPORTED_EXTENSIONS);
+    const onlyInRust = [...normalized].filter((e) => !jsExts.has(e));
+    const onlyInJs = [...jsExts].filter((e) => !normalized.has(e));
+    expect(
+      onlyInRust,
+      `Extensions in parser_registry.rs but missing from NATIVE_SUPPORTED_EXTENSIONS: ${onlyInRust.join(', ')}`,
+    ).toEqual([]);
+    expect(
+      onlyInJs,
+      `Extensions in NATIVE_SUPPORTED_EXTENSIONS but missing from parser_registry.rs: ${onlyInJs.join(', ')}`,
+    ).toEqual([]);
+  });
+});