mschreib28 · mschreib28 · Apr 26, 2026 · Apr 26, 2026
diff --git a/__tests__/extraction-resolution-accuracy.test.ts b/__tests__/extraction-resolution-accuracy.test.ts
@@ -0,0 +1,266 @@
+/**
+ * Extraction & Resolution Accuracy Tests
+ *
+ * Regression tests for three accuracy bugs fixed in one PR:
+ *   - bug #1: parse-retry comment strip was hardcoded to `//`, no-op on Python/Ruby/etc.
+ *   - bug #4: framework route extractors ran regex over raw file content, matching
+ *     examples in docstrings/comments as real routes.
+ *   - bug #5: UTF-8 BOM caused spurious "modified" hash mismatches between editors.
+ */
+
+import { describe, it, expect } from 'vitest';
+import { stripBom, stripCommentLinesForRetry, stripCommentsForRegex } from '../src/utils';
+import { hashContent } from '../src/extraction';
+import { flaskResolver, fastapiResolver, djangoResolver } from '../src/resolution/frameworks/python';
+import { expressResolver } from '../src/resolution/frameworks/express';
+import { aspnetResolver } from '../src/resolution/frameworks/csharp';
+import { rustResolver } from '../src/resolution/frameworks/rust';
+import { laravelResolver } from '../src/resolution/frameworks/laravel';
+
+describe('UTF-8 BOM normalization (bug #5)', () => {
+  it('stripBom removes leading U+FEFF', () => {
+    expect(stripBom('\uFEFFhello')).toBe('hello'); // BOM written as an escape so it stays visible in source
+    expect(stripBom('hello')).toBe('hello'); // input without a BOM passes through unchanged
+    expect(stripBom('')).toBe('');
+  });
+
+  it('stripBom only removes leading BOM, not embedded ones', () => {
+    expect(stripBom('a\uFEFFb')).toBe('a\uFEFFb'); // U+FEFF in the middle of the string must survive
+  });
+
+  it('hashContent treats BOM and no-BOM as identical', () => {
+    const withBom = '\uFEFFexport function hello() { return 42; }'; // same text, prefixed with an explicit BOM
+    const withoutBom = 'export function hello() { return 42; }';
+    expect(hashContent(withBom)).toBe(hashContent(withoutBom));
+  });
+});
+
+describe('Per-language comment-line stripping (bug #1)', () => {
+  it('strips `#` lines for Python', () => {
+    const input = ['# CHECK: foo', 'def x():', '    pass'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n')).toEqual(['', 'def x():', '    pass']);
+  });
+
+  it('strips `#` lines for Ruby', () => {
+    const input = ['# top comment', 'def x; end'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'ruby');
+    expect(out.split('\n')).toEqual(['', 'def x; end']);
+  });
+
+  it('strips `//` lines for TypeScript', () => {
+    const input = ['// header', 'function x() {}'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'typescript');
+    expect(out.split('\n')).toEqual(['', 'function x() {}']);
+  });
+
+  it('strips both `//` and `#` lines for PHP', () => {
+    const input = ['// js-style', '# perl-style', '<?php $x = 1;'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'php');
+    expect(out.split('\n')).toEqual(['', '', '<?php $x = 1;']);
+  });
+
+  it('returns content unchanged for unknown languages', () => {
+    const input = '// looks like a comment\ncode';
+    expect(stripCommentLinesForRetry(input, 'unknown-lang')).toBe(input);
+  });
+
+  it('preserves line count so node positions stay correct', () => {
+    const input = ['# c1', 'a', '# c2', 'b'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n').length).toBe(input.split('\n').length);
+  });
+
+  it('strips indented `#` comment lines but keeps trailing `#` comments (Python)', () => {
+    // The marker matches optional leading whitespace + `#`, so an indented
+    // pure comment line is stripped (replaced by an empty line). Code followed
+    // by a trailing `#` comment on the same line is intentionally left untouched.
+    const input = ['    # indented comment', '    pass  # trailing'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n')).toEqual(['', '    pass  # trailing']);
+  });
+});
+
+describe('Framework regex no longer matches docstrings/comments (bug #4)', () => {
+  describe('Flask', () => {
+    it('skips routes inside `#` comments', () => {
+      const content = [
+        'from flask import Flask',
+        'app = Flask(__name__)',
+        '# Example: @app.route("/fake")',
+        '@app.route("/real")',
+        'def real(): pass',
+      ].join('\n');
+      const nodes = flaskResolver.extractNodes!('app.py', content);
+      const paths = nodes.map((n) => n.name);
+      expect(paths).toContain('/real');
+      expect(paths).not.toContain('/fake');
+    });
+
+    it('skips routes inside triple-quoted docstrings', () => {
+      const content = [
+        'def example():',
+        '    """',
+        '    Usage: @app.route("/fake")',
+        '    """',
+        '    pass',
+        '@app.route("/real")',
+        'def real(): pass',
+      ].join('\n');
+      const nodes = flaskResolver.extractNodes!('app.py', content);
+      const paths = nodes.map((n) => n.name);
+      expect(paths).toContain('/real');
+      expect(paths).not.toContain('/fake');
+    });
+  });
+
+  describe('FastAPI', () => {
+    it('skips routes inside `#` comments and triple-quoted docstrings', () => {
+      const content = [
+        '"""',
+        'Module docs — example: @app.get("/docfake")',
+        '"""',
+        '# @app.post("/commentfake")',
+        '@app.get("/real")',
+        'def real(): pass',
+      ].join('\n');
+      const nodes = fastapiResolver.extractNodes!('app.py', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+      expect(names.some((n) => n.includes('/commentfake'))).toBe(false);
+    });
+
+    it('preserves correct line numbers for real routes after stripping', () => {
+      const content = [
+        '"""',                    // line 1
+        '@app.get("/fake")',      // line 2 — inside docstring
+        '"""',                    // line 3
+        '',                       // line 4
+        '@app.get("/real")',      // line 5 — real
+      ].join('\n');
+      const nodes = fastapiResolver.extractNodes!('app.py', content);
+      const real = nodes.find((n) => n.name.includes('/real'));
+      expect(real).toBeDefined();
+      expect(real!.startLine).toBe(5);
+    });
+  });
+
+  describe('Django URL patterns', () => {
+    it('skips path() inside `#` comments', () => {
+      const content = [
+        'from django.urls import path',
+        '# example: path("fake/", fake_view)',
+        'urlpatterns = [path("real/", real_view)]',
+      ].join('\n');
+      const nodes = djangoResolver.extractNodes!('urls.py', content);
+      const names = nodes.map((n) => n.name);
+      expect(names).toContain('real/');
+      expect(names).not.toContain('fake/');
+    });
+  });
+
+  describe('Express', () => {
+    it('skips routes inside `//` comments', () => {
+      const content = [
+        'const app = express();',
+        '// app.get("/fake", fakeHandler);',
+        'app.get("/real", realHandler);',
+      ].join('\n');
+      const nodes = expressResolver.extractNodes!('server.js', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/fake'))).toBe(false);
+    });
+
+    it('skips routes inside `/* ... */` block comments', () => {
+      const content = [
+        '/*',
+        ' * app.post("/blockfake", h);',
+        ' */',
+        'app.get("/real", h);',
+      ].join('\n');
+      const nodes = expressResolver.extractNodes!('server.js', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/blockfake'))).toBe(false);
+    });
+  });
+
+  describe('Laravel', () => {
+    it('skips routes inside PHP `//` and `#` comments', () => {
+      const content = [
+        '<?php',
+        '// Route::get("/jsfake", $h);',
+        '# Route::get("/perlfake", $h);',
+        'Route::get("/real", $h);',
+      ].join('\n');
+      const nodes = laravelResolver.extractNodes!('routes.php', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/jsfake'))).toBe(false);
+      expect(names.some((n) => n.includes('/perlfake'))).toBe(false);
+    });
+  });
+
+  describe('Rust', () => {
+    it('skips actix/rocket routes inside `///` doc comments', () => {
+      const content = [
+        '/// Example route: #[get("/docfake")]',
+        '#[get("/real")]',
+        'fn real() {}',
+      ].join('\n');
+      const nodes = rustResolver.extractNodes!('main.rs', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+    });
+  });
+
+  describe('ASP.NET (C#)', () => {
+    it('skips route attributes inside `///` XML doc comments', () => {
+      const content = [
+        '/// <summary>',
+        '/// Example: [HttpGet("/docfake")]',
+        '/// </summary>',
+        '[HttpGet("/real")]',
+        'public class C {}',
+      ].join('\n');
+      const nodes = aspnetResolver.extractNodes!('Controller.cs', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+    });
+
+    it('skips minimal-API MapGet/MapPost calls inside comments', () => {
+      // Regression: in `aspnetResolver.extractNodes`, the minimalApiPattern loop
+      // (which follows the routePatterns loop) was initially missed when applying the
+      // strip helper, leaving commented-out `app.MapGet("/x")` calls extracted as real routes.
+      const content = [
+        '// app.MapGet("/linefake", h);',
+        '/*',
+        ' * app.MapPost("/blockfake", h);',
+        ' */',
+        'app.MapGet("/real", h);',
+      ].join('\n');
+      const nodes = aspnetResolver.extractNodes!('Program.cs', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/linefake'))).toBe(false);
+      expect(names.some((n) => n.includes('/blockfake'))).toBe(false);
+    });
+  });
+});
+
+describe('stripCommentsForRegex preserves line offsets', () => {
+  it('keeps newlines so match.index → original line number', () => {
+    const input = '"""\n@app.get("/x")\n"""\n@app.get("/y")';
+    const out = stripCommentsForRegex(input, 'python');
+    // Newlines preserved
+    expect(out.split('\n').length).toBe(input.split('\n').length);
+    // The /y route survives
+    expect(out).toContain('/y');
+    // The docstring contents are blanked
+    expect(out).not.toContain('/x');
+  });
+});
diff --git a/src/extraction/index.ts b/src/extraction/index.ts
@@ -20,7 +20,7 @@ import { QueryBuilder } from '../db/queries';
 import { extractFromSource } from './tree-sitter';
 import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars';
 import { logDebug, logWarn } from '../errors';
-import { validatePathWithinRoot, normalizePath } from '../utils';
+import { validatePathWithinRoot, normalizePath, stripBom, stripCommentLinesForRetry } from '../utils';
 import picomatch from 'picomatch';
 
 /**
@@ -85,10 +85,15 @@ export interface SyncResult {
 }
 
 /**
- * Calculate SHA256 hash of file contents
+ * Calculate SHA256 hash of file contents.
+ *
+ * A leading UTF-8 BOM is stripped before hashing so files round-tripped
+ * through editors that disagree about BOM handling (VSCode strips by
+ * default; some Windows editors preserve it) hash identically and don't
+ * appear "modified" on every sync.
  */
 export function hashContent(content: string): string {
-  return crypto.createHash('sha256').update(content).digest('hex');
+  return crypto.createHash('sha256').update(stripBom(content)).digest('hex');
 }
 
 /**
@@ -820,11 +825,12 @@ export class ExtractionOrchestrator {
           }
 
           // Strip lines that are entirely comments (preserving line numbers
-          // by replacing with empty lines so node positions stay correct)
-          const stripped = fullContent
-            .split('\n')
-            .map(line => /^\s*\/\//.test(line) ? '' : line)
-            .join('\n');
+          // by replacing with empty lines so node positions stay correct).
+          // The marker is language-specific — the previous hardcoded `//`
+          // was a no-op for Python (`#`), Ruby (`#`), etc., so those files
+          // would silently keep failing on the retry.
+          const language = detectLanguage(filePath, fullContent);
+          const stripped = stripCommentLinesForRetry(fullContent, language);
 
           let result: ExtractionResult;
           try {
@@ -834,7 +840,6 @@ export class ExtractionOrchestrator {
           }
 
           if (result.nodes.length > 0 || result.errors.length === 0) {
-            const language = detectLanguage(filePath, fullContent);
             const stats = await fsp.stat(path.join(this.rootDir, filePath));
             this.storeExtractionResult(filePath, fullContent, language, stats, result);
 

diff --git a/src/resolution/frameworks/csharp.ts b/src/resolution/frameworks/csharp.ts
@@ -6,6 +6,7 @@
 
 import { Node } from '../../types';
 import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types';
+import { stripCommentsForRegex } from '../../utils';
 
 export const aspnetResolver: FrameworkResolver = {
   name: 'aspnet',
@@ -117,6 +118,9 @@ export const aspnetResolver: FrameworkResolver = {
   extractNodes(filePath: string, content: string): Node[] {
     const nodes: Node[] = [];
     const now = Date.now();
+    // Strip `//` and `/* */` comments so XML-doc examples like
+    // `/// [HttpGet("/x")]` aren't treated as real route attributes.
+    const safe = stripCommentsForRegex(content, 'csharp');
 
     // Extract route attributes
     // [HttpGet("path")], [HttpPost("path")], [Route("path")]
@@ -128,8 +132,8 @@ export const aspnetResolver: FrameworkResolver = {
 
     for (const pattern of routePatterns) {
       let match;
-      while ((match = pattern.exec(content)) !== null) {
-        const line = content.slice(0, match.index).split('\n').length;
+      while ((match = pattern.exec(safe)) !== null) {
+        const line = safe.slice(0, match.index).split('\n').length;
 
         if (pattern.source.includes('Http')) {
           if (match[3]) {
@@ -190,9 +194,9 @@ export const aspnetResolver: FrameworkResolver = {
     const minimalApiPattern = /\.Map(Get|Post|Put|Patch|Delete)\s*\(\s*["']([^"']+)["']/g;
 
     let match;
-    while ((match = minimalApiPattern.exec(content)) !== null) {
+    while ((match = minimalApiPattern.exec(safe)) !== null) {
       const [, method, path] = match;
-      const line = content.slice(0, match.index).split('\n').length;
+      const line = safe.slice(0, match.index).split('\n').length;
 
       nodes.push({
         id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`,

diff --git a/src/resolution/frameworks/express.ts b/src/resolution/frameworks/express.ts
@@ -6,6 +6,7 @@
 
 import { Node } from '../../types';
 import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types';
+import { stripCommentsForRegex } from '../../utils';
 
 export const expressResolver: FrameworkResolver = {
   name: 'express',
@@ -93,6 +94,9 @@ export const expressResolver: FrameworkResolver = {
   extractNodes(filePath: string, content: string): Node[] {
     const nodes: Node[] = [];
     const now = Date.now();
+    // Neutralize comments and JSDoc blocks so an `app.get('/x')` example in
+    // a comment isn't extracted as a real route.
+    const safe = stripCommentsForRegex(content, 'javascript');
 
     // Extract route definitions
     // app.get('/path', handler) or router.get('/path', handler)
@@ -102,9 +106,9 @@ export const expressResolver: FrameworkResolver = {
 
     for (const pattern of routePatterns) {
       let match;
-      while ((match = pattern.exec(content)) !== null) {
+      while ((match = pattern.exec(safe)) !== null) {
         const [, _obj, method, path] = match;
-        const line = content.slice(0, match.index).split('\n').length;
+        const line = safe.slice(0, match.index).split('\n').length;
 
         // Skip middleware use() without paths
         if (method === 'use' && !path?.startsWith('/')) {