Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
266 changes: 266 additions & 0 deletions __tests__/extraction-resolution-accuracy.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
/**
* Extraction & Resolution Accuracy Tests
*
* Regression tests for three accuracy bugs fixed in one PR. The bug numbers
* below match the ones used in the `describe` titles of this file:
* - Bug #1: Parse-retry comment strip was hardcoded to `//`, a no-op on
* Python/Ruby/etc.
* - Bug #4: Framework route extractors ran regex over raw file content,
* matching examples in docstrings/comments as real routes.
* - Bug #5: UTF-8 BOM caused spurious "modified" hash mismatches between editors.
*/

import { describe, it, expect } from 'vitest';
import { stripBom, stripCommentLinesForRetry, stripCommentsForRegex } from '../src/utils';
import { hashContent } from '../src/extraction';
import { flaskResolver, fastapiResolver, djangoResolver } from '../src/resolution/frameworks/python';
import { expressResolver } from '../src/resolution/frameworks/express';
import { aspnetResolver } from '../src/resolution/frameworks/csharp';
import { rustResolver } from '../src/resolution/frameworks/rust';
import { laravelResolver } from '../src/resolution/frameworks/laravel';

/**
 * Bug #5: a leading UTF-8 byte-order mark must not affect content or hashes.
 *
 * Every BOM in these fixtures is written as an explicit `\uFEFF` escape.
 * A literal BOM character is invisible and is easily dropped by editors,
 * formatters and copy/paste, which would leave the tests comparing two
 * identical strings and passing vacuously.
 */
describe('UTF-8 BOM normalization (bug #5)', () => {
  const BOM = '\uFEFF';

  it('stripBom removes leading U+FEFF', () => {
    // BOM-prefixed input loses the BOM...
    expect(stripBom(`${BOM}hello`)).toBe('hello');
    // ...while BOM-free and empty input pass through untouched.
    expect(stripBom('hello')).toBe('hello');
    expect(stripBom('')).toBe('');
  });

  it('stripBom only removes leading BOM, not embedded ones', () => {
    const embedded = `a${BOM}b`;
    expect(stripBom(embedded)).toBe(embedded);
  });

  it('hashContent treats BOM and no-BOM as identical', () => {
    const withoutBom = 'export function hello() { return 42; }';
    const withBom = BOM + withoutBom;
    // Guard: fail loudly if the fixture ever loses its BOM, instead of
    // letting the hash comparison below succeed trivially.
    expect(withBom).not.toBe(withoutBom);
    expect(hashContent(withBom)).toBe(hashContent(withoutBom));
  });
});

/**
 * Bug #1: the parse-retry comment strip must use the comment marker of the
 * file's language instead of a hardcoded `//`.
 *
 * Each case feeds `stripCommentLinesForRetry` a small multi-line snippet and
 * checks the result line by line: stripped lines become empty strings, so
 * the number of lines never changes.
 */
describe('Per-language comment-line stripping (bug #1)', () => {
  it('strips `#` lines for Python', () => {
    const input = ['# CHECK: foo', 'def x():', ' pass'].join('\n');
    const out = stripCommentLinesForRetry(input, 'python');
    expect(out.split('\n')).toEqual(['', 'def x():', ' pass']);
  });

  it('strips `#` lines for Ruby', () => {
    const input = ['# top comment', 'def x; end'].join('\n');
    const out = stripCommentLinesForRetry(input, 'ruby');
    expect(out.split('\n')).toEqual(['', 'def x; end']);
  });

  it('strips `//` lines for TypeScript', () => {
    const input = ['// header', 'function x() {}'].join('\n');
    const out = stripCommentLinesForRetry(input, 'typescript');
    expect(out.split('\n')).toEqual(['', 'function x() {}']);
  });

  it('strips both `//` and `#` lines for PHP', () => {
    const input = ['// js-style', '# perl-style', '<?php $x = 1;'].join('\n');
    const out = stripCommentLinesForRetry(input, 'php');
    expect(out.split('\n')).toEqual(['', '', '<?php $x = 1;']);
  });

  it('returns content unchanged for unknown languages', () => {
    const input = '// looks like a comment\ncode';
    expect(stripCommentLinesForRetry(input, 'unknown-lang')).toBe(input);
  });

  it('preserves line count so node positions stay correct', () => {
    const input = ['# c1', 'a', '# c2', 'b'].join('\n');
    const out = stripCommentLinesForRetry(input, 'python');
    expect(out.split('\n').length).toBe(input.split('\n').length);
  });

  // The previous title claimed the indented `#` line is NOT stripped, which
  // contradicted the assertion below; the title now states what is checked.
  it('strips indented `#` comment lines but keeps trailing `#` comments for Python', () => {
    // The marker matches optional leading whitespace + `#`, so an indented
    // pure comment line is stripped. A line that carries code before the
    // `#` (mid-line comment) is intentionally left alone.
    const input = [' # indented comment', ' pass # trailing'].join('\n');
    const out = stripCommentLinesForRetry(input, 'python');
    expect(out.split('\n')).toEqual(['', ' pass # trailing']);
  });
});

describe('Framework regex no longer matches docstrings/comments (bug #4)', () => {
/**
 * Flask: `@app.route(...)` examples inside comments or docstrings must not
 * be extracted as routes, while the real decorator still is.
 */
describe('Flask', () => {
  // Joins `lines` into an `app.py` source, runs the Flask extractor over it
  // and returns the names of the extracted nodes.
  const extractNames = (lines: string[]) => {
    const source = lines.join('\n');
    return flaskResolver.extractNodes!('app.py', source).map((node) => node.name);
  };

  it('skips routes inside `#` comments', () => {
    const extracted = extractNames([
      'from flask import Flask',
      'app = Flask(__name__)',
      '# Example: @app.route("/fake")',
      '@app.route("/real")',
      'def real(): pass',
    ]);

    expect(extracted).toContain('/real');
    expect(extracted).not.toContain('/fake');
  });

  it('skips routes inside triple-quoted docstrings', () => {
    const extracted = extractNames([
      'def example():',
      ' """',
      ' Usage: @app.route("/fake")',
      ' """',
      ' pass',
      '@app.route("/real")',
      'def real(): pass',
    ]);

    expect(extracted).toContain('/real');
    expect(extracted).not.toContain('/fake');
  });
});

/**
 * FastAPI: decorators quoted in comments/docstrings are ignored, and the
 * line number reported for a real route is unaffected by the stripping.
 */
describe('FastAPI', () => {
  // Runs the FastAPI extractor over `lines` joined into an `app.py` source.
  const extract = (lines: string[]) =>
    fastapiResolver.extractNodes!('app.py', lines.join('\n'));

  it('skips routes inside `#` comments and triple-quoted docstrings', () => {
    const routeNames = extract([
      '"""',
      'Module docs — example: @app.get("/docfake")',
      '"""',
      '# @app.post("/commentfake")',
      '@app.get("/real")',
      'def real(): pass',
    ]).map((node) => node.name);
    // True when any extracted name contains `fragment`.
    const seen = (fragment: string) => routeNames.some((name) => name.includes(fragment));

    expect(seen('/real')).toBe(true);
    expect(seen('/docfake')).toBe(false);
    expect(seen('/commentfake')).toBe(false);
  });

  it('preserves correct line numbers for real routes after stripping', () => {
    const extracted = extract([
      '"""', // line 1
      '@app.get("/fake")', // line 2 — inside docstring
      '"""', // line 3
      '', // line 4
      '@app.get("/real")', // line 5 — real
    ]);
    const realRoute = extracted.find((node) => node.name.includes('/real'));

    expect(realRoute).toBeDefined();
    expect(realRoute!.startLine).toBe(5);
  });
});

/**
 * Django: a `path(...)` call that only appears in a `#` comment is not a
 * URL pattern; the one inside `urlpatterns` is.
 */
describe('Django URL patterns', () => {
  it('skips path() inside `#` comments', () => {
    const source = [
      'from django.urls import path',
      '# example: path("fake/", fake_view)',
      'urlpatterns = [path("real/", real_view)]',
    ].join('\n');

    const extracted = djangoResolver.extractNodes!('urls.py', source);
    const patternNames = extracted.map((node) => node.name);

    expect(patternNames).toContain('real/');
    expect(patternNames).not.toContain('fake/');
  });
});

/**
 * Express: route calls that are commented out, with `//` or inside a
 * block comment, are ignored; live `app.get(...)` calls are extracted.
 */
describe('Express', () => {
  // Runs the Express extractor over `lines` joined into a `server.js`
  // source and returns the names of the extracted nodes.
  const extractNames = (lines: string[]) =>
    expressResolver.extractNodes!('server.js', lines.join('\n')).map((node) => node.name);

  it('skips routes inside `//` comments', () => {
    const routeNames = extractNames([
      'const app = express();',
      '// app.get("/fake", fakeHandler);',
      'app.get("/real", realHandler);',
    ]);
    const seen = (fragment: string) => routeNames.some((name) => name.includes(fragment));

    expect(seen('/real')).toBe(true);
    expect(seen('/fake')).toBe(false);
  });

  it('skips routes inside `/* ... */` block comments', () => {
    const routeNames = extractNames([
      '/*',
      ' * app.post("/blockfake", h);',
      ' */',
      'app.get("/real", h);',
    ]);
    const seen = (fragment: string) => routeNames.some((name) => name.includes(fragment));

    expect(seen('/real')).toBe(true);
    expect(seen('/blockfake')).toBe(false);
  });
});

/**
 * Laravel: `Route::get(...)` calls commented out with either PHP
 * line-comment marker (`//` or `#`) are not extracted.
 */
describe('Laravel', () => {
  it('skips routes inside PHP `//` and `#` comments', () => {
    const source = [
      '<?php',
      '// Route::get("/jsfake", $h);',
      '# Route::get("/perlfake", $h);',
      'Route::get("/real", $h);',
    ].join('\n');

    const routeNames = laravelResolver
      .extractNodes!('routes.php', source)
      .map((node) => node.name);
    // True when any extracted name contains `fragment`.
    const seen = (fragment: string) => routeNames.some((name) => name.includes(fragment));

    expect(seen('/real')).toBe(true);
    expect(seen('/jsfake')).toBe(false);
    expect(seen('/perlfake')).toBe(false);
  });
});

/**
 * Rust: a route attribute quoted inside a `///` doc comment is not a
 * route; the attribute attached to the function is.
 */
describe('Rust', () => {
  it('skips actix/rocket routes inside `///` doc comments', () => {
    const source = [
      '/// Example route: #[get("/docfake")]',
      '#[get("/real")]',
      'fn real() {}',
    ].join('\n');

    const routeNames = rustResolver.extractNodes!('main.rs', source).map((node) => node.name);
    // True when any extracted name contains `fragment`.
    const seen = (fragment: string) => routeNames.some((name) => name.includes(fragment));

    expect(seen('/real')).toBe(true);
    expect(seen('/docfake')).toBe(false);
  });
});

/**
 * ASP.NET: route attributes and minimal-API `Map*` calls that appear only
 * inside comments must not be extracted as routes.
 */
describe('ASP.NET (C#)', () => {
  // Runs the ASP.NET extractor over `lines` joined into a source named
  // `fileName` and returns the names of the extracted nodes.
  const extractNames = (fileName: string, lines: string[]) =>
    aspnetResolver.extractNodes!(fileName, lines.join('\n')).map((node) => node.name);

  it('skips route attributes inside `///` XML doc comments', () => {
    const routeNames = extractNames('Controller.cs', [
      '/// <summary>',
      '/// Example: [HttpGet("/docfake")]',
      '/// </summary>',
      '[HttpGet("/real")]',
      'public class C {}',
    ]);
    const seen = (fragment: string) => routeNames.some((name) => name.includes(fragment));

    expect(seen('/real')).toBe(true);
    expect(seen('/docfake')).toBe(false);
  });

  it('skips minimal-API MapGet/MapPost calls inside comments', () => {
    // Regression: the strip helper was first applied to the routePatterns
    // loop only. The minimalApiPattern loop that follows it kept scanning
    // raw content, so commented-out `app.MapGet("/x")` calls were still
    // extracted as real routes.
    const routeNames = extractNames('Program.cs', [
      '// app.MapGet("/linefake", h);',
      '/*',
      ' * app.MapPost("/blockfake", h);',
      ' */',
      'app.MapGet("/real", h);',
    ]);
    const seen = (fragment: string) => routeNames.some((name) => name.includes(fragment));

    expect(seen('/real')).toBe(true);
    expect(seen('/linefake')).toBe(false);
    expect(seen('/blockfake')).toBe(false);
  });
});
});

/**
 * `stripCommentsForRegex` must blank comment/docstring text without
 * removing newlines, so a regex match index in the stripped text still
 * maps to the same line number in the original text.
 */
describe('stripCommentsForRegex preserves line offsets', () => {
  it('keeps newlines so match.index → original line number', () => {
    const source = '"""\n@app.get("/x")\n"""\n@app.get("/y")';
    const blanked = stripCommentsForRegex(source, 'python');
    const countLines = (text: string) => text.split('\n').length;

    // Same number of lines before and after stripping
    expect(countLines(blanked)).toBe(countLines(source));
    // The route outside the docstring is still present
    expect(blanked).toContain('/y');
    // The route inside the docstring has been blanked out
    expect(blanked).not.toContain('/x');
  });
});
23 changes: 14 additions & 9 deletions src/extraction/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import { QueryBuilder } from '../db/queries';
import { extractFromSource } from './tree-sitter';
import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars';
import { logDebug, logWarn } from '../errors';
import { validatePathWithinRoot, normalizePath } from '../utils';
import { validatePathWithinRoot, normalizePath, stripBom, stripCommentLinesForRetry } from '../utils';
import picomatch from 'picomatch';

/**
Expand Down Expand Up @@ -85,10 +85,15 @@ export interface SyncResult {
}

/**
 * Calculate the SHA-256 hash of file contents.
 *
 * A leading UTF-8 BOM is stripped before hashing so files round-tripped
 * through editors that disagree about BOM handling (VSCode strips by
 * default; some Windows editors preserve it) hash identically and don't
 * appear "modified" on every sync.
 *
 * @param content - File contents to hash.
 * @returns Hex-encoded SHA-256 digest of `content` after `stripBom`.
 */
export function hashContent(content: string): string {
  // Single return path: an earlier `return` that hashed the raw content
  // would make the BOM normalization unreachable.
  return crypto.createHash('sha256').update(stripBom(content)).digest('hex');
}

/**
Expand Down Expand Up @@ -820,11 +825,12 @@ export class ExtractionOrchestrator {
}

// Strip lines that are entirely comments (preserving line numbers
// by replacing with empty lines so node positions stay correct)
const stripped = fullContent
.split('\n')
.map(line => /^\s*\/\//.test(line) ? '' : line)
.join('\n');
// by replacing with empty lines so node positions stay correct).
// The marker is language-specific — the previous hardcoded `//`
// was a no-op for Python (`#`), Ruby (`#`), etc., so those files
// would silently keep failing on the retry.
const language = detectLanguage(filePath, fullContent);
const stripped = stripCommentLinesForRetry(fullContent, language);

let result: ExtractionResult;
try {
Expand All @@ -834,7 +840,6 @@ export class ExtractionOrchestrator {
}

if (result.nodes.length > 0 || result.errors.length === 0) {
const language = detectLanguage(filePath, fullContent);
const stats = await fsp.stat(path.join(this.rootDir, filePath));
this.storeExtractionResult(filePath, fullContent, language, stats, result);

Expand Down
12 changes: 8 additions & 4 deletions src/resolution/frameworks/csharp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import { Node } from '../../types';
import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types';
import { stripCommentsForRegex } from '../../utils';

export const aspnetResolver: FrameworkResolver = {
name: 'aspnet',
Expand Down Expand Up @@ -117,6 +118,9 @@ export const aspnetResolver: FrameworkResolver = {
extractNodes(filePath: string, content: string): Node[] {
const nodes: Node[] = [];
const now = Date.now();
// Strip `//` and `/* */` comments so XML-doc examples like
// `/// [HttpGet("/x")]` aren't treated as real route attributes.
const safe = stripCommentsForRegex(content, 'csharp');

// Extract route attributes
// [HttpGet("path")], [HttpPost("path")], [Route("path")]
Expand All @@ -128,8 +132,8 @@ export const aspnetResolver: FrameworkResolver = {

for (const pattern of routePatterns) {
let match;
while ((match = pattern.exec(content)) !== null) {
const line = content.slice(0, match.index).split('\n').length;
while ((match = pattern.exec(safe)) !== null) {
const line = safe.slice(0, match.index).split('\n').length;

if (pattern.source.includes('Http')) {
if (match[3]) {
Expand Down Expand Up @@ -190,9 +194,9 @@ export const aspnetResolver: FrameworkResolver = {
const minimalApiPattern = /\.Map(Get|Post|Put|Patch|Delete)\s*\(\s*["']([^"']+)["']/g;

let match;
while ((match = minimalApiPattern.exec(content)) !== null) {
while ((match = minimalApiPattern.exec(safe)) !== null) {
const [, method, path] = match;
const line = content.slice(0, match.index).split('\n').length;
const line = safe.slice(0, match.index).split('\n').length;

nodes.push({
id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`,
Expand Down
8 changes: 6 additions & 2 deletions src/resolution/frameworks/express.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import { Node } from '../../types';
import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types';
import { stripCommentsForRegex } from '../../utils';

export const expressResolver: FrameworkResolver = {
name: 'express',
Expand Down Expand Up @@ -93,6 +94,9 @@ export const expressResolver: FrameworkResolver = {
extractNodes(filePath: string, content: string): Node[] {
const nodes: Node[] = [];
const now = Date.now();
// Neutralize comments and JSDoc blocks so a `app.get('/x')` example in
// a comment isn't extracted as a real route.
const safe = stripCommentsForRegex(content, 'javascript');

// Extract route definitions
// app.get('/path', handler) or router.get('/path', handler)
Expand All @@ -102,9 +106,9 @@ export const expressResolver: FrameworkResolver = {

for (const pattern of routePatterns) {
let match;
while ((match = pattern.exec(content)) !== null) {
while ((match = pattern.exec(safe)) !== null) {
const [, _obj, method, path] = match;
const line = content.slice(0, match.index).split('\n').length;
const line = safe.slice(0, match.index).split('\n').length;

// Skip middleware use() without paths
if (method === 'use' && !path?.startsWith('/')) {
Expand Down
Loading