Skip to content

Commit b8ce77c

Browse files
test: replace hand-picked benchmark with auto-generated one
For every symbol in the graph, generates a natural-language query with splitIdentifier (e.g. buildGraph → "build graph") and checks whether search finds that symbol in the top N. Tests all 286 unique symbols with zero human bias. Results (minilm, 286 symbols) — structured: Hit@1=75.5%, Hit@3=95.8%, 2 misses; source: Hit@1=66.8%, Hit@3=88.1%, 11 misses.
1 parent 56a0517 commit b8ce77c

1 file changed

Lines changed: 123 additions & 72 deletions

File tree

Lines changed: 123 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
#!/usr/bin/env node
22

33
/**
4-
* Embedding strategy benchmark — compares structured vs source strategies
5-
* against real search queries on the current project's graph.
4+
* Embedding strategy benchmark — auto-generated from the graph.
5+
*
6+
* For every function/method/class in the graph, generates a natural language
7+
* query from the symbol name (e.g. buildGraph → "build graph") and checks
8+
* if the embedding search finds that symbol in the top N results.
9+
* No hand-picked queries — zero human bias, tests every symbol.
610
*
711
* Prerequisites:
812
* - @huggingface/transformers installed
@@ -11,114 +15,161 @@
1115
* Usage:
1216
* node tests/search/embedding-benchmark.js
1317
* node tests/search/embedding-benchmark.js --model minilm
18+
* node tests/search/embedding-benchmark.js --limit 50 # test first N symbols
19+
* node tests/search/embedding-benchmark.js --no-tests # exclude test files
1420
*/
1521

1622
import path from 'node:path';
23+
import Database from 'better-sqlite3';
1724
import { buildEmbeddings, DEFAULT_MODEL, MODELS, searchData } from '../../src/embedder.js';
1825

19-
const model = process.argv.includes('--model')
20-
? process.argv[process.argv.indexOf('--model') + 1]
21-
: DEFAULT_MODEL;
26+
// Command-line arguments with the node binary and script path stripped off.
const args = process.argv.slice(2);

/**
 * Read the value that follows a command-line flag.
 *
 * @param {string} flag - Option name to look for, e.g. `'--model'`.
 * @param {string} fallback - Value returned when the flag is absent or has no value.
 * @returns {string} The token after the first occurrence of `flag`, or `fallback`.
 */
const getArg = (flag, fallback) => {
  const idx = args.indexOf(flag);
  if (idx < 0) return fallback;

  const value = args[idx + 1];
  // A missing token, or a token that is itself another `--option`, means the
  // flag was passed without a value (e.g. `--model --no-tests`); do not
  // swallow the next option as if it were the value.
  if (!value || value.startsWith('--')) return fallback;
  return value;
};
31+
32+
// Embedding model key; DEFAULT_MODEL unless overridden with `--model <name>`.
const model = getArg('--model', DEFAULT_MODEL);

// Maximum number of symbols to benchmark (`--limit <n>`); the default '0'
// parses to 0, which is not treated as a cap.
const symbolLimit = Number.parseInt(getArg('--limit', '0'), 10);

// When `--no-tests` is passed, files whose path matches TEST_PATTERN are excluded.
const noTests = args.includes('--no-tests');
const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./;

// Project root handed to the embedding builder, and the graph database path
// resolved against the current working directory.
const rootDir = '.';
const dbPath = path.resolve('.codegraph/graph.db');
2539

26-
// Queries with expected best-match symbol name
27-
const QUERIES = [
28-
{ q: 'parse source code with tree-sitter', expect: 'parseFilesAuto' },
29-
{ q: 'find circular dependencies', expect: 'findCycles' },
30-
{ q: 'build dependency graph from source files', expect: 'buildGraph' },
31-
{ q: 'resolve import path to actual file', expect: 'resolveImportPath' },
32-
{ q: 'cosine similarity between vectors', expect: 'cosineSim' },
33-
{ q: 'export graph as DOT format', expect: 'exportDOT' },
34-
{ q: 'semantic search with embeddings', expect: 'search' },
35-
{ q: 'incremental file hashing', expect: 'hashFile' },
36-
{ q: 'load configuration from file', expect: 'loadConfig' },
37-
{ q: 'extract functions and classes from code', expect: 'extractJavaScript' },
38-
{ q: 'impact analysis of code changes', expect: 'diffImpactData' },
39-
{ q: 'start MCP server for AI agents', expect: 'startMCPServer' },
40-
{ q: 'watch files for changes', expect: 'watchProject' },
41-
{ q: 'reciprocal rank fusion for multi-query search', expect: 'multiSearchData' },
42-
];
43-
44-
async function benchmark(strategy) {
40+
/**
 * Split an identifier into lowercase, space-separated words.
 *
 * Applies the camelCase / acronym / snake_case / kebab-case rules of the
 * `splitIdentifier` in src/embedder.js that this helper is meant to mirror,
 * and additionally splits where a digit is followed by an uppercase letter
 * (`sha256Hash` → "sha256 hash"), which the lowercase-only rule missed.
 *
 * @param {string | null | undefined} name - Identifier such as `buildGraph`.
 * @returns {string} Readable query text, e.g. "build graph"; '' for a nullish name.
 */
function splitIdentifier(name) {
  // Nullish names become '' so callers can filter them out by length instead
  // of crashing on `.replace`.
  return String(name ?? '')
    // lower/digit → Upper boundary: "buildGraph" → "build Graph"
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    // acronym → Word boundary: "HTMLString" → "HTML String"
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
    // snake_case and kebab-case separators
    .replace(/[_-]+/g, ' ')
    .trim()
    .toLowerCase();
}
51+
52+
/**
 * Load all embeddable symbols from the graph and generate one query per symbol.
 *
 * Reads every function, method and class node from the graph database,
 * optionally drops rows whose file matches TEST_PATTERN (`--no-tests`),
 * keeps the first occurrence of each name, and skips names whose generated
 * query is shorter than four characters.
 *
 * @returns {Array<{name: string, kind: string, file: string, query: string}>}
 *   Symbols in (file, line) order, truncated to `symbolLimit` when it is positive.
 */
function loadSymbols() {
  const db = new Database(dbPath, { readonly: true });
  let rows;
  try {
    rows = db
      .prepare(
        `SELECT name, kind, file, line FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`,
      )
      .all();
  } finally {
    // Release the database handle even if the query throws.
    db.close();
  }

  if (noTests) {
    rows = rows.filter((r) => !TEST_PATTERN.test(r.file));
  }

  // Deduplicate by name (same name in different files → keep first)
  const seen = new Set();
  const symbols = [];
  for (const row of rows) {
    if (seen.has(row.name)) continue;
    seen.add(row.name);

    const query = splitIdentifier(row.name);
    // Skip symbols with single-char or very short names (not meaningful queries)
    if (query.length < 4) continue;
    symbols.push({ name: row.name, kind: row.kind, file: row.file, query });
  }

  return symbolLimit > 0 ? symbols.slice(0, symbolLimit) : symbols;
}
83+
84+
/**
 * Run every generated query against the search index for one strategy.
 *
 * Calls `buildEmbeddings` with the given strategy first (presumably rebuilding
 * the embeddings stored at `dbPath` — confirm in src/embedder.js), then
 * searches for each symbol's query and records the 1-based rank at which the
 * symbol's own name appears in the top 10 results.
 *
 * @param {string} strategy - Embedding strategy name passed to `buildEmbeddings`.
 * @param {Array<{name: string, query: string}>} symbols - Symbols to look up.
 * @returns {Promise<{strategy: string, hits1: number, hits3: number, hits5: number,
 *   hits10: number, total: number, misses: Array<{name: string, query: string, top: string}>}>}
 *   Hit counters plus the list of symbols not found in the top 10.
 */
async function benchmark(strategy, symbols) {
  await buildEmbeddings(rootDir, model, dbPath, { strategy });

  let hits1 = 0;
  let hits3 = 0;
  let hits5 = 0;
  let hits10 = 0;
  const misses = [];

  for (let i = 0; i < symbols.length; i++) {
    const { name, query } = symbols[i];
    const data = await searchData(query, dbPath, { minScore: 0.01, limit: 10 });

    // An empty search response is treated as "no results" rather than skipped,
    // so the symbol is recorded as a miss and hits10 + misses always equals total.
    const names = data ? data.results.map((r) => r.name) : [];
    const rank = names.indexOf(name) + 1; // 0 = not found

    if (rank === 1) hits1++;
    if (rank >= 1 && rank <= 3) hits3++;
    if (rank >= 1 && rank <= 5) hits5++;
    if (rank >= 1 && rank <= 10) hits10++;
    if (rank === 0) misses.push({ name, query, top: names[0] || '(none)' });

    // Lightweight progress indicator, rewritten in place every 25 queries.
    if ((i + 1) % 25 === 0) {
      process.stdout.write(` ${strategy}: ${i + 1}/${symbols.length}\r`);
    }
  }

  return { strategy, hits1, hits3, hits5, hits10, total: symbols.length, misses };
}
74114

115+
// ─── Main ──────────────────────────────────────────────────────────────
116+
75117
const modelConfig = MODELS[model];
if (!modelConfig) {
  // Fail with a readable message instead of a TypeError on `modelConfig.dim`.
  console.error(`Unknown model "${model}". Available: ${Object.keys(MODELS).join(', ')}`);
  process.exit(1);
}
const symbols = loadSymbols();

console.log('=== Embedding Strategy Benchmark (auto-generated) ===');
console.log(`Model: ${model} (${modelConfig.dim}d, ${modelConfig.contextWindow} token ctx)`);
console.log(`Symbols: ${symbols.length} unique (query = splitIdentifier of name)`);
console.log('');

// Strategies run one after the other: each run calls buildEmbeddings against
// the same database path before searching.
const structured = await benchmark('structured', symbols);
console.log('');
const source = await benchmark('source', symbols);
console.log('');

// Summary
// Format "hits/total (xx.x%)"; a zero total prints 0.0% rather than NaN%.
const pct = (n, t) => `${n}/${t} (${(t > 0 ? (n / t) * 100 : 0).toFixed(1)}%)`;
// Signed difference between two hit counts, e.g. "+3", "0" or "-2".
const delta = (a, b) => {
  const d = a - b;
  return d > 0 ? `+${d}` : String(d);
};

/**
 * Print up to `max` misses under a heading; prints nothing for an empty list.
 *
 * @param {string} heading - Line shown above the list (the count is appended).
 * @param {Array<{name: string, query: string, top: string}>} list - Misses to show.
 * @param {number} [max=15] - Maximum number of entries printed in full.
 */
function printMisses(heading, list, max = 15) {
  if (list.length === 0) return;
  console.log(`\n${heading} (${list.length}):`);
  for (const m of list.slice(0, max)) {
    console.log(` "${m.query}" → expected: ${m.name}, got: ${m.top}`);
  }
  if (list.length > max) console.log(` ... and ${list.length - max} more`);
}

console.log('=== RESULTS ===');
console.log('');
console.log(`${'Metric'.padEnd(12)}${'structured'.padEnd(20)}${'source'.padEnd(20)}delta`);

for (const [label, key] of [
  ['Hit@1', 'hits1'],
  ['Hit@3', 'hits3'],
  ['Hit@5', 'hits5'],
  ['Hit@10', 'hits10'],
]) {
  console.log(
    `${label.padEnd(12)}${pct(structured[key], structured.total).padEnd(20)}${pct(source[key], source.total).padEnd(20)}${delta(structured[key], source[key])}`,
  );
}

console.log('');
console.log(`Misses: structured=${structured.misses.length}, source=${source.misses.length}`);

// Show misses unique to each strategy
const structMissNames = new Set(structured.misses.map((m) => m.name));
const sourceMissNames = new Set(source.misses.map((m) => m.name));
const onlyStructMiss = structured.misses.filter((m) => !sourceMissNames.has(m.name));
const onlySourceMiss = source.misses.filter((m) => !structMissNames.has(m.name));

printMisses('Structured finds but source misses', onlySourceMiss);
printMisses('Source finds but structured misses', onlyStructMiss);

0 commit comments

Comments
 (0)