Skip to content

Commit e012426

Browse files
feat: add embedding benchmark CI + rename build benchmarks
- Rename generated/BENCHMARKS.md → generated/BUILD-BENCHMARKS.md
- Add scripts/embedding-benchmark.js: CI runner that tests all models against auto-generated queries from every symbol in the graph
- Add scripts/update-embedding-report.js: generates historical EMBEDDING-BENCHMARKS.md with per-model Hit@1/3/5 trends
- Update benchmark.yml with separate build + embedding jobs
- Embedding job caches HF models, skips jina-code without HF_TOKEN
- Add HF_TOKEN secret support for gated model access
- Remove tests/search/embedding-benchmark.js (superseded by scripts/)
1 parent b8ce77c commit e012426

7 files changed

Lines changed: 373 additions & 193 deletions

File tree

.github/workflows/benchmark.yml

Lines changed: 87 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99
permissions: {}
1010

1111
jobs:
12-
benchmark:
12+
build-benchmark:
1313
runs-on: ubuntu-latest
1414
if: >-
1515
github.event_name == 'workflow_dispatch' ||
@@ -31,16 +31,22 @@ jobs:
3131

3232
- run: npm install
3333

34-
- name: Run benchmark
34+
- name: Run build benchmark
3535
run: node scripts/benchmark.js 2>/dev/null > benchmark-result.json
3636

37-
- name: Update report
37+
- name: Update build report
3838
run: node scripts/update-benchmark-report.js benchmark-result.json
3939

40+
- name: Upload build result
41+
uses: actions/upload-artifact@v4
42+
with:
43+
name: build-benchmark-result
44+
path: benchmark-result.json
45+
4046
- name: Check for changes
4147
id: changes
4248
run: |
43-
if git diff --quiet HEAD -- generated/BENCHMARKS.md README.md; then
49+
if git diff --quiet HEAD -- generated/BUILD-BENCHMARKS.md README.md; then
4450
echo "changed=false" >> "$GITHUB_OUTPUT"
4551
else
4652
echo "changed=true" >> "$GITHUB_OUTPUT"
@@ -54,20 +60,89 @@ jobs:
5460
git config user.name "github-actions[bot]"
5561
git config user.email "github-actions[bot]@users.noreply.github.com"
5662
57-
BRANCH="benchmark/update-$(date +%Y%m%d-%H%M%S)"
63+
BRANCH="benchmark/build-$(date +%Y%m%d-%H%M%S)"
5864
git checkout -b "$BRANCH"
59-
git add generated/BENCHMARKS.md README.md
60-
git commit -m "docs: update performance benchmarks"
65+
git add generated/BUILD-BENCHMARKS.md README.md
66+
git commit -m "docs: update build performance benchmarks"
6167
git push origin "$BRANCH"
6268
6369
gh pr create \
6470
--base main \
6571
--head "$BRANCH" \
66-
--title "docs: update performance benchmarks" \
67-
--body "Automated benchmark update from workflow run [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})."
72+
--title "docs: update build performance benchmarks" \
73+
--body "Automated build benchmark update from workflow run [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})."
74+
75+
embedding-benchmark:
76+
runs-on: ubuntu-latest
77+
if: >-
78+
github.event_name == 'workflow_dispatch' ||
79+
github.event.workflow_run.conclusion == 'success'
80+
permissions:
81+
contents: write
82+
pull-requests: write
83+
84+
steps:
85+
- uses: actions/checkout@v4
86+
with:
87+
fetch-depth: 0
88+
ref: main
89+
token: ${{ secrets.GITHUB_TOKEN }}
90+
91+
- uses: actions/setup-node@v4
92+
with:
93+
node-version: "22"
94+
95+
- run: npm install
96+
97+
- name: Cache HuggingFace models
98+
uses: actions/cache@v4
99+
with:
100+
path: ~/.cache/huggingface
101+
key: hf-models-${{ runner.os }}-${{ hashFiles('src/embedder.js') }}
102+
restore-keys: hf-models-${{ runner.os }}-
103+
104+
- name: Build graph
105+
run: node src/cli.js build .
68106

69-
- name: Upload result artifact
107+
- name: Run embedding benchmark
108+
env:
109+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
110+
run: node scripts/embedding-benchmark.js 2>/dev/null > embedding-benchmark-result.json
111+
112+
- name: Update embedding report
113+
run: node scripts/update-embedding-report.js embedding-benchmark-result.json
114+
115+
- name: Upload embedding result
70116
uses: actions/upload-artifact@v4
71117
with:
72-
name: benchmark-result
73-
path: benchmark-result.json
118+
name: embedding-benchmark-result
119+
path: embedding-benchmark-result.json
120+
121+
- name: Check for changes
122+
id: changes
123+
run: |
124+
if git diff --quiet HEAD -- generated/EMBEDDING-BENCHMARKS.md; then
125+
echo "changed=false" >> "$GITHUB_OUTPUT"
126+
else
127+
echo "changed=true" >> "$GITHUB_OUTPUT"
128+
fi
129+
130+
- name: Commit and push via PR
131+
if: steps.changes.outputs.changed == 'true'
132+
env:
133+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
134+
run: |
135+
git config user.name "github-actions[bot]"
136+
git config user.email "github-actions[bot]@users.noreply.github.com"
137+
138+
BRANCH="benchmark/embedding-$(date +%Y%m%d-%H%M%S)"
139+
git checkout -b "$BRANCH"
140+
git add generated/EMBEDDING-BENCHMARKS.md
141+
git commit -m "docs: update embedding benchmarks"
142+
git push origin "$BRANCH"
143+
144+
gh pr create \
145+
--base main \
146+
--head "$BRANCH" \
147+
--title "docs: update embedding benchmarks" \
148+
--body "Automated embedding benchmark update from workflow run [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})."

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ Codegraph also extracts symbols from common callback patterns: Commander `.comma
373373

374374
## 📊 Performance
375375

376-
Self-measured on every release via CI ([full history](generated/BENCHMARKS.md)):
376+
Self-measured on every release via CI ([build benchmarks](generated/BUILD-BENCHMARKS.md) | [embedding benchmarks](generated/EMBEDDING-BENCHMARKS.md)):
377377

378378
| Metric | Latest |
379379
|---|---|
File renamed without changes.

scripts/embedding-benchmark.js

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
#!/usr/bin/env node
2+
3+
/**
4+
* Embedding benchmark runner — measures search recall across all models.
5+
*
6+
* For every function/method/class in the graph, generates a query from the
7+
* symbol name (splitIdentifier) and checks if search finds that symbol.
8+
* Tests all available embedding models, outputs JSON to stdout.
9+
*
10+
* Skips jina-code when HF_TOKEN is not set (gated model).
11+
*
12+
* Usage: node scripts/embedding-benchmark.js > result.json
13+
*/
14+
15+
import fs from 'node:fs';
16+
import path from 'node:path';
17+
import { performance } from 'node:perf_hooks';
18+
import { fileURLToPath } from 'node:url';
19+
import Database from 'better-sqlite3';
20+
21+
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
22+
// Parent of the directory containing this script; package.json and .codegraph/ are resolved from here.
const root = path.resolve(__dirname, '..');
23+
24+
// package.json is read for its `version`, which is included in the JSON output.
const pkg = JSON.parse(fs.readFileSync(path.join(root, 'package.json'), 'utf8'));
25+
// Graph database read by loadSymbols() and passed to buildEmbeddings()/searchData().
const dbPath = path.join(root, '.codegraph', 'graph.db');
26+
27+
// Load the embedder module via a URL resolved relative to this file.
const { buildEmbeddings, MODELS, searchData } = await import(
28+
new URL('../src/embedder.js', import.meta.url).href
29+
);
30+
31+
// Redirect console.log to stderr so only JSON goes to stdout
32+
// The original function is kept so it can be restored before the final JSON is printed.
const origLog = console.log;
33+
console.log = (...args) => console.error(...args);
34+
35+
// Matches file paths containing `.test.`, `.spec.`, `__test__`, `__tests__` or `.stories.`;
// loadSymbols() drops every row whose `file` matches.
const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./;
36+
37+
/**
 * Turn a code identifier into a lowercase, space-separated search query.
 *
 * Word boundaries are inserted:
 *   - between a lowercase letter or digit and a following capital
 *     (`fooBar` → `foo bar`, `base64Encode` → `base64 encode`);
 *   - between an acronym and the next capitalised word
 *     (`XMLHttpRequest` → `xml http request`);
 *   - at every run of `_` or `-` (`load_symbols` → `load symbols`).
 *
 * @param {string} name - Identifier to split.
 * @returns {string} Lowercased words separated by single spaces, with no
 *   leading or trailing whitespace.
 */
function splitIdentifier(name) {
  // Digits are included in the left-hand class so that `base64Encode` or
  // `parseV2Config` do not collapse into a single unsearchable token.
  const withHumpBreaks = name.replace(/([a-z0-9])([A-Z])/g, '$1 $2');
  // Split a run of capitals from the capitalised word that follows it.
  const withAcronymBreaks = withHumpBreaks.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
  return withAcronymBreaks
    .replace(/[_-]+/g, ' ')
    .trim()
    .toLowerCase();
}
45+
46+
/**
 * Load the symbols used as the benchmark corpus from the graph database.
 *
 * Reads every `function`, `method` and `class` node ordered by file and line,
 * drops rows whose file path matches TEST_PATTERN, keeps only the first
 * occurrence of each symbol name, and skips symbols whose generated query is
 * shorter than 4 characters.
 *
 * @returns {Array<{name: string, kind: string, file: string, query: string}>}
 *   One entry per unique symbol name, with `query` produced by splitIdentifier().
 * @throws Whatever the database driver throws when the database cannot be
 *   opened or queried; the handle is closed before the error propagates.
 */
function loadSymbols() {
  const db = new Database(dbPath, { readonly: true });
  let rows;
  try {
    rows = db
      .prepare(
        `SELECT name, kind, file FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`,
      )
      .all();
  } finally {
    // Release the handle even when the query throws (e.g. missing table).
    db.close();
  }

  const seen = new Set();
  const symbols = [];
  for (const { name, kind, file } of rows) {
    if (TEST_PATTERN.test(file)) continue;
    if (seen.has(name)) continue;
    // Mark the name as seen before the length check so later duplicates of a
    // too-short name are skipped as well.
    seen.add(name);
    const query = splitIdentifier(name);
    if (query.length < 4) continue;
    symbols.push({ name, kind, file, query });
  }
  return symbols;
}
68+
69+
/**
 * Benchmark a single embedding model against the symbol corpus.
 *
 * Rebuilds embeddings for `modelKey` with the `structured` strategy, then
 * searches for each symbol's query (top 10 results, minScore 0.01) and records
 * the 1-based rank at which the symbol's own name appears.
 *
 * @param {string} modelKey - Key into MODELS identifying the model.
 * @param {Array<{name: string, query: string}>} symbols - Corpus from loadSymbols().
 * @returns {Promise<object>} Model `dim` and `contextWindow`, hit counts at
 *   ranks 1/3/5/10, `misses` (total minus hits10), `total`, and the embedding
 *   and search durations in whole milliseconds.
 */
async function benchmarkModel(modelKey, symbols) {
  // Phase 1: build embeddings and time the build.
  const embedBegin = performance.now();
  await buildEmbeddings(root, modelKey, dbPath, { strategy: 'structured' });
  const embedTimeMs = Math.round(performance.now() - embedBegin);

  // Phase 2: run one search per symbol and tally hits per rank cutoff.
  const cutoffs = [1, 3, 5, 10];
  const hitsAt = new Map(cutoffs.map((cutoff) => [cutoff, 0]));

  const searchBegin = performance.now();
  for (const symbol of symbols) {
    const data = await searchData(symbol.query, dbPath, { minScore: 0.01, limit: 10 });
    if (!data) continue;

    const position = data.results.map((result) => result.name).indexOf(symbol.name);
    if (position === -1) continue;

    const rank = position + 1;
    for (const cutoff of cutoffs) {
      if (rank <= cutoff) hitsAt.set(cutoff, hitsAt.get(cutoff) + 1);
    }
  }
  const searchTimeMs = Math.round(performance.now() - searchBegin);

  const total = symbols.length;
  const hits10 = hitsAt.get(10);
  return {
    dim: MODELS[modelKey].dim,
    contextWindow: MODELS[modelKey].contextWindow,
    hits1: hitsAt.get(1),
    hits3: hitsAt.get(3),
    hits5: hitsAt.get(5),
    hits10,
    misses: total - hits10,
    total,
    embedTimeMs,
    searchTimeMs,
  };
}
107+
108+
// ── Run benchmarks ──────────────────────────────────────────────────────
109+
110+
// Load the corpus once; every model is benchmarked against the same symbols.
const symbols = loadSymbols();
console.error(`Loaded ${symbols.length} symbols for benchmark`);

const hasHfToken = !!process.env.HF_TOKEN;
const modelKeys = Object.keys(MODELS);
const results = {};
// Models that were attempted but threw; used only to decide the exit code.
const failedModels = [];

for (const key of modelKeys) {
  // jina-code is a gated model and cannot be used without a token.
  if (key === 'jina-code' && !hasHfToken) {
    console.error(`Skipping ${key} (HF_TOKEN not set)`);
    continue;
  }

  console.error(`\nBenchmarking model: ${key}...`);
  try {
    results[key] = await benchmarkModel(key, symbols);
    const r = results[key];
    console.error(
      ` Hit@1=${r.hits1}/${r.total} Hit@3=${r.hits3}/${r.total} Hit@5=${r.hits5}/${r.total} misses=${r.misses}`,
    );
  } catch (err) {
    // Best effort: one failing model must not abort the remaining ones.
    failedModels.push(key);
    console.error(` FAILED: ${err?.message ?? err}`);
  }
}

// Restore console.log for JSON output
console.log = origLog;

const output = {
  version: pkg.version,
  date: new Date().toISOString().slice(0, 10),
  strategy: 'structured',
  symbols: symbols.length,
  models: results,
};

console.log(JSON.stringify(output, null, 2));

// When every attempted model failed, the JSON above has an empty `models`
// object. Signal that through the exit code so a caller that discards stderr
// does not mistake the run for a successful benchmark.
if (failedModels.length > 0 && Object.keys(results).length === 0) {
  console.error(`All attempted models failed: ${failedModels.join(', ')}`);
  process.exitCode = 1;
}

scripts/update-benchmark-report.js

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
/**
44
* Update benchmark report — reads benchmark JSON and updates:
5-
* 1. generated/BENCHMARKS.md (historical table + raw JSON in HTML comment)
5+
* 1. generated/BUILD-BENCHMARKS.md (historical table + raw JSON in HTML comment)
66
* 2. README.md (performance section with latest numbers)
77
*
88
* Usage:
@@ -28,10 +28,10 @@ if (arg) {
2828
const entry = JSON.parse(jsonText);
2929

3030
// ── Paths ────────────────────────────────────────────────────────────────
31-
const benchmarkPath = path.join(root, 'generated', 'BENCHMARKS.md');
31+
const benchmarkPath = path.join(root, 'generated', 'BUILD-BENCHMARKS.md');
3232
const readmePath = path.join(root, 'README.md');
3333

34-
// ── Load existing history from BENCHMARKS.md ─────────────────────────────
34+
// ── Load existing history from BUILD-BENCHMARKS.md ─────────────────────────────
3535
let history = [];
3636
if (fs.existsSync(benchmarkPath)) {
3737
const content = fs.readFileSync(benchmarkPath, 'utf8');
@@ -96,7 +96,7 @@ function engineRow(h, prev, engineKey) {
9696
);
9797
}
9898

99-
// ── Build BENCHMARKS.md ──────────────────────────────────────────────────
99+
// ── Build BUILD-BENCHMARKS.md ──────────────────────────────────────────────────
100100
let md = '# Codegraph Performance Benchmarks\n\n';
101101
md += 'Self-measured on every release by running codegraph on its own codebase.\n';
102102
md += 'Metrics are normalized per file for cross-version comparability.\n\n';
@@ -177,7 +177,7 @@ if (fs.existsSync(readmePath)) {
177177

178178
const perfSection = `## 📊 Performance
179179
180-
Self-measured on every release via CI ([full history](generated/BENCHMARKS.md)):
180+
Self-measured on every release via CI ([build benchmarks](generated/BUILD-BENCHMARKS.md) | [embedding benchmarks](generated/EMBEDDING-BENCHMARKS.md)):
181181
182182
| Metric | Latest |
183183
|---|---|

0 commit comments

Comments
 (0)