Skip to content

Commit e2a771b

Browse files
fix: recompute Jaccard from total file counts during incremental co-change analysis
The incremental upsert was overwriting the stored Jaccard value with one computed only from the new commits, ignoring historical data. The analysis now stores per-file commit counts in a new file_commit_counts table (migration v6) and recomputes Jaccard from the accumulated totals after each merge.
1 parent aef1787 commit e2a771b

File tree

3 files changed

+55
-17
lines changed

3 files changed

+55
-17
lines changed

src/cochange.js

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ export function computeCoChanges(commits, opts = {}) {
124124
});
125125
}
126126

127-
return results;
127+
return { pairs: results, fileCommitCounts };
128128
}
129129

130130
/**
@@ -170,6 +170,7 @@ export function analyzeCoChanges(customDbPath, opts = {}) {
170170
if (opts.full) {
171171
db.exec('DELETE FROM co_changes');
172172
db.exec('DELETE FROM co_change_meta');
173+
db.exec('DELETE FROM file_commit_counts');
173174
}
174175

175176
// Collect known files from the graph for filtering
@@ -182,25 +183,53 @@ export function analyzeCoChanges(customDbPath, opts = {}) {
182183
}
183184

184185
const { commits } = scanGitHistory(repoRoot, { since, afterSha });
185-
const coChanges = computeCoChanges(commits, { minSupport, maxFilesPerCommit, knownFiles });
186+
const { pairs: coChanges, fileCommitCounts } = computeCoChanges(commits, {
187+
minSupport,
188+
maxFilesPerCommit,
189+
knownFiles,
190+
});
186191

187-
// Write results
188-
const upsert = db.prepare(`
192+
// Upsert per-file commit counts so Jaccard can be recomputed from totals
193+
const fileCountUpsert = db.prepare(`
194+
INSERT INTO file_commit_counts (file, commit_count) VALUES (?, ?)
195+
ON CONFLICT(file) DO UPDATE SET commit_count = commit_count + excluded.commit_count
196+
`);
197+
198+
// Upsert pair counts (accumulate commit_count, jaccard placeholder — recomputed below)
199+
const pairUpsert = db.prepare(`
189200
INSERT INTO co_changes (file_a, file_b, commit_count, jaccard, last_commit_epoch)
190-
VALUES (?, ?, ?, ?, ?)
201+
VALUES (?, ?, ?, 0, ?)
191202
ON CONFLICT(file_a, file_b) DO UPDATE SET
192203
commit_count = commit_count + excluded.commit_count,
193-
jaccard = excluded.jaccard,
194204
last_commit_epoch = MAX(co_changes.last_commit_epoch, excluded.last_commit_epoch)
195205
`);
196206

197-
const insertMany = db.transaction((pairs) => {
198-
for (const [key, data] of pairs) {
207+
const insertMany = db.transaction(() => {
208+
for (const [file, count] of fileCommitCounts) {
209+
fileCountUpsert.run(file, count);
210+
}
211+
for (const [key, data] of coChanges) {
199212
const [fileA, fileB] = key.split('\0');
200-
upsert.run(fileA, fileB, data.commitCount, data.jaccard, data.lastEpoch);
213+
pairUpsert.run(fileA, fileB, data.commitCount, data.lastEpoch);
201214
}
202215
});
203-
insertMany(coChanges);
216+
insertMany();
217+
218+
// Recompute Jaccard for all affected pairs from total file commit counts
219+
const affectedFiles = [...fileCommitCounts.keys()];
220+
if (affectedFiles.length > 0) {
221+
const ph = affectedFiles.map(() => '?').join(',');
222+
db.prepare(`
223+
UPDATE co_changes SET jaccard = (
224+
SELECT CAST(co_changes.commit_count AS REAL) / (
225+
COALESCE(fa.commit_count, 0) + COALESCE(fb.commit_count, 0) - co_changes.commit_count
226+
)
227+
FROM file_commit_counts fa, file_commit_counts fb
228+
WHERE fa.file = co_changes.file_a AND fb.file = co_changes.file_b
229+
)
230+
WHERE file_a IN (${ph}) OR file_b IN (${ph})
231+
`).run(...affectedFiles, ...affectedFiles);
232+
}
204233

205234
// Update metadata
206235
const metaUpsert = db.prepare(`

src/db.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,15 @@ export const MIGRATIONS = [
9292
);
9393
`,
9494
},
95+
{
96+
version: 6,
97+
up: `
98+
CREATE TABLE IF NOT EXISTS file_commit_counts (
99+
file TEXT PRIMARY KEY,
100+
commit_count INTEGER NOT NULL DEFAULT 0
101+
);
102+
`,
103+
},
95104
];
96105

97106
export function openDb(dbPath) {

tests/integration/cochange.test.js

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ describe('computeCoChanges', () => {
3232
];
3333
// a.js appears in 4 commits, b.js in 3, pair(a,b) = 3
3434
// jaccard(a,b) = 3 / (4 + 3 - 3) = 3/4 = 0.75
35-
const result = computeCoChanges(commits, { minSupport: 1 });
35+
const { pairs: result } = computeCoChanges(commits, { minSupport: 1 });
3636
const abKey = 'a.js\0b.js';
3737
expect(result.has(abKey)).toBe(true);
3838
expect(result.get(abKey).jaccard).toBeCloseTo(0.75);
@@ -45,7 +45,7 @@ describe('computeCoChanges', () => {
4545
{ sha: 'a2', epoch: 2000, files: ['a.js', 'b.js'] },
4646
{ sha: 'a3', epoch: 3000, files: ['a.js', 'c.js'] },
4747
];
48-
const result = computeCoChanges(commits, { minSupport: 3 });
48+
const { pairs: result } = computeCoChanges(commits, { minSupport: 3 });
4949
// pair(a,b) only has 2 co-occurrences, pair(a,c) only 1
5050
expect(result.size).toBe(0);
5151
});
@@ -57,7 +57,7 @@ describe('computeCoChanges', () => {
5757
{ sha: 'a3', epoch: 3000, files: ['a.js', 'b.js'] },
5858
{ sha: 'a4', epoch: 4000, files: ['a.js', 'b.js'] },
5959
];
60-
const result = computeCoChanges(commits, { minSupport: 3, maxFilesPerCommit: 3 });
60+
const { pairs: result } = computeCoChanges(commits, { minSupport: 3, maxFilesPerCommit: 3 });
6161
// First commit skipped (4 files > max 3)
6262
// pair(a,b) = 3 from commits a2,a3,a4; a appears in 3 commits, b in 3
6363
// jaccard = 3/(3+3-3) = 1.0
@@ -72,14 +72,14 @@ describe('computeCoChanges', () => {
7272
{ sha: 'a2', epoch: 2000, files: ['z.js', 'a.js'] },
7373
{ sha: 'a3', epoch: 3000, files: ['z.js', 'a.js'] },
7474
];
75-
const result = computeCoChanges(commits, { minSupport: 1 });
75+
const { pairs: result } = computeCoChanges(commits, { minSupport: 1 });
7676
// Should be stored as a.js < z.js
7777
expect(result.has('a.js\0z.js')).toBe(true);
7878
expect(result.has('z.js\0a.js')).toBe(false);
7979
});
8080

8181
test('empty input returns empty map', () => {
82-
const result = computeCoChanges([], { minSupport: 1 });
82+
const { pairs: result } = computeCoChanges([], { minSupport: 1 });
8383
expect(result.size).toBe(0);
8484
});
8585

@@ -89,7 +89,7 @@ describe('computeCoChanges', () => {
8989
{ sha: 'a2', epoch: 5000, files: ['a.js', 'b.js'] },
9090
{ sha: 'a3', epoch: 3000, files: ['a.js', 'b.js'] },
9191
];
92-
const result = computeCoChanges(commits, { minSupport: 1 });
92+
const { pairs: result } = computeCoChanges(commits, { minSupport: 1 });
9393
expect(result.get('a.js\0b.js').lastEpoch).toBe(5000);
9494
});
9595

@@ -100,7 +100,7 @@ describe('computeCoChanges', () => {
100100
{ sha: 'a3', epoch: 3000, files: ['a.js', 'b.js', 'c.js'] },
101101
];
102102
const knownFiles = new Set(['a.js', 'b.js']);
103-
const result = computeCoChanges(commits, { minSupport: 1, knownFiles });
103+
const { pairs: result } = computeCoChanges(commits, { minSupport: 1, knownFiles });
104104
expect(result.has('a.js\0b.js')).toBe(true);
105105
// c.js pairs should not exist
106106
expect(result.has('a.js\0c.js')).toBe(false);

0 commit comments

Comments
 (0)