From bc81ce7c8811220238c8871fd39744f28c8b24f9 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 22 Apr 2026 23:53:57 -0600 Subject: [PATCH 1/3] perf(globs): memoize compiled include/exclude globs per build Long-running hosts (watch mode, MCP server) invoke buildGraph repeatedly with the same config. Previously each invocation recompiled the include/exclude patterns from scratch. Memoize by pattern content so the compiled regex list (TS) / GlobSet (Rust) is reused across calls. Both sides use a FIFO cache capped at 32 entries with a clear() hook for tests and config-reload scenarios. Closes #1000 --- crates/codegraph-core/src/file_collector.rs | 114 ++++++++++++++++++-- src/domain/graph/builder/helpers.ts | 12 +-- src/shared/globs.ts | 43 +++++++- tests/unit/globs.test.ts | 68 ++++++++++++ 4 files changed, 222 insertions(+), 15 deletions(-) create mode 100644 tests/unit/globs.test.ts diff --git a/crates/codegraph-core/src/file_collector.rs b/crates/codegraph-core/src/file_collector.rs index 1ba57c83..42ea6222 100644 --- a/crates/codegraph-core/src/file_collector.rs +++ b/crates/codegraph-core/src/file_collector.rs @@ -6,8 +6,9 @@ use crate::parser_registry::LanguageKind; use globset::{Glob, GlobSet, GlobSetBuilder}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::path::Path; +use std::sync::{Arc, Mutex, OnceLock}; /// Default directories to ignore (mirrors `IGNORE_DIRS` in `src/shared/constants.ts`). const DEFAULT_IGNORE_DIRS: &[&str] = &[ @@ -45,14 +46,83 @@ pub struct CollectResult { pub directories: HashSet, } +/// Max entries held in the compiled-glob cache. Config pattern sets are small +/// and stable; the cap guards against unbounded growth if a caller churns +/// through many distinct lists. +const COMPILE_CACHE_MAX: usize = 32; + +/// FIFO cache of compiled `GlobSet`s, keyed by the raw pattern list. +/// +/// Long-running hosts (watch mode, MCP server) invoke `collect_files` +/// repeatedly with the same config. Reusing compiled `GlobSet`s across +/// those invocations avoids paying the parse cost on every rebuild. +struct GlobCache { + order: Vec>, + map: HashMap, Arc>, +} + +impl GlobCache { + fn new() -> Self { + Self { + order: Vec::new(), + map: HashMap::new(), + } + } + + fn get(&self, key: &[String]) -> Option> { + self.map.get(key).cloned() + } + + fn insert(&mut self, key: Vec, value: Arc) { + if self.map.contains_key(&key) { + self.map.insert(key, value); + return; + } + if self.map.len() >= COMPILE_CACHE_MAX && !self.order.is_empty() { + let oldest = self.order.remove(0); + self.map.remove(&oldest); + } + self.order.push(key.clone()); + self.map.insert(key, value); + } + + #[cfg(test)] + fn clear(&mut self) { + self.order.clear(); + self.map.clear(); + } +} + +fn glob_cache() -> &'static Mutex { + static CACHE: OnceLock> = OnceLock::new(); + CACHE.get_or_init(|| Mutex::new(GlobCache::new())) +} + +/// Clear the compiled-glob cache. Exposed for tests; production hosts +/// don't need to invalidate because the cache is keyed on exact pattern +/// contents (a changed config produces a fresh key). +#[cfg(test)] +pub(crate) fn clear_glob_cache() { + if let Ok(mut cache) = glob_cache().lock() { + cache.clear(); + } +} + /// Compile a list of glob patterns into a `GlobSet`. /// /// Invalid patterns are logged via `eprintln!` and skipped so a single bad -/// entry in config can't take down the whole build. -fn build_glob_set(patterns: &[String]) -> Option { +/// entry in config can't take down the whole build. Results are memoized +/// by pattern content so repeated `collect_files` invocations in the same +/// process share the compiled set. +fn build_glob_set(patterns: &[String]) -> Option> { if patterns.is_empty() { return None; } + if let Ok(cache) = glob_cache().lock() { + if let Some(set) = cache.get(patterns) { + return Some(set); + } + } let mut builder = GlobSetBuilder::new(); let mut added = 0usize; for p in patterns { @@ -70,7 +140,13 @@ fn build_glob_set(patterns: &[String]) -> Option { return None; } match builder.build() { - Ok(set) => Some(set), + Ok(set) => { + let arc = Arc::new(set); + if let Ok(mut cache) = glob_cache().lock() { + cache.insert(patterns.to_vec(), arc.clone()); + } + Some(arc) + } Err(e) => { // Failing to build the GlobSet disables *all* include/exclude // filters, which silently changes what files the build sees. @@ -185,7 +261,7 @@ pub fn collect_files( .and_then(|p| p.to_str()) .map(|s| s.replace('\\', "/")) .unwrap_or_else(|| normalize_path(path)); - if !passes_include_exclude(&rel, include_set.as_ref(), exclude_set.as_ref()) { + if !passes_include_exclude(&rel, include_set.as_deref(), exclude_set.as_deref()) { continue; } } @@ -237,7 +313,7 @@ pub fn try_fast_collect( for rel_path in &file_set { if has_filters { let norm = rel_path.replace('\\', "/"); - if !passes_include_exclude(&norm, include_set.as_ref(), exclude_set.as_ref()) { + if !passes_include_exclude(&norm, include_set.as_deref(), exclude_set.as_deref()) { continue; } } @@ -380,6 +456,32 @@ mod tests { assert!(names.contains("d.ts")); } + #[test] + fn build_glob_set_memoizes_identical_pattern_lists() { + // Guards the performance optimization: long-running hosts (watch mode, + // MCP server) must reuse the compiled GlobSet across buildGraph calls + // instead of recompiling from scratch every time. + clear_glob_cache(); + let patterns = vec!["src/**/*.ts".to_string(), "**/*.test.ts".to_string()]; + let first = build_glob_set(&patterns).expect("compiles"); + let second = build_glob_set(&patterns).expect("compiles"); + assert!( + Arc::ptr_eq(&first, &second), + "repeated build_glob_set calls with the same patterns must return the cached Arc" + ); + } + + #[test] + fn build_glob_set_cache_distinguishes_different_lists() { + clear_glob_cache(); + let a = build_glob_set(&["src/**/*.ts".to_string()]).expect("compiles"); + let b = build_glob_set(&["src/**/*.js".to_string()]).expect("compiles"); + assert!( + !Arc::ptr_eq(&a, &b), + "different pattern lists must get independent cache entries" + ); + } + #[test] fn fast_collect_honors_exclude_globs() { let root = "/project"; diff --git a/src/domain/graph/builder/helpers.ts b/src/domain/graph/builder/helpers.ts index 6f57b705..7ab16b63 100644 --- a/src/domain/graph/builder/helpers.ts +++ b/src/domain/graph/builder/helpers.ts @@ -90,8 +90,8 @@ export function collectFiles( directories: Set, _visited?: Set, _rootDir?: string, - _includeRegexes?: RegExp[], - _excludeRegexes?: RegExp[], + _includeRegexes?: readonly RegExp[], + _excludeRegexes?: readonly RegExp[], ): { files: string[]; directories: Set }; export function collectFiles( dir: string, @@ -100,8 +100,8 @@ export function collectFiles( directories?: null, _visited?: Set, _rootDir?: string, - _includeRegexes?: RegExp[], - _excludeRegexes?: RegExp[], + _includeRegexes?: readonly RegExp[], + _excludeRegexes?: readonly RegExp[], ): string[]; export function collectFiles( dir: string, @@ -110,8 +110,8 @@ export function collectFiles( directories: Set | null = null, _visited: Set = new Set(), _rootDir?: string, - _includeRegexes?: RegExp[], - _excludeRegexes?: RegExp[], + _includeRegexes?: readonly RegExp[], + _excludeRegexes?: readonly RegExp[], ): string[] | { files: string[]; directories: Set } { const trackDirs = directories instanceof Set; let hasFiles = false; diff --git a/src/shared/globs.ts b/src/shared/globs.ts index 9cd6c786..8ce9fb7e 100644 --- a/src/shared/globs.ts +++ b/src/shared/globs.ts @@ -52,11 +52,31 @@ export function globToRegex(pattern: string): RegExp { return new RegExp(`^${re}$`); } +const EMPTY_REGEX_LIST: readonly RegExp[] = Object.freeze([]) as readonly RegExp[]; + +// Compile results are cached by pattern content so a long-running host +// (watch mode, MCP server) doesn't recompile on every buildGraph call. +// Capped to avoid unbounded growth when callers pass many distinct lists. +const COMPILE_CACHE_MAX = 32; +const GLOB_KEY_SEP = '\x1f'; +const compileCache = new Map(); + +function buildCacheKey(patterns: readonly string[]): string { + return patterns.join(GLOB_KEY_SEP); +} + /** * Compile a list of glob patterns. Invalid / empty patterns are skipped. + * + * Results are memoized per pattern-content so repeated `buildGraph` calls + * with the same include/exclude lists reuse the compiled regexes. The + * returned array is shared across callers and must not be mutated. */ -export function compileGlobs(patterns: readonly string[] | undefined): RegExp[] { - if (!patterns || patterns.length === 0) return []; +export function compileGlobs(patterns: readonly string[] | undefined): readonly RegExp[] { + if (!patterns || patterns.length === 0) return EMPTY_REGEX_LIST; + const key = buildCacheKey(patterns); + const cached = compileCache.get(key); + if (cached) return cached; const out: RegExp[] = []; for (const p of patterns) { if (typeof p !== 'string' || p.length === 0) continue; @@ -66,7 +86,24 @@ export function compileGlobs(patterns: readonly string[] | undefined): RegExp[] // Ignore malformed patterns rather than failing the whole build. } } - return out; + const frozen = Object.freeze(out) as readonly RegExp[]; + if (compileCache.size >= COMPILE_CACHE_MAX) { + // FIFO eviction — Map iterates insertion order. Config pattern sets + // are small and stable, so a simple cap is sufficient. + const first = compileCache.keys().next().value; + if (first !== undefined) compileCache.delete(first); + } + compileCache.set(key, frozen); + return frozen; +} + +/** + * Clear the compiled-glob cache. Intended for long-running hosts that + * need to reload config (e.g. watch mode after `.codegraphrc.json` edits) + * and for test isolation. + */ +export function clearGlobCache(): void { + compileCache.clear(); } /** diff --git a/tests/unit/globs.test.ts b/tests/unit/globs.test.ts new file mode 100644 index 00000000..823d8b17 --- /dev/null +++ b/tests/unit/globs.test.ts @@ -0,0 +1,68 @@ +/** + * Unit tests for src/shared/globs.ts — covers the memoization added so + * long-running hosts (watch mode, MCP server) don't recompile include/exclude + * globs on every buildGraph call. + */ +import { afterEach, describe, expect, it } from 'vitest'; +import { clearGlobCache, compileGlobs } from '../../src/shared/globs.js'; + +describe('compileGlobs', () => { + afterEach(() => { + clearGlobCache(); + }); + + it('returns an empty list for undefined or empty input', () => { + expect(compileGlobs(undefined)).toHaveLength(0); + expect(compileGlobs([])).toHaveLength(0); + }); + + it('compiles each pattern to a RegExp anchored with ^…$', () => { + const [inc, exc] = [compileGlobs(['src/**/*.ts']), compileGlobs(['**/*.test.ts'])]; + expect(inc).toHaveLength(1); + expect(exc).toHaveLength(1); + expect(inc[0]!.test('src/foo/bar.ts')).toBe(true); + expect(inc[0]!.test('tests/foo.ts')).toBe(false); + expect(exc[0]!.test('src/foo/bar.test.ts')).toBe(true); + expect(exc[0]!.test('src/foo/bar.ts')).toBe(false); + }); + + it('reuses the same array for identical pattern lists (memoization)', () => { + const first = compileGlobs(['src/**/*.ts', '**/*.test.ts']); + const second = compileGlobs(['src/**/*.ts', '**/*.test.ts']); + // Repeated calls with equivalent content return the exact same array — this is + // what makes repeated buildGraph invocations in a long-running host cheap. + expect(second).toBe(first); + }); + + it('treats different pattern lists as distinct cache entries', () => { + const a = compileGlobs(['src/**/*.ts']); + const b = compileGlobs(['src/**/*.js']); + expect(a).not.toBe(b); + expect(a[0]!.test('src/x.ts')).toBe(true); + expect(b[0]!.test('src/x.js')).toBe(true); + expect(a[0]!.test('src/x.js')).toBe(false); + }); + + it('treats different pattern orderings as distinct cache entries', () => { + // Order matters for glob matching semantics (even if results are the same + // set here, callers may rely on iteration order elsewhere). Two lists with + // the same patterns in different orders get independent cache slots. + const a = compileGlobs(['a/**', 'b/**']); + const b = compileGlobs(['b/**', 'a/**']); + expect(a).not.toBe(b); + }); + + it('returns a frozen array to prevent accidental mutation of cached values', () => { + const patterns = compileGlobs(['src/**/*.ts']); + expect(Object.isFrozen(patterns)).toBe(true); + }); + + it('clearGlobCache drops memoized entries', () => { + const before = compileGlobs(['src/**/*.ts']); + clearGlobCache(); + const after = compileGlobs(['src/**/*.ts']); + // Same content, but cleared cache forces a fresh compilation → different + // array identity. Test guards against silent cache leaks across suites. + expect(after).not.toBe(before); + }); +}); From e83741d8f9a0fc2f5807af7ce5efccb2afac9024 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 23 Apr 2026 00:02:57 -0600 Subject: [PATCH 2/3] fix(globs): use JSON.stringify for unambiguous cache keys (#1005) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile flagged that joining patterns with \x1f could alias distinct lists whose contents contain that separator. JSON.stringify is unambiguous and equally cheap — each pattern is quoted and comma-delimited so no two lists share a key. Impact: 1 functions changed, 4 affected --- src/shared/globs.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/shared/globs.ts b/src/shared/globs.ts index 8ce9fb7e..5f1e38e0 100644 --- a/src/shared/globs.ts +++ b/src/shared/globs.ts @@ -58,11 +58,13 @@ const EMPTY_REGEX_LIST: readonly RegExp[] = Object.freeze([]) as readonly RegExp // (watch mode, MCP server) doesn't recompile on every buildGraph call. // Capped to avoid unbounded growth when callers pass many distinct lists. const COMPILE_CACHE_MAX = 32; -const GLOB_KEY_SEP = '\x1f'; const compileCache = new Map(); function buildCacheKey(patterns: readonly string[]): string { - return patterns.join(GLOB_KEY_SEP); + // JSON.stringify avoids ambiguity when patterns legitimately contain any + // single character (including control characters or separators a caller + // might choose): ["a", "bc"] → '["a","bc"]' vs ["ab", "c"] → '["ab","c"]'. + return JSON.stringify(patterns); } /** From 8922af4225de36df6e8a4294e1788b6d1ce6aebb Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 23 Apr 2026 00:03:09 -0600 Subject: [PATCH 3/3] fix(globs): use VecDeque for O(1) FIFO eviction (#1005) Greptile noted that Vec::remove(0) shifts every remaining element on each eviction. VecDeque::pop_front is O(1) and better communicates FIFO intent. At cap=32 the practical difference is negligible, but the cleaner semantics are worth the trivial change. Impact: 2 functions changed, 0 affected --- crates/codegraph-core/src/file_collector.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/crates/codegraph-core/src/file_collector.rs b/crates/codegraph-core/src/file_collector.rs index 42ea6222..23177d7f 100644 --- a/crates/codegraph-core/src/file_collector.rs +++ b/crates/codegraph-core/src/file_collector.rs @@ -6,7 +6,7 @@ use crate::parser_registry::LanguageKind; use globset::{Glob, GlobSet, GlobSetBuilder}; -use std::collections::{HashMap, HashSet}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::path::Path; use std::sync::{Arc, Mutex, OnceLock}; @@ -57,14 +57,16 @@ const COMPILE_CACHE_MAX: usize = 32; /// repeatedly with the same config. Reusing compiled `GlobSet`s across /// those invocations avoids paying the parse cost on every rebuild. struct GlobCache { - order: Vec>, + // `VecDeque` gives O(1) `pop_front` for FIFO eviction; `Vec::remove(0)` + // would shift every remaining element on each eviction. + order: VecDeque>, map: HashMap, Arc>, } impl GlobCache { fn new() -> Self { Self { - order: Vec::new(), + order: VecDeque::new(), map: HashMap::new(), } } @@ -78,11 +80,12 @@ impl GlobCache { self.map.insert(key, value); return; } - if self.map.len() >= COMPILE_CACHE_MAX && !self.order.is_empty() { - let oldest = self.order.remove(0); - self.map.remove(&oldest); + if self.map.len() >= COMPILE_CACHE_MAX { + if let Some(oldest) = self.order.pop_front() { + self.map.remove(&oldest); + } } - self.order.push(key.clone()); + self.order.push_back(key.clone()); self.map.insert(key, value); }