Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 111 additions & 6 deletions crates/codegraph-core/src/file_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

use crate::parser_registry::LanguageKind;
use globset::{Glob, GlobSet, GlobSetBuilder};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet, VecDeque};
use std::path::Path;
use std::sync::{Arc, Mutex, OnceLock};

/// Default directories to ignore (mirrors `IGNORE_DIRS` in `src/shared/constants.ts`).
const DEFAULT_IGNORE_DIRS: &[&str] = &[
Expand Down Expand Up @@ -45,14 +46,86 @@ pub struct CollectResult {
pub directories: HashSet<String>,
}

/// Max entries held in the compiled-glob cache. Config pattern sets are small
/// and stable; the cap guards against unbounded growth if a caller churns
/// through many distinct lists.
const COMPILE_CACHE_MAX: usize = 32;

/// FIFO cache of compiled `GlobSet`s, keyed by the raw pattern list.
///
/// Long-running hosts (watch mode, MCP server) invoke `collect_files`
/// repeatedly with the same config. Reusing compiled `GlobSet`s across
/// those invocations avoids paying the parse cost on every rebuild.
struct GlobCache {
// `VecDeque` gives O(1) `pop_front` for FIFO eviction; `Vec::remove(0)`
// would shift every remaining element on each eviction.
order: VecDeque<Vec<String>>,
map: HashMap<Vec<String>, Arc<GlobSet>>,
}

impl GlobCache {
fn new() -> Self {
Self {
order: VecDeque::new(),
map: HashMap::new(),
}
}

fn get(&self, key: &[String]) -> Option<Arc<GlobSet>> {
self.map.get(key).cloned()
}

fn insert(&mut self, key: Vec<String>, value: Arc<GlobSet>) {
if self.map.contains_key(&key) {
self.map.insert(key, value);
return;
}
if self.map.len() >= COMPILE_CACHE_MAX {
if let Some(oldest) = self.order.pop_front() {
self.map.remove(&oldest);
}
}
self.order.push_back(key.clone());
self.map.insert(key, value);
}
Comment on lines +75 to +90
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 FIFO eviction with Vec::remove(0) is O(n)

self.order.remove(0) shifts every remaining element left on each eviction. At cap=32 the cost is negligible, but a VecDeque would give O(1) pop_front at no extra complexity and makes the intent clearer.

Fix in Claude Code

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 8922af4 — switched order from Vec<Vec<String>> to VecDeque<Vec<String>> and replaced remove(0)/push with pop_front/push_back. O(1) eviction and the intent reads more clearly. cargo test -p codegraph-core file_collector still passes all 8 tests (including the 2 memoization tests).


#[cfg(test)]
fn clear(&mut self) {
self.order.clear();
self.map.clear();
}
}

fn glob_cache() -> &'static Mutex<GlobCache> {
static CACHE: OnceLock<Mutex<GlobCache>> = OnceLock::new();
CACHE.get_or_init(|| Mutex::new(GlobCache::new()))
}

/// Clear the compiled-glob cache. Exposed for tests; production hosts
/// don't need to invalidate because the cache is keyed on exact pattern
/// contents (a changed config produces a fresh key).
#[cfg(test)]
pub(crate) fn clear_glob_cache() {
if let Ok(mut cache) = glob_cache().lock() {
cache.clear();
}
}

/// Compile a list of glob patterns into a `GlobSet`.
///
/// Invalid patterns are logged via `eprintln!` and skipped so a single bad
/// entry in config can't take down the whole build.
fn build_glob_set(patterns: &[String]) -> Option<GlobSet> {
/// entry in config can't take down the whole build. Results are memoized
/// by pattern content so repeated `collect_files` invocations in the same
/// process share the compiled set.
fn build_glob_set(patterns: &[String]) -> Option<Arc<GlobSet>> {
if patterns.is_empty() {
return None;
}
if let Ok(cache) = glob_cache().lock() {
if let Some(set) = cache.get(patterns) {
return Some(set);
}
}
let mut builder = GlobSetBuilder::new();
let mut added = 0usize;
for p in patterns {
Expand All @@ -70,7 +143,13 @@ fn build_glob_set(patterns: &[String]) -> Option<GlobSet> {
return None;
}
match builder.build() {
Ok(set) => Some(set),
Ok(set) => {
let arc = Arc::new(set);
if let Ok(mut cache) = glob_cache().lock() {
cache.insert(patterns.to_vec(), arc.clone());
}
Some(arc)
}
Err(e) => {
// Failing to build the GlobSet disables *all* include/exclude
// filters, which silently changes what files the build sees.
Expand Down Expand Up @@ -185,7 +264,7 @@ pub fn collect_files(
.and_then(|p| p.to_str())
.map(|s| s.replace('\\', "/"))
.unwrap_or_else(|| normalize_path(path));
if !passes_include_exclude(&rel, include_set.as_ref(), exclude_set.as_ref()) {
if !passes_include_exclude(&rel, include_set.as_deref(), exclude_set.as_deref()) {
continue;
}
}
Expand Down Expand Up @@ -237,7 +316,7 @@ pub fn try_fast_collect(
for rel_path in &file_set {
if has_filters {
let norm = rel_path.replace('\\', "/");
if !passes_include_exclude(&norm, include_set.as_ref(), exclude_set.as_ref()) {
if !passes_include_exclude(&norm, include_set.as_deref(), exclude_set.as_deref()) {
continue;
}
}
Expand Down Expand Up @@ -380,6 +459,32 @@ mod tests {
assert!(names.contains("d.ts"));
}

#[test]
fn build_glob_set_memoizes_identical_pattern_lists() {
// Guards the performance optimization: long-running hosts (watch mode,
// MCP server) must reuse the compiled GlobSet across buildGraph calls
// instead of recompiling from scratch every time.
clear_glob_cache();
let patterns = vec!["src/**/*.ts".to_string(), "**/*.test.ts".to_string()];
let first = build_glob_set(&patterns).expect("compiles");
let second = build_glob_set(&patterns).expect("compiles");
assert!(
Arc::ptr_eq(&first, &second),
"repeated build_glob_set calls with the same patterns must return the cached Arc"
);
}

#[test]
fn build_glob_set_cache_distinguishes_different_lists() {
clear_glob_cache();
let a = build_glob_set(&["src/**/*.ts".to_string()]).expect("compiles");
let b = build_glob_set(&["src/**/*.js".to_string()]).expect("compiles");
assert!(
!Arc::ptr_eq(&a, &b),
"different pattern lists must get independent cache entries"
);
}

#[test]
fn fast_collect_honors_exclude_globs() {
let root = "/project";
Expand Down
12 changes: 6 additions & 6 deletions src/domain/graph/builder/helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ export function collectFiles(
directories: Set<string>,
_visited?: Set<string>,
_rootDir?: string,
_includeRegexes?: RegExp[],
_excludeRegexes?: RegExp[],
_includeRegexes?: readonly RegExp[],
_excludeRegexes?: readonly RegExp[],
): { files: string[]; directories: Set<string> };
export function collectFiles(
dir: string,
Expand All @@ -100,8 +100,8 @@ export function collectFiles(
directories?: null,
_visited?: Set<string>,
_rootDir?: string,
_includeRegexes?: RegExp[],
_excludeRegexes?: RegExp[],
_includeRegexes?: readonly RegExp[],
_excludeRegexes?: readonly RegExp[],
): string[];
export function collectFiles(
dir: string,
Expand All @@ -110,8 +110,8 @@ export function collectFiles(
directories: Set<string> | null = null,
_visited: Set<string> = new Set(),
_rootDir?: string,
_includeRegexes?: RegExp[],
_excludeRegexes?: RegExp[],
_includeRegexes?: readonly RegExp[],
_excludeRegexes?: readonly RegExp[],
): string[] | { files: string[]; directories: Set<string> } {
const trackDirs = directories instanceof Set;
let hasFiles = false;
Expand Down
45 changes: 42 additions & 3 deletions src/shared/globs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,33 @@ export function globToRegex(pattern: string): RegExp {
return new RegExp(`^${re}$`);
}

const EMPTY_REGEX_LIST: readonly RegExp[] = Object.freeze([]) as readonly RegExp[];

// Compile results are cached by pattern content so a long-running host
// (watch mode, MCP server) doesn't recompile on every buildGraph call.
// Capped to avoid unbounded growth when callers pass many distinct lists.
const COMPILE_CACHE_MAX = 32;
const compileCache = new Map<string, readonly RegExp[]>();

function buildCacheKey(patterns: readonly string[]): string {
// JSON.stringify avoids ambiguity when patterns legitimately contain any
// single character (including control characters or separators a caller
// might choose): ["a", "bc"] → '["a","bc"]' vs ["ab", "c"] → '["ab","c"]'.
return JSON.stringify(patterns);
}

/**
* Compile a list of glob patterns. Invalid / empty patterns are skipped.
*
* Results are memoized per pattern-content so repeated `buildGraph` calls
* with the same include/exclude lists reuse the compiled regexes. The
* returned array is shared across callers and must not be mutated.
*/
export function compileGlobs(patterns: readonly string[] | undefined): RegExp[] {
if (!patterns || patterns.length === 0) return [];
export function compileGlobs(patterns: readonly string[] | undefined): readonly RegExp[] {
if (!patterns || patterns.length === 0) return EMPTY_REGEX_LIST;
const key = buildCacheKey(patterns);
const cached = compileCache.get(key);
if (cached) return cached;
const out: RegExp[] = [];
for (const p of patterns) {
if (typeof p !== 'string' || p.length === 0) continue;
Expand All @@ -66,7 +88,24 @@ export function compileGlobs(patterns: readonly string[] | undefined): RegExp[]
// Ignore malformed patterns rather than failing the whole build.
}
}
return out;
const frozen = Object.freeze(out) as readonly RegExp[];
if (compileCache.size >= COMPILE_CACHE_MAX) {
// FIFO eviction — Map iterates insertion order. Config pattern sets
// are small and stable, so a simple cap is sufficient.
const first = compileCache.keys().next().value;
if (first !== undefined) compileCache.delete(first);
}
compileCache.set(key, frozen);
return frozen;
}

/**
* Clear the compiled-glob cache. Intended for long-running hosts that
* need to reload config (e.g. watch mode after `.codegraphrc.json` edits)
* and for test isolation.
*/
export function clearGlobCache(): void {
compileCache.clear();
}

/**
Expand Down
68 changes: 68 additions & 0 deletions tests/unit/globs.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/**
* Unit tests for src/shared/globs.ts — covers the memoization added so
* long-running hosts (watch mode, MCP server) don't recompile include/exclude
* globs on every buildGraph call.
*/
import { afterEach, describe, expect, it } from 'vitest';
import { clearGlobCache, compileGlobs } from '../../src/shared/globs.js';

describe('compileGlobs', () => {
afterEach(() => {
clearGlobCache();
});

it('returns an empty list for undefined or empty input', () => {
expect(compileGlobs(undefined)).toHaveLength(0);
expect(compileGlobs([])).toHaveLength(0);
});

it('compiles each pattern to a RegExp anchored with ^…$', () => {
const [inc, exc] = [compileGlobs(['src/**/*.ts']), compileGlobs(['**/*.test.ts'])];
expect(inc).toHaveLength(1);
expect(exc).toHaveLength(1);
expect(inc[0]!.test('src/foo/bar.ts')).toBe(true);
expect(inc[0]!.test('tests/foo.ts')).toBe(false);
expect(exc[0]!.test('src/foo/bar.test.ts')).toBe(true);
expect(exc[0]!.test('src/foo/bar.ts')).toBe(false);
});

it('reuses the same array for identical pattern lists (memoization)', () => {
const first = compileGlobs(['src/**/*.ts', '**/*.test.ts']);
const second = compileGlobs(['src/**/*.ts', '**/*.test.ts']);
// Repeated calls with equivalent content return the exact same array — this is
// what makes repeated buildGraph invocations in a long-running host cheap.
expect(second).toBe(first);
});

it('treats different pattern lists as distinct cache entries', () => {
const a = compileGlobs(['src/**/*.ts']);
const b = compileGlobs(['src/**/*.js']);
expect(a).not.toBe(b);
expect(a[0]!.test('src/x.ts')).toBe(true);
expect(b[0]!.test('src/x.js')).toBe(true);
expect(a[0]!.test('src/x.js')).toBe(false);
});

it('treats different pattern orderings as distinct cache entries', () => {
// Order matters for glob matching semantics (even if results are the same
// set here, callers may rely on iteration order elsewhere). Two lists with
// the same patterns in different orders get independent cache slots.
const a = compileGlobs(['a/**', 'b/**']);
const b = compileGlobs(['b/**', 'a/**']);
expect(a).not.toBe(b);
});

it('returns a frozen array to prevent accidental mutation of cached values', () => {
const patterns = compileGlobs(['src/**/*.ts']);
expect(Object.isFrozen(patterns)).toBe(true);
});

it('clearGlobCache drops memoized entries', () => {
const before = compileGlobs(['src/**/*.ts']);
clearGlobCache();
const after = compileGlobs(['src/**/*.ts']);
// Same content, but cleared cache forces a fresh compilation → different
// array identity. Test guards against silent cache leaks across suites.
expect(after).not.toBe(before);
});
});
Loading