From 3c4cedf9bed40292e3297f1202705f80c50693c0 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Tue, 21 Apr 2026 00:43:36 -0600 Subject: [PATCH 1/4] fix(config): honor include/exclude globs in file collection (#981) config.include and config.exclude were declared in DEFAULTS but never consumed by either engine, so users' glob filters in .codegraphrc.json had no effect. Both engines now compile the globs once and filter collected paths (relative to project root, forward-slash normalized) during initial walks and incremental fast-path rebuilds. - New src/shared/globs.ts with compileGlobs + matchesAny (extracted from features/boundaries.ts so the collector and boundaries share one implementation) - TS collector: passesIncludeExclude applied in collectFiles recursion and tryFastCollect so config changes take effect on incremental builds - Rust collector: globset-based filter wired through collect_files and try_fast_collect; BuildConfig gains include/exclude fields - Integration tests (wasm + native parity) cover exclude reject, include limit, combined filters, and empty-config default behavior Fixes #981 Impact: 19 functions changed, 20 affected --- crates/codegraph-core/Cargo.toml | 1 + crates/codegraph-core/src/build_pipeline.rs | 23 ++- crates/codegraph-core/src/config.rs | 17 ++ crates/codegraph-core/src/file_collector.rs | 171 ++++++++++++++++- src/domain/graph/builder/helpers.ts | 56 +++++- .../graph/builder/stages/collect-files.ts | 17 +- src/features/boundaries.ts | 29 +-- src/shared/globs.ts | 73 +++++++ .../config-include-exclude.test.ts | 181 ++++++++++++++++++ 9 files changed, 528 insertions(+), 40 deletions(-) create mode 100644 src/shared/globs.ts create mode 100644 tests/integration/config-include-exclude.test.ts diff --git a/crates/codegraph-core/Cargo.toml b/crates/codegraph-core/Cargo.toml index c7e41cfe..6205f150 100644 --- a/crates/codegraph-core/Cargo.toml +++ b/crates/codegraph-core/Cargo.toml @@ -37,6 
+37,7 @@ tree-sitter-haskell = "0.23" tree-sitter-ocaml = "0.24" rayon = "1" ignore = "0.4" +globset = "0.4" sha2 = "0.10" # `bundled` embeds a second SQLite copy (better-sqlite3 already bundles one). # This is intentional: Windows CI lacks a system SQLite, and WAL coordination diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index b1529984..bce637fc 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -549,15 +549,32 @@ fn collect_source_files( &db_files, &journal.changed, &journal.removed, + &config.include, + &config.exclude, ) } else { - file_collector::collect_files(root_dir, &config.ignore_dirs) + file_collector::collect_files( + root_dir, + &config.ignore_dirs, + &config.include, + &config.exclude, + ) } } else { - file_collector::collect_files(root_dir, &config.ignore_dirs) + file_collector::collect_files( + root_dir, + &config.ignore_dirs, + &config.include, + &config.exclude, + ) } } else { - file_collector::collect_files(root_dir, &config.ignore_dirs) + file_collector::collect_files( + root_dir, + &config.ignore_dirs, + &config.include, + &config.exclude, + ) } } diff --git a/crates/codegraph-core/src/config.rs b/crates/codegraph-core/src/config.rs index d37d54c1..4dbb706c 100644 --- a/crates/codegraph-core/src/config.rs +++ b/crates/codegraph-core/src/config.rs @@ -10,6 +10,17 @@ use serde::Deserialize; #[derive(Debug, Clone, Deserialize, Default)] #[serde(rename_all = "camelCase")] pub struct BuildConfig { + /// Glob patterns limiting which source files are included. + /// When non-empty, a file must match at least one pattern. + /// Matched against paths relative to the project root. + #[serde(default)] + pub include: Vec, + + /// Glob patterns excluding source files from the build. + /// Matched against paths relative to the project root. 
+ #[serde(default)] + pub exclude: Vec, + /// Additional directory names to ignore during file collection. #[serde(default)] pub ignore_dirs: Vec, @@ -129,12 +140,16 @@ mod tests { fn deserialize_empty_config() { let config: BuildConfig = serde_json::from_str("{}").unwrap(); assert!(config.ignore_dirs.is_empty()); + assert!(config.include.is_empty()); + assert!(config.exclude.is_empty()); assert!(config.build.incremental); } #[test] fn deserialize_full_config() { let json = r#"{ + "include": ["src/**/*.ts"], + "exclude": ["**/*.test.ts", "**/*.spec.ts"], "ignoreDirs": ["vendor", "tmp"], "build": { "incremental": false, @@ -145,6 +160,8 @@ mod tests { } }"#; let config: BuildConfig = serde_json::from_str(json).unwrap(); + assert_eq!(config.include, vec!["src/**/*.ts"]); + assert_eq!(config.exclude, vec!["**/*.test.ts", "**/*.spec.ts"]); assert_eq!(config.ignore_dirs, vec!["vendor", "tmp"]); assert!(!config.build.incremental); assert_eq!(config.build.drift_threshold, 0.2); diff --git a/crates/codegraph-core/src/file_collector.rs b/crates/codegraph-core/src/file_collector.rs index 1d71f22c..863f45e6 100644 --- a/crates/codegraph-core/src/file_collector.rs +++ b/crates/codegraph-core/src/file_collector.rs @@ -5,6 +5,7 @@ //! (from BurntSushi/ripgrep) for gitignore-aware traversal. use crate::parser_registry::LanguageKind; +use globset::{GlobBuilder, GlobSet, GlobSetBuilder}; use std::collections::HashSet; use std::path::Path; @@ -44,10 +45,67 @@ pub struct CollectResult { pub directories: HashSet, } +/// Compile glob patterns into a `GlobSet` in which `*` and `?` never cross `/`, +/// matching `globToRegex` in `src/shared/globs.ts` (only `**` spans directories). +/// Invalid patterns are logged via `eprintln!` and skipped so a single bad +/// entry in config can't take down the whole build.
+fn build_glob_set(patterns: &[String]) -> Option<GlobSet> { + if patterns.is_empty() { + return None; + } + let mut builder = GlobSetBuilder::new(); + let mut added = 0usize; + for p in patterns { + match GlobBuilder::new(p).literal_separator(true).build() { + Ok(g) => { + builder.add(g); + added += 1; + } + Err(e) => { + eprintln!("codegraph: ignoring invalid glob pattern {p:?}: {e}"); + } + } + } + if added == 0 { + return None; + } + builder.build().ok() +} + +/// `true` when the relative path passes the configured include/exclude filters. +/// +/// `rel_path` must be relative to the project root and normalized to forward +/// slashes. Mirrors `passesIncludeExclude` in `src/domain/graph/builder/helpers.ts` +/// so both engines accept or reject the same set of files. +pub fn passes_include_exclude( + rel_path: &str, + include: Option<&GlobSet>, + exclude: Option<&GlobSet>, +) -> bool { + if let Some(set) = include { + if !set.is_match(rel_path) { + return false; + } + } + if let Some(set) = exclude { + if set.is_match(rel_path) { + return false; + } + } + true +} + /// Collect all source files under `root_dir`, respecting gitignore and ignore dirs. /// /// `extra_ignore_dirs` are additional directory names to skip (from config `ignoreDirs`). -pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectResult { +/// `include_patterns` / `exclude_patterns` are file-level glob filters applied after +/// the extension check, matched against paths relative to `root_dir`. +pub fn collect_files( + root_dir: &str, + extra_ignore_dirs: &[String], + include_patterns: &[String], + exclude_patterns: &[String], +) -> CollectResult { // Build an owned set of ignore dirs to avoid leaking memory. // The closure captures this owned set, so lifetimes are satisfied without Box::leak.
let ignore_set: HashSet = DEFAULT_IGNORE_DIRS @@ -58,6 +116,10 @@ pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectRes let ext_set: HashSet<&str> = SUPPORTED_EXTENSIONS.iter().copied().collect(); + let include_set = build_glob_set(include_patterns); + let exclude_set = build_glob_set(exclude_patterns); + let root_path = Path::new(root_dir); + let mut files = Vec::new(); let mut directories = HashSet::new(); @@ -105,6 +167,19 @@ pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectRes } } + // Apply file-level include/exclude globs against the relative path. + if include_set.is_some() || exclude_set.is_some() { + let rel = path + .strip_prefix(root_path) + .ok() + .and_then(|p| p.to_str()) + .map(|s| s.replace('\\', "/")) + .unwrap_or_else(|| normalize_path(path)); + if !passes_include_exclude(&rel, include_set.as_ref(), exclude_set.as_ref()) { + continue; + } + } + let abs = normalize_path(path); if let Some(parent) = path.parent() { directories.insert(normalize_path(parent)); @@ -117,12 +192,18 @@ pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectRes /// Reconstruct file list from DB file_hashes + journal deltas (fast path). /// +/// Applies `include_patterns` / `exclude_patterns` so incremental builds honor +/// config changes — the paths in the DB were collected under an earlier config +/// that may have had different glob filters. +/// /// Returns `None` when the fast path isn't applicable. 
pub fn try_fast_collect( root_dir: &str, db_files: &[String], journal_changed: &[String], journal_removed: &[String], + include_patterns: &[String], + exclude_patterns: &[String], ) -> CollectResult { let mut file_set: HashSet = db_files.iter().cloned().collect(); @@ -134,12 +215,22 @@ pub fn try_fast_collect( file_set.insert(changed.clone()); } + let include_set = build_glob_set(include_patterns); + let exclude_set = build_glob_set(exclude_patterns); + let has_filters = include_set.is_some() || exclude_set.is_some(); + // Convert relative paths to absolute and compute directories let root = Path::new(root_dir); let mut files = Vec::with_capacity(file_set.len()); let mut directories = HashSet::new(); for rel_path in &file_set { + if has_filters { + let norm = rel_path.replace('\\', "/"); + if !passes_include_exclude(&norm, include_set.as_ref(), exclude_set.as_ref()) { + continue; + } + } let abs = root.join(rel_path); let abs_str = normalize_path(&abs); if let Some(parent) = abs.parent() { @@ -171,7 +262,7 @@ mod tests { fs::write(src.join("readme.md"), "# Hello").unwrap(); fs::write(src.join("util.js"), "module.exports = {};").unwrap(); - let result = collect_files(tmp.to_str().unwrap(), &[]); + let result = collect_files(tmp.to_str().unwrap(), &[], &[], &[]); let names: HashSet = result .files .iter() @@ -200,13 +291,61 @@ mod tests { fs::create_dir_all(&src).unwrap(); fs::write(src.join("app.ts"), "").unwrap(); - let result = collect_files(tmp.to_str().unwrap(), &[]); + let result = collect_files(tmp.to_str().unwrap(), &[], &[], &[]); assert_eq!(result.files.len(), 1); assert!(result.files[0].contains("app.ts")); let _ = fs::remove_dir_all(&tmp); } + #[test] + fn collect_honors_exclude_globs() { + let tmp = std::env::temp_dir().join("codegraph_collect_exclude_test"); + let _ = fs::remove_dir_all(&tmp); + let src = tmp.join("src"); + fs::create_dir_all(&src).unwrap(); + fs::write(src.join("app.ts"), "").unwrap(); + fs::write(src.join("app.test.ts"), "").unwrap(); 
+ fs::write(src.join("util.ts"), "").unwrap(); + + let exclude = vec!["**/*.test.ts".to_string()]; + let result = collect_files(tmp.to_str().unwrap(), &[], &[], &exclude); + let names: HashSet = result + .files + .iter() + .filter_map(|f| Path::new(f).file_name().map(|n| n.to_str().unwrap().to_string())) + .collect(); + assert!(names.contains("app.ts")); + assert!(names.contains("util.ts")); + assert!(!names.contains("app.test.ts"), "exclude glob should reject matching files"); + + let _ = fs::remove_dir_all(&tmp); + } + + #[test] + fn collect_honors_include_globs() { + let tmp = std::env::temp_dir().join("codegraph_collect_include_test"); + let _ = fs::remove_dir_all(&tmp); + let src = tmp.join("src"); + let tests = tmp.join("tests"); + fs::create_dir_all(&src).unwrap(); + fs::create_dir_all(&tests).unwrap(); + fs::write(src.join("app.ts"), "").unwrap(); + fs::write(tests.join("spec.ts"), "").unwrap(); + + let include = vec!["src/**".to_string()]; + let result = collect_files(tmp.to_str().unwrap(), &[], &include, &[]); + let names: HashSet = result + .files + .iter() + .filter_map(|f| Path::new(f).file_name().map(|n| n.to_str().unwrap().to_string())) + .collect(); + assert!(names.contains("app.ts")); + assert!(!names.contains("spec.ts"), "include glob should reject non-matching files"); + + let _ = fs::remove_dir_all(&tmp); + } + #[test] fn fast_collect_applies_deltas() { let root = "/project"; @@ -218,7 +357,7 @@ mod tests { let changed = vec!["src/d.ts".to_string()]; let removed = vec!["src/b.ts".to_string()]; - let result = try_fast_collect(root, &db_files, &changed, &removed); + let result = try_fast_collect(root, &db_files, &changed, &removed, &[], &[]); assert_eq!(result.files.len(), 3); // a, c, d let names: HashSet<&str> = result .files @@ -230,4 +369,28 @@ mod tests { assert!(names.contains("c.ts")); assert!(names.contains("d.ts")); } + + #[test] + fn fast_collect_honors_exclude_globs() { + let root = "/project"; + let db_files = vec![ + 
"src/a.ts".to_string(), + "src/a.test.ts".to_string(), + "src/b.ts".to_string(), + ]; + let exclude = vec!["**/*.test.ts".to_string()]; + + let result = try_fast_collect(root, &db_files, &[], &[], &[], &exclude); + let names: HashSet<&str> = result + .files + .iter() + .map(|f| f.rsplit('/').next().unwrap_or(f)) + .collect(); + assert!(names.contains("a.ts")); + assert!(names.contains("b.ts")); + assert!( + !names.contains("a.test.ts"), + "fast path must filter out excluded files so incremental builds honor config changes" + ); + } } diff --git a/src/domain/graph/builder/helpers.ts b/src/domain/graph/builder/helpers.ts index ee03e73b..6f57b705 100644 --- a/src/domain/graph/builder/helpers.ts +++ b/src/domain/graph/builder/helpers.ts @@ -8,7 +8,8 @@ import fs from 'node:fs'; import path from 'node:path'; import { purgeFilesData } from '../../../db/index.js'; import { warn } from '../../../infrastructure/logger.js'; -import { EXTENSIONS, IGNORE_DIRS } from '../../../shared/constants.js'; +import { EXTENSIONS, IGNORE_DIRS, normalizePath } from '../../../shared/constants.js'; +import { compileGlobs, matchesAny } from '../../../shared/globs.js'; import type { BetterSqlite3Database, CodegraphConfig, @@ -58,9 +59,29 @@ function shouldSkipEntry(entry: fs.Dirent, extraIgnore: Set | null): boo return false; } +/** + * Check whether a source file passes the configured include/exclude globs. + * + * Patterns are matched against the path relative to the project root, + * normalized to forward slashes (e.g. `src/foo/bar.ts`). When both lists + * are set, a file must match at least one include and no exclude. 
+ */ +export function passesIncludeExclude( + relPath: string, + includeRegexes: readonly RegExp[], + excludeRegexes: readonly RegExp[], +): boolean { + if (includeRegexes.length > 0 && !matchesAny(includeRegexes, relPath)) return false; + if (excludeRegexes.length > 0 && matchesAny(excludeRegexes, relPath)) return false; + return true; +} + /** * Recursively collect all source files under `dir`. * When `directories` is a Set, also tracks which directories contain files. + * + * The first invocation establishes `dir` as the project root against which + * `config.include` / `config.exclude` globs are matched. */ export function collectFiles( dir: string, @@ -68,6 +89,9 @@ export function collectFiles( config: Partial, directories: Set, _visited?: Set, + _rootDir?: string, + _includeRegexes?: RegExp[], + _excludeRegexes?: RegExp[], ): { files: string[]; directories: Set }; export function collectFiles( dir: string, @@ -75,6 +99,9 @@ export function collectFiles( config?: Partial, directories?: null, _visited?: Set, + _rootDir?: string, + _includeRegexes?: RegExp[], + _excludeRegexes?: RegExp[], ): string[]; export function collectFiles( dir: string, @@ -82,10 +109,20 @@ export function collectFiles( config: Partial = {}, directories: Set | null = null, _visited: Set = new Set(), + _rootDir?: string, + _includeRegexes?: RegExp[], + _excludeRegexes?: RegExp[], ): string[] | { files: string[]; directories: Set } { const trackDirs = directories instanceof Set; let hasFiles = false; + // First call: compute root and compile include/exclude patterns once, + // then pass them down recursive calls so we don't recompile per directory. + const rootDir = _rootDir ?? dir; + const includeRegexes = _includeRegexes ?? compileGlobs(config.include); + const excludeRegexes = _excludeRegexes ?? compileGlobs(config.exclude); + const hasGlobFilters = includeRegexes.length > 0 || excludeRegexes.length > 0; + // Merge config ignoreDirs with defaults const extraIgnore = config.ignoreDirs ? 
new Set(config.ignoreDirs) : null; @@ -116,11 +153,24 @@ export function collectFiles( const full = path.join(dir, entry.name); if (entry.isDirectory()) { if (trackDirs) { - collectFiles(full, files, config, directories as Set, _visited); + collectFiles( + full, + files, + config, + directories as Set, + _visited, + rootDir, + includeRegexes, + excludeRegexes, + ); } else { - collectFiles(full, files, config, null, _visited); + collectFiles(full, files, config, null, _visited, rootDir, includeRegexes, excludeRegexes); } } else if (EXTENSIONS.has(path.extname(entry.name))) { + if (hasGlobFilters) { + const rel = normalizePath(path.relative(rootDir, full)); + if (!passesIncludeExclude(rel, includeRegexes, excludeRegexes)) continue; + } files.push(full); hasFiles = true; } diff --git a/src/domain/graph/builder/stages/collect-files.ts b/src/domain/graph/builder/stages/collect-files.ts index 73c19441..13f0b6c9 100644 --- a/src/domain/graph/builder/stages/collect-files.ts +++ b/src/domain/graph/builder/stages/collect-files.ts @@ -9,9 +9,10 @@ import fs from 'node:fs'; import path from 'node:path'; import { debug, info } from '../../../../infrastructure/logger.js'; import { normalizePath } from '../../../../shared/constants.js'; +import { compileGlobs } from '../../../../shared/globs.js'; import { readJournal } from '../../journal.js'; import type { PipelineContext } from '../context.js'; -import { collectFiles as collectFilesUtil } from '../helpers.js'; +import { collectFiles as collectFilesUtil, passesIncludeExclude } from '../helpers.js'; /** * Reconstruct allFiles from DB file_hashes + journal deltas. @@ -20,7 +21,7 @@ import { collectFiles as collectFilesUtil } from '../helpers.js'; function tryFastCollect( ctx: PipelineContext, ): { files: string[]; directories: Set } | null { - const { db, rootDir } = ctx; + const { db, rootDir, config } = ctx; const useNative = ctx.engineName === 'native' && !!ctx.nativeDb?.getCollectFilesData; // 1. 
Check that file_hashes table exists and has entries @@ -70,10 +71,20 @@ function tryFastCollect( } } - // 5. Convert to absolute paths and compute directories + // 5. Convert to absolute paths and compute directories, honoring + // config.include / config.exclude globs so incremental builds reflect + // config changes (paths from the DB were collected under older config). + const includeRegexes = compileGlobs(config?.include); + const excludeRegexes = compileGlobs(config?.exclude); + const hasGlobFilters = includeRegexes.length > 0 || excludeRegexes.length > 0; + const files: string[] = []; const directories = new Set(); for (const relPath of fileSet) { + if (hasGlobFilters) { + const normRel = normalizePath(relPath); + if (!passesIncludeExclude(normRel, includeRegexes, excludeRegexes)) continue; + } const absPath = path.join(rootDir, relPath); files.push(absPath); directories.add(path.dirname(absPath)); diff --git a/src/features/boundaries.ts b/src/features/boundaries.ts index c792a284..05f7738f 100644 --- a/src/features/boundaries.ts +++ b/src/features/boundaries.ts @@ -1,34 +1,9 @@ import { isTestFile } from '../infrastructure/test-filter.js'; import { BoundaryError } from '../shared/errors.js'; +import { globToRegex } from '../shared/globs.js'; import type { BetterSqlite3Database } from '../types.js'; -// ─── Glob-to-Regex ─────────────────────────────────────────────────── - -export function globToRegex(pattern: string): RegExp { - let re = ''; - let i = 0; - while (i < pattern.length) { - const ch = pattern[i] as string; - if (ch === '*' && pattern[i + 1] === '*') { - re += '.*'; - i += 2; - if (pattern[i] === '/') i++; - } else if (ch === '*') { - re += '[^/]*'; - i++; - } else if (ch === '?') { - re += '[^/]'; - i++; - } else if (/[.+^${}()|[\]\\]/.test(ch)) { - re += `\\${ch}`; - i++; - } else { - re += ch; - i++; - } - } - return new RegExp(`^${re}$`); -} +export { globToRegex }; // ─── Presets ───────────────────────────────────────────────────────── 
diff --git a/src/shared/globs.ts b/src/shared/globs.ts new file mode 100644 index 00000000..5a0d459c --- /dev/null +++ b/src/shared/globs.ts @@ -0,0 +1,73 @@ +/** + * Glob → RegExp conversion utilities. + * + * Shared by boundary rules (`features/boundaries.ts`) and the file-collection + * include/exclude filters (`domain/graph/builder/helpers.ts`). Keeping a single + * implementation ensures users get consistent glob semantics everywhere. + * + * Supported syntax: + * - `**` matches any sequence of characters including `/` + * - `*` matches any sequence of characters except `/` + * - `?` matches a single non-slash character + * - other regex metacharacters are escaped literally + * + * Paths must use forward slashes (callers normalize before testing). + */ + +/** + * Compile a glob pattern into a `RegExp` anchored with `^…$`. + */ +export function globToRegex(pattern: string): RegExp { + let re = ''; + let i = 0; + while (i < pattern.length) { + const ch = pattern[i] as string; + if (ch === '*' && pattern[i + 1] === '*') { + re += '.*'; + i += 2; + if (pattern[i] === '/') i++; + } else if (ch === '*') { + re += '[^/]*'; + i++; + } else if (ch === '?') { + re += '[^/]'; + i++; + } else if (/[.+^${}()|[\]\\]/.test(ch)) { + re += `\\${ch}`; + i++; + } else { + re += ch; + i++; + } + } + return new RegExp(`^${re}$`); +} + +/** + * Compile a list of glob patterns. Invalid / empty patterns are skipped. + */ +export function compileGlobs(patterns: readonly string[] | undefined): RegExp[] { + if (!patterns || patterns.length === 0) return []; + const out: RegExp[] = []; + for (const p of patterns) { + if (typeof p !== 'string' || p.length === 0) continue; + try { + out.push(globToRegex(p)); + } catch { + // Ignore malformed patterns rather than failing the whole build. + } + } + return out; +} + +/** + * `true` when at least one compiled pattern matches the given path. + * + * The path must already be normalized to forward slashes. 
+ */ +export function matchesAny(regexes: readonly RegExp[], path: string): boolean { + for (const re of regexes) { + if (re.test(path)) return true; + } + return false; +} diff --git a/tests/integration/config-include-exclude.test.ts b/tests/integration/config-include-exclude.test.ts new file mode 100644 index 00000000..dd3655ee --- /dev/null +++ b/tests/integration/config-include-exclude.test.ts @@ -0,0 +1,181 @@ +/** + * Integration tests for `config.include` / `config.exclude` (issue #981). + * + * Verifies that top-level `include` / `exclude` globs in `.codegraphrc.json` + * actually filter the files included in the build — and that both the native + * Rust engine and the WASM/JS engine honor the filters identically. + */ + +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import Database from 'better-sqlite3'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import { buildGraph } from '../../src/domain/graph/builder.js'; +import { clearConfigCache } from '../../src/infrastructure/config.js'; +import { isNativeAvailable } from '../../src/infrastructure/native.js'; + +const FIXTURE_FILES: Record = { + 'src/math.js': ` +export function add(a, b) { return a + b; } +export function multiply(a, b) { return a * b; } +`.trimStart(), + 'src/util.js': ` +import { add } from './math.js'; +export function doubleSum(a, b) { return add(a, b) + add(a, b); } +`.trimStart(), + 'src/math.test.js': ` +import { add } from './math.js'; +if (add(1, 2) !== 3) throw new Error('math broken'); +`.trimStart(), + 'src/util.spec.js': ` +import { doubleSum } from './util.js'; +if (doubleSum(1, 2) !== 6) throw new Error('util broken'); +`.trimStart(), + 'scratch/notes.js': ` +export const scratch = 42; +`.trimStart(), +}; + +function writeFixture(root: string): void { + for (const [relPath, content] of Object.entries(FIXTURE_FILES)) { + const abs = path.join(root, relPath); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + 
fs.writeFileSync(abs, content); + } +} + +function readFileRows(dbPath: string): string[] { + const db = new Database(dbPath, { readonly: true }); + try { + // `file_hashes` is the authoritative list of files that actually passed + // collection. `nodes.file` also contains synthetic directory/module rows. + const rows = db.prepare('SELECT file FROM file_hashes ORDER BY file').all() as Array<{ + file: string; + }>; + return rows.map((r) => r.file).sort(); + } finally { + db.close(); + } +} + +type EngineName = 'native' | 'wasm'; + +async function buildWithEngine( + root: string, + engine: EngineName, + config: { include?: string[]; exclude?: string[] }, +): Promise { + fs.writeFileSync(path.join(root, '.codegraphrc.json'), JSON.stringify(config)); + // `loadConfig` caches per cwd — blow the cache so each test's config is + // actually re-read from disk. + clearConfigCache(); + // Wipe DB between runs so file list is authoritative. + const dbDir = path.join(root, '.codegraph'); + if (fs.existsSync(dbDir)) fs.rmSync(dbDir, { recursive: true, force: true }); + await buildGraph(root, { engine, skipRegistry: true }); + const files = readFileRows(path.join(dbDir, 'graph.db')); + // `file_hashes` stores relative paths; normalize slashes so assertions are + // cross-platform. 
+ return files.map((f) => f.replace(/\\/g, '/')).sort(); +} + +describe('config.include / config.exclude (issue #981)', () => { + let tmpDir: string; + let wasmRoot: string; + let nativeRoot: string; + + beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cfg-inc-exc-')); + wasmRoot = fs.mkdtempSync(path.join(tmpDir, 'wasm-')); + nativeRoot = fs.mkdtempSync(path.join(tmpDir, 'native-')); + writeFixture(wasmRoot); + writeFixture(nativeRoot); + }); + + afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + // ── wasm engine ─────────────────────────────────────────────────── + + it('wasm: exclude glob rejects matching files', async () => { + const files = await buildWithEngine(wasmRoot, 'wasm', { + exclude: ['**/*.test.js', '**/*.spec.js'], + }); + expect(files).toContain('src/math.js'); + expect(files).toContain('src/util.js'); + expect(files).not.toContain('src/math.test.js'); + expect(files).not.toContain('src/util.spec.js'); + }); + + it('wasm: include glob limits collection to matching files', async () => { + const files = await buildWithEngine(wasmRoot, 'wasm', { + include: ['src/**'], + }); + // scratch/ is outside src/, so nothing from it should be included + expect(files.some((f) => f.startsWith('scratch/'))).toBe(false); + expect(files).toContain('src/math.js'); + }); + + it('wasm: include + exclude combine (include first, exclude trims)', async () => { + const files = await buildWithEngine(wasmRoot, 'wasm', { + include: ['src/**'], + exclude: ['**/*.test.js', '**/*.spec.js'], + }); + expect(files).toContain('src/math.js'); + expect(files).toContain('src/util.js'); + expect(files).not.toContain('src/math.test.js'); + expect(files).not.toContain('src/util.spec.js'); + expect(files.some((f) => f.startsWith('scratch/'))).toBe(false); + }); + + it('wasm: empty include/exclude preserves prior behavior (collects everything supported)', async () => { + const files = await buildWithEngine(wasmRoot, 
'wasm', {}); + expect(files).toContain('src/math.js'); + expect(files).toContain('src/math.test.js'); + expect(files).toContain('src/util.spec.js'); + expect(files).toContain('scratch/notes.js'); + }); + + // ── native engine (skipped when not installed) ─────────────────── + + const nativeAvailable = isNativeAvailable(); + const itNative = nativeAvailable ? it : it.skip; + + itNative('native: exclude glob rejects matching files', async () => { + const files = await buildWithEngine(nativeRoot, 'native', { + exclude: ['**/*.test.js', '**/*.spec.js'], + }); + expect(files).toContain('src/math.js'); + expect(files).toContain('src/util.js'); + expect(files).not.toContain('src/math.test.js'); + expect(files).not.toContain('src/util.spec.js'); + }); + + itNative('native: include glob limits collection to matching files', async () => { + const files = await buildWithEngine(nativeRoot, 'native', { + include: ['src/**'], + }); + expect(files.some((f) => f.startsWith('scratch/'))).toBe(false); + expect(files).toContain('src/math.js'); + }); + + // ── engine parity ──────────────────────────────────────────────── + + itNative('native + wasm produce identical file sets under include/exclude', async () => { + const parityWasm = fs.mkdtempSync(path.join(tmpDir, 'parity-wasm-')); + const parityNative = fs.mkdtempSync(path.join(tmpDir, 'parity-native-')); + writeFixture(parityWasm); + writeFixture(parityNative); + + const cfg = { + include: ['src/**'], + exclude: ['**/*.test.js', '**/*.spec.js'], + }; + const wasmFiles = await buildWithEngine(parityWasm, 'wasm', cfg); + const nativeFiles = await buildWithEngine(parityNative, 'native', cfg); + // Paths are already relative to each run's own tmpDir so they compare directly. 
+ expect(nativeFiles).toEqual(wasmFiles); + }); +}); From 4c97b2acedaa112ce98d377d17c04fad76c6bdc3 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Tue, 21 Apr 2026 00:44:02 -0600 Subject: [PATCH 2/4] chore: sync Cargo.lock for globset dependency (#981) --- Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.lock b/Cargo.lock index b580ebff..fb7395f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,6 +68,7 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" name = "codegraph-core" version = "3.9.4" dependencies = [ + "globset", "ignore", "napi", "napi-build", From b6ce50a4e2a6a01b735c8c1f10aea486256dabe3 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Tue, 21 Apr 2026 01:14:36 -0600 Subject: [PATCH 3/4] fix(globs): enforce path-component boundary for `**/` patterns (#994) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `globToRegex` WASM-side glob compiler consumed the `/` after `**` without adding a directory-boundary group, so `**/index.ts` compiled to `^.*index\.ts$` and matched `barindex.ts`. The Rust `globset` crate enforces the boundary, so the two engines disagreed on these patterns. Compile `**/` as `(?:[^/]+/)*` — zero or more complete directory segments — keeping parity with globset. Bare `**` (e.g. trailing in `dir/**`) still compiles to `.*` so `dir/**` keeps matching `dir/a/b`. Adds regression tests for `**/` and `dir/**`. 
Impact: 1 functions changed, 8 affected --- src/shared/globs.ts | 13 +++++++++++-- tests/unit/boundaries.test.ts | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/shared/globs.ts b/src/shared/globs.ts index 5a0d459c..9cd6c786 100644 --- a/src/shared/globs.ts +++ b/src/shared/globs.ts @@ -23,9 +23,18 @@ export function globToRegex(pattern: string): RegExp { while (i < pattern.length) { const ch = pattern[i] as string; if (ch === '*' && pattern[i + 1] === '*') { - re += '.*'; i += 2; - if (pattern[i] === '/') i++; + if (pattern[i] === '/') { + // `**/` matches zero or more full path segments, preserving the + // directory boundary before the next segment. Without this, patterns + // like `**/foo.ts` would compile to `^.*foo\.ts$` and match + // `barfoo.ts`, diverging from Rust `globset` semantics. + re += '(?:[^/]+/)*'; + i++; + } else { + // Bare `**` (e.g. `dir/**`, or trailing) matches anything. + re += '.*'; + } } else if (ch === '*') { re += '[^/]*'; i++; diff --git a/tests/unit/boundaries.test.ts b/tests/unit/boundaries.test.ts index 8684d8bc..2b12bd7b 100644 --- a/tests/unit/boundaries.test.ts +++ b/tests/unit/boundaries.test.ts @@ -47,6 +47,25 @@ describe('globToRegex', () => { expect(re.test('foo.test.js')).toBe(true); }); + test('**/ enforces path-component boundary', () => { + // Matches parity with Rust `globset`: `**/index.ts` must NOT match + // `barindex.ts` — the segment before `index.ts` has to be a full + // directory component (or absent). See PR #994 / Greptile review. + const re = globToRegex('**/index.ts'); + expect(re.test('index.ts')).toBe(true); + expect(re.test('src/index.ts')).toBe(true); + expect(re.test('a/b/index.ts')).toBe(true); + expect(re.test('barindex.ts')).toBe(false); + expect(re.test('src/barindex.ts')).toBe(false); + }); + + test('trailing ** (e.g. 
dir/**) matches anything under dir', () => { + const re = globToRegex('dir/**'); + expect(re.test('dir/a')).toBe(true); + expect(re.test('dir/a/b')).toBe(true); + expect(re.test('other/a')).toBe(false); + }); + test('exact path match', () => { const re = globToRegex('src/controllers/main.js'); expect(re.test('src/controllers/main.js')).toBe(true); From 63c9789c5fcc1fac681ccd9f55643898919ea2d6 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Tue, 21 Apr 2026 01:14:53 -0600 Subject: [PATCH 4/4] fix(config): surface GlobSetBuilder::build errors instead of silently disabling filters (#994) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If `GlobSetBuilder::build()` returned `Err`, `build_glob_set` silently returned `None` and all include/exclude filters were disabled — users would see unexpected files in the graph with no clue why. Mirror the per-pattern error path and log the failure via `eprintln!` before falling back to `None`. Impact: 1 functions changed, 8 affected --- crates/codegraph-core/src/file_collector.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/crates/codegraph-core/src/file_collector.rs b/crates/codegraph-core/src/file_collector.rs index 863f45e6..1ba57c83 100644 --- a/crates/codegraph-core/src/file_collector.rs +++ b/crates/codegraph-core/src/file_collector.rs @@ -69,7 +69,17 @@ fn build_glob_set(patterns: &[String]) -> Option { if added == 0 { return None; } - builder.build().ok() + match builder.build() { + Ok(set) => Some(set), + Err(e) => { + // Failing to build the GlobSet disables *all* include/exclude + // filters, which silently changes what files the build sees. + // Surface the error so users can correct their config instead of + // being confused by ignored filters. + eprintln!("codegraph: failed to build glob set: {e}"); + None + } + } } /// `true` when the relative path passes the configured include/exclude filters.