diff --git a/Cargo.lock b/Cargo.lock index b580ebff..fb7395f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,6 +68,7 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" name = "codegraph-core" version = "3.9.4" dependencies = [ + "globset", "ignore", "napi", "napi-build", diff --git a/crates/codegraph-core/Cargo.toml b/crates/codegraph-core/Cargo.toml index c7e41cfe..6205f150 100644 --- a/crates/codegraph-core/Cargo.toml +++ b/crates/codegraph-core/Cargo.toml @@ -37,6 +37,7 @@ tree-sitter-haskell = "0.23" tree-sitter-ocaml = "0.24" rayon = "1" ignore = "0.4" +globset = "0.4" sha2 = "0.10" # `bundled` embeds a second SQLite copy (better-sqlite3 already bundles one). # This is intentional: Windows CI lacks a system SQLite, and WAL coordination diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index 20ecce41..7690d903 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -549,15 +549,32 @@ fn collect_source_files( &db_files, &journal.changed, &journal.removed, + &config.include, + &config.exclude, ) } else { - file_collector::collect_files(root_dir, &config.ignore_dirs) + file_collector::collect_files( + root_dir, + &config.ignore_dirs, + &config.include, + &config.exclude, + ) } } else { - file_collector::collect_files(root_dir, &config.ignore_dirs) + file_collector::collect_files( + root_dir, + &config.ignore_dirs, + &config.include, + &config.exclude, + ) } } else { - file_collector::collect_files(root_dir, &config.ignore_dirs) + file_collector::collect_files( + root_dir, + &config.ignore_dirs, + &config.include, + &config.exclude, + ) } } diff --git a/crates/codegraph-core/src/config.rs b/crates/codegraph-core/src/config.rs index d37d54c1..4dbb706c 100644 --- a/crates/codegraph-core/src/config.rs +++ b/crates/codegraph-core/src/config.rs @@ -10,6 +10,17 @@ use serde::Deserialize; #[derive(Debug, Clone, Deserialize, Default)] 
#[serde(rename_all = "camelCase")] pub struct BuildConfig { + /// Glob patterns limiting which source files are included. + /// When non-empty, a file must match at least one pattern. + /// Matched against paths relative to the project root. + #[serde(default)] + pub include: Vec, + + /// Glob patterns excluding source files from the build. + /// Matched against paths relative to the project root. + #[serde(default)] + pub exclude: Vec, + /// Additional directory names to ignore during file collection. #[serde(default)] pub ignore_dirs: Vec, @@ -129,12 +140,16 @@ mod tests { fn deserialize_empty_config() { let config: BuildConfig = serde_json::from_str("{}").unwrap(); assert!(config.ignore_dirs.is_empty()); + assert!(config.include.is_empty()); + assert!(config.exclude.is_empty()); assert!(config.build.incremental); } #[test] fn deserialize_full_config() { let json = r#"{ + "include": ["src/**/*.ts"], + "exclude": ["**/*.test.ts", "**/*.spec.ts"], "ignoreDirs": ["vendor", "tmp"], "build": { "incremental": false, @@ -145,6 +160,8 @@ mod tests { } }"#; let config: BuildConfig = serde_json::from_str(json).unwrap(); + assert_eq!(config.include, vec!["src/**/*.ts"]); + assert_eq!(config.exclude, vec!["**/*.test.ts", "**/*.spec.ts"]); assert_eq!(config.ignore_dirs, vec!["vendor", "tmp"]); assert!(!config.build.incremental); assert_eq!(config.build.drift_threshold, 0.2); diff --git a/crates/codegraph-core/src/file_collector.rs b/crates/codegraph-core/src/file_collector.rs index 1d71f22c..1ba57c83 100644 --- a/crates/codegraph-core/src/file_collector.rs +++ b/crates/codegraph-core/src/file_collector.rs @@ -5,6 +5,7 @@ //! (from BurntSushi/ripgrep) for gitignore-aware traversal. use crate::parser_registry::LanguageKind; +use globset::{Glob, GlobSet, GlobSetBuilder}; use std::collections::HashSet; use std::path::Path; @@ -44,10 +45,77 @@ pub struct CollectResult { pub directories: HashSet, } +/// Compile a list of glob patterns into a `GlobSet`. 
+///
+/// Invalid patterns are logged via `eprintln!` and skipped so a single bad
+/// entry in config can't take down the whole build.
+///
+/// Every pattern is compiled with `literal_separator(true)`, so `*` and `?`
+/// never match `/`. `globset`'s default lets them cross directory boundaries,
+/// which would make e.g. `src/*.ts` match `src/a/b.ts` here while the JS
+/// engine's `globToRegex` (`*` → `[^/]*`, `?` → `[^/]`) rejects it.
+///
+/// Returns `None` when `patterns` is empty, when no pattern compiles, or when
+/// the set itself fails to build; `passes_include_exclude` treats `None` as
+/// "no filter configured".
+///
+/// NOTE(review): `globset` also interprets `[...]` classes and `{a,b}`
+/// alternation, which the JS converter escapes literally — confirm whether
+/// parity is required for those forms as well.
+fn build_glob_set(patterns: &[String]) -> Option<GlobSet> {
+    if patterns.is_empty() {
+        return None;
+    }
+    let mut builder = GlobSetBuilder::new();
+    let mut added = 0usize;
+    for p in patterns {
+        // `GlobBuilder` is spelled with its full path so no extra `use` is
+        // needed; the annotation keeps the existing `Glob` import in use.
+        let compiled: Result<Glob, globset::Error> =
+            globset::GlobBuilder::new(p).literal_separator(true).build();
+        match compiled {
+            Ok(g) => {
+                builder.add(g);
+                added += 1;
+            }
+            Err(e) => {
+                eprintln!("codegraph: ignoring invalid glob pattern {p:?}: {e}");
+            }
+        }
+    }
+    if added == 0 {
+        return None;
+    }
+    match builder.build() {
+        Ok(set) => Some(set),
+        Err(e) => {
+            // Failing to build the GlobSet disables *all* include/exclude
+            // filters, which silently changes what files the build sees.
+            // Surface the error so users can correct their config instead of
+            // being confused by ignored filters.
+            eprintln!("codegraph: failed to build glob set: {e}");
+            None
+        }
+    }
+}
+
+/// `true` when the relative path passes the configured include/exclude filters.
+///
+/// `rel_path` must be relative to the project root and normalized to forward
+/// slashes. Mirrors `passesIncludeExclude` in `src/domain/graph/builder/helpers.ts`
+/// so both engines accept or reject the same set of files.
+pub fn passes_include_exclude(
+    rel_path: &str,
+    include: Option<&GlobSet>,
+    exclude: Option<&GlobSet>,
+) -> bool {
+    if let Some(set) = include {
+        if !set.is_match(rel_path) {
+            return false;
+        }
+    }
+    if let Some(set) = exclude {
+        if set.is_match(rel_path) {
+            return false;
+        }
+    }
+    true
+}
+
 /// Collect all source files under `root_dir`, respecting gitignore and ignore dirs.
 ///
 /// `extra_ignore_dirs` are additional directory names to skip (from config `ignoreDirs`).
-pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectResult {
+/// `include_patterns` / `exclude_patterns` are file-level glob filters applied after
+/// the extension check, matched against paths relative to `root_dir`.
+pub fn collect_files( + root_dir: &str, + extra_ignore_dirs: &[String], + include_patterns: &[String], + exclude_patterns: &[String], +) -> CollectResult { // Build an owned set of ignore dirs to avoid leaking memory. // The closure captures this owned set, so lifetimes are satisfied without Box::leak. let ignore_set: HashSet = DEFAULT_IGNORE_DIRS @@ -58,6 +126,10 @@ pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectRes let ext_set: HashSet<&str> = SUPPORTED_EXTENSIONS.iter().copied().collect(); + let include_set = build_glob_set(include_patterns); + let exclude_set = build_glob_set(exclude_patterns); + let root_path = Path::new(root_dir); + let mut files = Vec::new(); let mut directories = HashSet::new(); @@ -105,6 +177,19 @@ pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectRes } } + // Apply file-level include/exclude globs against the relative path. + if include_set.is_some() || exclude_set.is_some() { + let rel = path + .strip_prefix(root_path) + .ok() + .and_then(|p| p.to_str()) + .map(|s| s.replace('\\', "/")) + .unwrap_or_else(|| normalize_path(path)); + if !passes_include_exclude(&rel, include_set.as_ref(), exclude_set.as_ref()) { + continue; + } + } + let abs = normalize_path(path); if let Some(parent) = path.parent() { directories.insert(normalize_path(parent)); @@ -117,12 +202,18 @@ pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectRes /// Reconstruct file list from DB file_hashes + journal deltas (fast path). /// +/// Applies `include_patterns` / `exclude_patterns` so incremental builds honor +/// config changes — the paths in the DB were collected under an earlier config +/// that may have had different glob filters. +/// /// Returns `None` when the fast path isn't applicable. 
pub fn try_fast_collect( root_dir: &str, db_files: &[String], journal_changed: &[String], journal_removed: &[String], + include_patterns: &[String], + exclude_patterns: &[String], ) -> CollectResult { let mut file_set: HashSet = db_files.iter().cloned().collect(); @@ -134,12 +225,22 @@ pub fn try_fast_collect( file_set.insert(changed.clone()); } + let include_set = build_glob_set(include_patterns); + let exclude_set = build_glob_set(exclude_patterns); + let has_filters = include_set.is_some() || exclude_set.is_some(); + // Convert relative paths to absolute and compute directories let root = Path::new(root_dir); let mut files = Vec::with_capacity(file_set.len()); let mut directories = HashSet::new(); for rel_path in &file_set { + if has_filters { + let norm = rel_path.replace('\\', "/"); + if !passes_include_exclude(&norm, include_set.as_ref(), exclude_set.as_ref()) { + continue; + } + } let abs = root.join(rel_path); let abs_str = normalize_path(&abs); if let Some(parent) = abs.parent() { @@ -171,7 +272,7 @@ mod tests { fs::write(src.join("readme.md"), "# Hello").unwrap(); fs::write(src.join("util.js"), "module.exports = {};").unwrap(); - let result = collect_files(tmp.to_str().unwrap(), &[]); + let result = collect_files(tmp.to_str().unwrap(), &[], &[], &[]); let names: HashSet = result .files .iter() @@ -200,13 +301,61 @@ mod tests { fs::create_dir_all(&src).unwrap(); fs::write(src.join("app.ts"), "").unwrap(); - let result = collect_files(tmp.to_str().unwrap(), &[]); + let result = collect_files(tmp.to_str().unwrap(), &[], &[], &[]); assert_eq!(result.files.len(), 1); assert!(result.files[0].contains("app.ts")); let _ = fs::remove_dir_all(&tmp); } + #[test] + fn collect_honors_exclude_globs() { + let tmp = std::env::temp_dir().join("codegraph_collect_exclude_test"); + let _ = fs::remove_dir_all(&tmp); + let src = tmp.join("src"); + fs::create_dir_all(&src).unwrap(); + fs::write(src.join("app.ts"), "").unwrap(); + fs::write(src.join("app.test.ts"), "").unwrap(); 
+ fs::write(src.join("util.ts"), "").unwrap(); + + let exclude = vec!["**/*.test.ts".to_string()]; + let result = collect_files(tmp.to_str().unwrap(), &[], &[], &exclude); + let names: HashSet = result + .files + .iter() + .filter_map(|f| Path::new(f).file_name().map(|n| n.to_str().unwrap().to_string())) + .collect(); + assert!(names.contains("app.ts")); + assert!(names.contains("util.ts")); + assert!(!names.contains("app.test.ts"), "exclude glob should reject matching files"); + + let _ = fs::remove_dir_all(&tmp); + } + + #[test] + fn collect_honors_include_globs() { + let tmp = std::env::temp_dir().join("codegraph_collect_include_test"); + let _ = fs::remove_dir_all(&tmp); + let src = tmp.join("src"); + let tests = tmp.join("tests"); + fs::create_dir_all(&src).unwrap(); + fs::create_dir_all(&tests).unwrap(); + fs::write(src.join("app.ts"), "").unwrap(); + fs::write(tests.join("spec.ts"), "").unwrap(); + + let include = vec!["src/**".to_string()]; + let result = collect_files(tmp.to_str().unwrap(), &[], &include, &[]); + let names: HashSet = result + .files + .iter() + .filter_map(|f| Path::new(f).file_name().map(|n| n.to_str().unwrap().to_string())) + .collect(); + assert!(names.contains("app.ts")); + assert!(!names.contains("spec.ts"), "include glob should reject non-matching files"); + + let _ = fs::remove_dir_all(&tmp); + } + #[test] fn fast_collect_applies_deltas() { let root = "/project"; @@ -218,7 +367,7 @@ mod tests { let changed = vec!["src/d.ts".to_string()]; let removed = vec!["src/b.ts".to_string()]; - let result = try_fast_collect(root, &db_files, &changed, &removed); + let result = try_fast_collect(root, &db_files, &changed, &removed, &[], &[]); assert_eq!(result.files.len(), 3); // a, c, d let names: HashSet<&str> = result .files @@ -230,4 +379,28 @@ mod tests { assert!(names.contains("c.ts")); assert!(names.contains("d.ts")); } + + #[test] + fn fast_collect_honors_exclude_globs() { + let root = "/project"; + let db_files = vec![ + 
"src/a.ts".to_string(), + "src/a.test.ts".to_string(), + "src/b.ts".to_string(), + ]; + let exclude = vec!["**/*.test.ts".to_string()]; + + let result = try_fast_collect(root, &db_files, &[], &[], &[], &exclude); + let names: HashSet<&str> = result + .files + .iter() + .map(|f| f.rsplit('/').next().unwrap_or(f)) + .collect(); + assert!(names.contains("a.ts")); + assert!(names.contains("b.ts")); + assert!( + !names.contains("a.test.ts"), + "fast path must filter out excluded files so incremental builds honor config changes" + ); + } } diff --git a/src/domain/graph/builder/helpers.ts b/src/domain/graph/builder/helpers.ts index ee03e73b..6f57b705 100644 --- a/src/domain/graph/builder/helpers.ts +++ b/src/domain/graph/builder/helpers.ts @@ -8,7 +8,8 @@ import fs from 'node:fs'; import path from 'node:path'; import { purgeFilesData } from '../../../db/index.js'; import { warn } from '../../../infrastructure/logger.js'; -import { EXTENSIONS, IGNORE_DIRS } from '../../../shared/constants.js'; +import { EXTENSIONS, IGNORE_DIRS, normalizePath } from '../../../shared/constants.js'; +import { compileGlobs, matchesAny } from '../../../shared/globs.js'; import type { BetterSqlite3Database, CodegraphConfig, @@ -58,9 +59,29 @@ function shouldSkipEntry(entry: fs.Dirent, extraIgnore: Set | null): boo return false; } +/** + * Check whether a source file passes the configured include/exclude globs. + * + * Patterns are matched against the path relative to the project root, + * normalized to forward slashes (e.g. `src/foo/bar.ts`). When both lists + * are set, a file must match at least one include and no exclude. 
+ */ +export function passesIncludeExclude( + relPath: string, + includeRegexes: readonly RegExp[], + excludeRegexes: readonly RegExp[], +): boolean { + if (includeRegexes.length > 0 && !matchesAny(includeRegexes, relPath)) return false; + if (excludeRegexes.length > 0 && matchesAny(excludeRegexes, relPath)) return false; + return true; +} + /** * Recursively collect all source files under `dir`. * When `directories` is a Set, also tracks which directories contain files. + * + * The first invocation establishes `dir` as the project root against which + * `config.include` / `config.exclude` globs are matched. */ export function collectFiles( dir: string, @@ -68,6 +89,9 @@ export function collectFiles( config: Partial, directories: Set, _visited?: Set, + _rootDir?: string, + _includeRegexes?: RegExp[], + _excludeRegexes?: RegExp[], ): { files: string[]; directories: Set }; export function collectFiles( dir: string, @@ -75,6 +99,9 @@ export function collectFiles( config?: Partial, directories?: null, _visited?: Set, + _rootDir?: string, + _includeRegexes?: RegExp[], + _excludeRegexes?: RegExp[], ): string[]; export function collectFiles( dir: string, @@ -82,10 +109,20 @@ export function collectFiles( config: Partial = {}, directories: Set | null = null, _visited: Set = new Set(), + _rootDir?: string, + _includeRegexes?: RegExp[], + _excludeRegexes?: RegExp[], ): string[] | { files: string[]; directories: Set } { const trackDirs = directories instanceof Set; let hasFiles = false; + // First call: compute root and compile include/exclude patterns once, + // then pass them down recursive calls so we don't recompile per directory. + const rootDir = _rootDir ?? dir; + const includeRegexes = _includeRegexes ?? compileGlobs(config.include); + const excludeRegexes = _excludeRegexes ?? compileGlobs(config.exclude); + const hasGlobFilters = includeRegexes.length > 0 || excludeRegexes.length > 0; + // Merge config ignoreDirs with defaults const extraIgnore = config.ignoreDirs ? 
new Set(config.ignoreDirs) : null; @@ -116,11 +153,24 @@ export function collectFiles( const full = path.join(dir, entry.name); if (entry.isDirectory()) { if (trackDirs) { - collectFiles(full, files, config, directories as Set, _visited); + collectFiles( + full, + files, + config, + directories as Set, + _visited, + rootDir, + includeRegexes, + excludeRegexes, + ); } else { - collectFiles(full, files, config, null, _visited); + collectFiles(full, files, config, null, _visited, rootDir, includeRegexes, excludeRegexes); } } else if (EXTENSIONS.has(path.extname(entry.name))) { + if (hasGlobFilters) { + const rel = normalizePath(path.relative(rootDir, full)); + if (!passesIncludeExclude(rel, includeRegexes, excludeRegexes)) continue; + } files.push(full); hasFiles = true; } diff --git a/src/domain/graph/builder/stages/collect-files.ts b/src/domain/graph/builder/stages/collect-files.ts index 73c19441..13f0b6c9 100644 --- a/src/domain/graph/builder/stages/collect-files.ts +++ b/src/domain/graph/builder/stages/collect-files.ts @@ -9,9 +9,10 @@ import fs from 'node:fs'; import path from 'node:path'; import { debug, info } from '../../../../infrastructure/logger.js'; import { normalizePath } from '../../../../shared/constants.js'; +import { compileGlobs } from '../../../../shared/globs.js'; import { readJournal } from '../../journal.js'; import type { PipelineContext } from '../context.js'; -import { collectFiles as collectFilesUtil } from '../helpers.js'; +import { collectFiles as collectFilesUtil, passesIncludeExclude } from '../helpers.js'; /** * Reconstruct allFiles from DB file_hashes + journal deltas. @@ -20,7 +21,7 @@ import { collectFiles as collectFilesUtil } from '../helpers.js'; function tryFastCollect( ctx: PipelineContext, ): { files: string[]; directories: Set } | null { - const { db, rootDir } = ctx; + const { db, rootDir, config } = ctx; const useNative = ctx.engineName === 'native' && !!ctx.nativeDb?.getCollectFilesData; // 1. 
Check that file_hashes table exists and has entries @@ -70,10 +71,20 @@ function tryFastCollect( } } - // 5. Convert to absolute paths and compute directories + // 5. Convert to absolute paths and compute directories, honoring + // config.include / config.exclude globs so incremental builds reflect + // config changes (paths from the DB were collected under older config). + const includeRegexes = compileGlobs(config?.include); + const excludeRegexes = compileGlobs(config?.exclude); + const hasGlobFilters = includeRegexes.length > 0 || excludeRegexes.length > 0; + const files: string[] = []; const directories = new Set(); for (const relPath of fileSet) { + if (hasGlobFilters) { + const normRel = normalizePath(relPath); + if (!passesIncludeExclude(normRel, includeRegexes, excludeRegexes)) continue; + } const absPath = path.join(rootDir, relPath); files.push(absPath); directories.add(path.dirname(absPath)); diff --git a/src/features/boundaries.ts b/src/features/boundaries.ts index c792a284..05f7738f 100644 --- a/src/features/boundaries.ts +++ b/src/features/boundaries.ts @@ -1,34 +1,9 @@ import { isTestFile } from '../infrastructure/test-filter.js'; import { BoundaryError } from '../shared/errors.js'; +import { globToRegex } from '../shared/globs.js'; import type { BetterSqlite3Database } from '../types.js'; -// ─── Glob-to-Regex ─────────────────────────────────────────────────── - -export function globToRegex(pattern: string): RegExp { - let re = ''; - let i = 0; - while (i < pattern.length) { - const ch = pattern[i] as string; - if (ch === '*' && pattern[i + 1] === '*') { - re += '.*'; - i += 2; - if (pattern[i] === '/') i++; - } else if (ch === '*') { - re += '[^/]*'; - i++; - } else if (ch === '?') { - re += '[^/]'; - i++; - } else if (/[.+^${}()|[\]\\]/.test(ch)) { - re += `\\${ch}`; - i++; - } else { - re += ch; - i++; - } - } - return new RegExp(`^${re}$`); -} +export { globToRegex }; // ─── Presets ───────────────────────────────────────────────────────── 
diff --git a/src/shared/globs.ts b/src/shared/globs.ts new file mode 100644 index 00000000..9cd6c786 --- /dev/null +++ b/src/shared/globs.ts @@ -0,0 +1,82 @@ +/** + * Glob → RegExp conversion utilities. + * + * Shared by boundary rules (`features/boundaries.ts`) and the file-collection + * include/exclude filters (`domain/graph/builder/helpers.ts`). Keeping a single + * implementation ensures users get consistent glob semantics everywhere. + * + * Supported syntax: + * - `**` matches any sequence of characters including `/` + * - `*` matches any sequence of characters except `/` + * - `?` matches a single non-slash character + * - other regex metacharacters are escaped literally + * + * Paths must use forward slashes (callers normalize before testing). + */ + +/** + * Compile a glob pattern into a `RegExp` anchored with `^…$`. + */ +export function globToRegex(pattern: string): RegExp { + let re = ''; + let i = 0; + while (i < pattern.length) { + const ch = pattern[i] as string; + if (ch === '*' && pattern[i + 1] === '*') { + i += 2; + if (pattern[i] === '/') { + // `**/` matches zero or more full path segments, preserving the + // directory boundary before the next segment. Without this, patterns + // like `**/foo.ts` would compile to `^.*foo\.ts$` and match + // `barfoo.ts`, diverging from Rust `globset` semantics. + re += '(?:[^/]+/)*'; + i++; + } else { + // Bare `**` (e.g. `dir/**`, or trailing) matches anything. + re += '.*'; + } + } else if (ch === '*') { + re += '[^/]*'; + i++; + } else if (ch === '?') { + re += '[^/]'; + i++; + } else if (/[.+^${}()|[\]\\]/.test(ch)) { + re += `\\${ch}`; + i++; + } else { + re += ch; + i++; + } + } + return new RegExp(`^${re}$`); +} + +/** + * Compile a list of glob patterns. Invalid / empty patterns are skipped. 
+ */ +export function compileGlobs(patterns: readonly string[] | undefined): RegExp[] { + if (!patterns || patterns.length === 0) return []; + const out: RegExp[] = []; + for (const p of patterns) { + if (typeof p !== 'string' || p.length === 0) continue; + try { + out.push(globToRegex(p)); + } catch { + // Ignore malformed patterns rather than failing the whole build. + } + } + return out; +} + +/** + * `true` when at least one compiled pattern matches the given path. + * + * The path must already be normalized to forward slashes. + */ +export function matchesAny(regexes: readonly RegExp[], path: string): boolean { + for (const re of regexes) { + if (re.test(path)) return true; + } + return false; +} diff --git a/tests/integration/config-include-exclude.test.ts b/tests/integration/config-include-exclude.test.ts new file mode 100644 index 00000000..dd3655ee --- /dev/null +++ b/tests/integration/config-include-exclude.test.ts @@ -0,0 +1,181 @@ +/** + * Integration tests for `config.include` / `config.exclude` (issue #981). + * + * Verifies that top-level `include` / `exclude` globs in `.codegraphrc.json` + * actually filter the files included in the build — and that both the native + * Rust engine and the WASM/JS engine honor the filters identically. 
+ */ + +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import Database from 'better-sqlite3'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import { buildGraph } from '../../src/domain/graph/builder.js'; +import { clearConfigCache } from '../../src/infrastructure/config.js'; +import { isNativeAvailable } from '../../src/infrastructure/native.js'; + +const FIXTURE_FILES: Record = { + 'src/math.js': ` +export function add(a, b) { return a + b; } +export function multiply(a, b) { return a * b; } +`.trimStart(), + 'src/util.js': ` +import { add } from './math.js'; +export function doubleSum(a, b) { return add(a, b) + add(a, b); } +`.trimStart(), + 'src/math.test.js': ` +import { add } from './math.js'; +if (add(1, 2) !== 3) throw new Error('math broken'); +`.trimStart(), + 'src/util.spec.js': ` +import { doubleSum } from './util.js'; +if (doubleSum(1, 2) !== 6) throw new Error('util broken'); +`.trimStart(), + 'scratch/notes.js': ` +export const scratch = 42; +`.trimStart(), +}; + +function writeFixture(root: string): void { + for (const [relPath, content] of Object.entries(FIXTURE_FILES)) { + const abs = path.join(root, relPath); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); + } +} + +function readFileRows(dbPath: string): string[] { + const db = new Database(dbPath, { readonly: true }); + try { + // `file_hashes` is the authoritative list of files that actually passed + // collection. `nodes.file` also contains synthetic directory/module rows. 
+ const rows = db.prepare('SELECT file FROM file_hashes ORDER BY file').all() as Array<{ + file: string; + }>; + return rows.map((r) => r.file).sort(); + } finally { + db.close(); + } +} + +type EngineName = 'native' | 'wasm'; + +async function buildWithEngine( + root: string, + engine: EngineName, + config: { include?: string[]; exclude?: string[] }, +): Promise { + fs.writeFileSync(path.join(root, '.codegraphrc.json'), JSON.stringify(config)); + // `loadConfig` caches per cwd — blow the cache so each test's config is + // actually re-read from disk. + clearConfigCache(); + // Wipe DB between runs so file list is authoritative. + const dbDir = path.join(root, '.codegraph'); + if (fs.existsSync(dbDir)) fs.rmSync(dbDir, { recursive: true, force: true }); + await buildGraph(root, { engine, skipRegistry: true }); + const files = readFileRows(path.join(dbDir, 'graph.db')); + // `file_hashes` stores relative paths; normalize slashes so assertions are + // cross-platform. + return files.map((f) => f.replace(/\\/g, '/')).sort(); +} + +describe('config.include / config.exclude (issue #981)', () => { + let tmpDir: string; + let wasmRoot: string; + let nativeRoot: string; + + beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cfg-inc-exc-')); + wasmRoot = fs.mkdtempSync(path.join(tmpDir, 'wasm-')); + nativeRoot = fs.mkdtempSync(path.join(tmpDir, 'native-')); + writeFixture(wasmRoot); + writeFixture(nativeRoot); + }); + + afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + // ── wasm engine ─────────────────────────────────────────────────── + + it('wasm: exclude glob rejects matching files', async () => { + const files = await buildWithEngine(wasmRoot, 'wasm', { + exclude: ['**/*.test.js', '**/*.spec.js'], + }); + expect(files).toContain('src/math.js'); + expect(files).toContain('src/util.js'); + expect(files).not.toContain('src/math.test.js'); + expect(files).not.toContain('src/util.spec.js'); + }); + + 
it('wasm: include glob limits collection to matching files', async () => { + const files = await buildWithEngine(wasmRoot, 'wasm', { + include: ['src/**'], + }); + // scratch/ is outside src/, so nothing from it should be included + expect(files.some((f) => f.startsWith('scratch/'))).toBe(false); + expect(files).toContain('src/math.js'); + }); + + it('wasm: include + exclude combine (include first, exclude trims)', async () => { + const files = await buildWithEngine(wasmRoot, 'wasm', { + include: ['src/**'], + exclude: ['**/*.test.js', '**/*.spec.js'], + }); + expect(files).toContain('src/math.js'); + expect(files).toContain('src/util.js'); + expect(files).not.toContain('src/math.test.js'); + expect(files).not.toContain('src/util.spec.js'); + expect(files.some((f) => f.startsWith('scratch/'))).toBe(false); + }); + + it('wasm: empty include/exclude preserves prior behavior (collects everything supported)', async () => { + const files = await buildWithEngine(wasmRoot, 'wasm', {}); + expect(files).toContain('src/math.js'); + expect(files).toContain('src/math.test.js'); + expect(files).toContain('src/util.spec.js'); + expect(files).toContain('scratch/notes.js'); + }); + + // ── native engine (skipped when not installed) ─────────────────── + + const nativeAvailable = isNativeAvailable(); + const itNative = nativeAvailable ? 
it : it.skip; + + itNative('native: exclude glob rejects matching files', async () => { + const files = await buildWithEngine(nativeRoot, 'native', { + exclude: ['**/*.test.js', '**/*.spec.js'], + }); + expect(files).toContain('src/math.js'); + expect(files).toContain('src/util.js'); + expect(files).not.toContain('src/math.test.js'); + expect(files).not.toContain('src/util.spec.js'); + }); + + itNative('native: include glob limits collection to matching files', async () => { + const files = await buildWithEngine(nativeRoot, 'native', { + include: ['src/**'], + }); + expect(files.some((f) => f.startsWith('scratch/'))).toBe(false); + expect(files).toContain('src/math.js'); + }); + + // ── engine parity ──────────────────────────────────────────────── + + itNative('native + wasm produce identical file sets under include/exclude', async () => { + const parityWasm = fs.mkdtempSync(path.join(tmpDir, 'parity-wasm-')); + const parityNative = fs.mkdtempSync(path.join(tmpDir, 'parity-native-')); + writeFixture(parityWasm); + writeFixture(parityNative); + + const cfg = { + include: ['src/**'], + exclude: ['**/*.test.js', '**/*.spec.js'], + }; + const wasmFiles = await buildWithEngine(parityWasm, 'wasm', cfg); + const nativeFiles = await buildWithEngine(parityNative, 'native', cfg); + // Paths are already relative to each run's own tmpDir so they compare directly. + expect(nativeFiles).toEqual(wasmFiles); + }); +}); diff --git a/tests/unit/boundaries.test.ts b/tests/unit/boundaries.test.ts index 8684d8bc..2b12bd7b 100644 --- a/tests/unit/boundaries.test.ts +++ b/tests/unit/boundaries.test.ts @@ -47,6 +47,25 @@ describe('globToRegex', () => { expect(re.test('foo.test.js')).toBe(true); }); + test('**/ enforces path-component boundary', () => { + // Matches parity with Rust `globset`: `**/index.ts` must NOT match + // `barindex.ts` — the segment before `index.ts` has to be a full + // directory component (or absent). See PR #994 / Greptile review. 
+ const re = globToRegex('**/index.ts'); + expect(re.test('index.ts')).toBe(true); + expect(re.test('src/index.ts')).toBe(true); + expect(re.test('a/b/index.ts')).toBe(true); + expect(re.test('barindex.ts')).toBe(false); + expect(re.test('src/barindex.ts')).toBe(false); + }); + + test('trailing ** (e.g. dir/**) matches anything under dir', () => { + const re = globToRegex('dir/**'); + expect(re.test('dir/a')).toBe(true); + expect(re.test('dir/a/b')).toBe(true); + expect(re.test('other/a')).toBe(false); + }); + test('exact path match', () => { const re = globToRegex('src/controllers/main.js'); expect(re.test('src/controllers/main.js')).toBe(true);