diff --git a/crates/pcr-cli/Cargo.toml b/crates/pcr-cli/Cargo.toml index 531f36c..fb5bce6 100644 --- a/crates/pcr-cli/Cargo.toml +++ b/crates/pcr-cli/Cargo.toml @@ -23,3 +23,7 @@ serde_json = { workspace = true } # Already a workspace dep — reused here so the migration smoke test can # build a v1-schema fixture DB and re-query it after migrations run. rusqlite = { workspace = true } +# SIGINT delivery for the graceful-shutdown integration test (Unix-only, +# guarded by `#![cfg(unix)]` in the test file). +[target.'cfg(unix)'.dev-dependencies] +libc = "0.2" diff --git a/crates/pcr-cli/tests/start_graceful_shutdown.rs b/crates/pcr-cli/tests/start_graceful_shutdown.rs new file mode 100644 index 0000000..b7705e9 --- /dev/null +++ b/crates/pcr-cli/tests/start_graceful_shutdown.rs @@ -0,0 +1,103 @@ +//! Integration test for the audit's task 1: `pcr start` must shut +//! down gracefully on SIGINT. +//! +//! Before this fix the source watcher threads ran a bare `loop { +//! sleep; scan() }` with no cooperative shutdown signal. Ctrl-C +//! killed the process mid-scan and the PID file was racey to remove. +//! Now `pcr start` installs a Ctrl-C handler that flips a process- +//! wide `crate::shutdown` flag; scan loops poll it at the top of +//! every iteration; `wait_for_shutdown` returns; the `PidFileGuard` +//! drop removes the PID file before the binary exits cleanly with +//! code 0. +//! +//! Unix-only — `nix`/`libc::kill(pid, SIGINT)` is the natural way to +//! send the signal, and the `ctrlc` crate's Windows path is a +//! console-mode handler that doesn't translate to a clean test +//! shape. Windows ungraceful-shutdown is a separate concern. + +#![cfg(unix)] + +mod common; + +use std::time::{Duration, Instant}; + +use common::home_fixture; + +#[test] +fn pcr_start_exits_zero_and_removes_pid_file_on_sigint() { + let fx = home_fixture(); + let pid_file = fx.pcr_dir().join("watcher.pid"); + assert!(!pid_file.exists(), "fresh fixture has no PID file yet"); + + // Spawn `pcr start --plain` headless in the fixture's $HOME. We + // need a `std::process::Child` (not `assert_cmd::Command`) so we + // can grab the PID and send a signal to it directly. + let bin = assert_cmd::cargo::cargo_bin("pcr"); + let mut child = std::process::Command::new(&bin) + .arg("start") + .env("HOME", fx.home_path()) + .env("USERPROFILE", fx.home_path()) + // Disable TUI: `is_tui_eligible` already returns false because + // stderr is piped, but set NO_COLOR for belt-and-braces in case + // the binary's env carries a real CI/terminal value through. + .env("NO_COLOR", "1") + .env_remove("CI") + .env_remove("CURSOR_AGENT") + .current_dir(fx.cwd_path()) + .stdin(std::process::Stdio::null()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .expect("spawn pcr start"); + + // Wait for the PID file to appear, capped at 10 s. The binary + // writes it before `wait_for_shutdown()` so it's the simplest + // readiness signal — without it, we might send SIGINT during + // startup, before the ctrlc handler is installed. + let pid = child.id() as i32; + let deadline = Instant::now() + Duration::from_secs(10); + while !pid_file.exists() { + if Instant::now() >= deadline { + let _ = child.kill(); + panic!("PID file never appeared at {pid_file:?} — pcr start may have crashed"); + } + std::thread::sleep(Duration::from_millis(50)); + } + + // Send SIGINT, exactly as `kill -INT ` would. + // SAFETY: `pid` is the child we just spawned; kill(2) with SIGINT + // is safe and has no other observable side effects in this test. + let rc = unsafe { libc::kill(pid, libc::SIGINT) }; + assert_eq!( + rc, + 0, + "kill(2) syscall failed: {}", + std::io::Error::last_os_error() + ); + + // Wait for graceful exit. 15 s gives the 200 ms ctrlc-poll loop + + // any scan-in-progress + final flush plenty of headroom. + let exit_deadline = Instant::now() + Duration::from_secs(15); + let status = loop { + match child.try_wait().expect("try_wait") { + Some(s) => break s, + None => { + if Instant::now() >= exit_deadline { + let _ = child.kill(); + panic!("pcr start did not exit within 15s after SIGINT — graceful shutdown is broken"); + } + std::thread::sleep(Duration::from_millis(50)); + } + } + }; + + assert!( + status.success(), + "expected exit code 0, got {status:?} — SIGINT must produce a clean exit" + ); + assert!( + !pid_file.exists(), + "PidFileGuard::drop must have removed {pid_file:?}; otherwise a stale PID file \ + confuses the next `pcr start` into prompting about an already-dead watcher" + ); +} diff --git a/crates/pcr-core/src/auth.rs b/crates/pcr-core/src/auth.rs index 9dfdd6b..87133c6 100644 --- a/crates/pcr-core/src/auth.rs +++ b/crates/pcr-core/src/auth.rs @@ -16,21 +16,21 @@ pub struct Auth { pub user_id: String, } -fn auth_file_path() -> PathBuf { - config::pcr_dir().join("auth.json") +fn auth_file_path() -> anyhow::Result { + Ok(config::pcr_dir()?.join("auth.json")) } /// Load the saved auth credentials. Returns `None` if the file doesn't -/// exist or can't be parsed — mirrors the Go `Load() *Auth` behavior which -/// returns `nil` on any error. +/// exist, can't be parsed, or if `$HOME` couldn't be resolved — mirrors +/// the Go `Load() *Auth` behavior which returns `nil` on any error. pub fn load() -> Option { - let data = fs::read(auth_file_path()).ok()?; + let data = fs::read(auth_file_path().ok()?).ok()?; serde_json::from_slice(&data).ok() } /// Persist auth credentials to disk with 0600 permissions. pub fn save(auth: &Auth) -> anyhow::Result<()> { - let path = auth_file_path(); + let path = auth_file_path()?; if let Some(parent) = path.parent() { fs::create_dir_all(parent)?; } @@ -46,7 +46,11 @@ pub fn save(auth: &Auth) -> anyhow::Result<()> { Ok(()) } -/// Clear saved credentials. Silently succeeds if no file exists. +/// Clear saved credentials. Silently succeeds if no file exists. If +/// `$HOME` can't be resolved there's nothing to clear in the first +/// place, so we still no-op rather than surfacing an error. pub fn clear() { - let _ = fs::remove_file(auth_file_path()); + if let Ok(p) = auth_file_path() { + let _ = fs::remove_file(p); + } } diff --git a/crates/pcr-core/src/commands/hook.rs b/crates/pcr-core/src/commands/hook.rs index 31a558d..130b6fb 100644 --- a/crates/pcr-core/src/commands/hook.rs +++ b/crates/pcr-core/src/commands/hook.rs @@ -7,8 +7,13 @@ use crate::exit::ExitCode; use crate::sources::claudecode::hook::run_hook as run_claude_hook; pub fn run(_mode: OutputMode) -> ExitCode { - // Only act if `pcr start` is currently running. - if read_existing_pid(&pid_file_path()).is_none() { + // Only act if `pcr start` is currently running. If `$HOME` + // can't even be resolved we can't have a live watcher anyway — + // exit 0 so the hook doesn't re-engage the model. + let Ok(path) = pid_file_path() else { + return ExitCode::Success; + }; + if read_existing_pid(&path).is_none() { return ExitCode::Success; } let ctx = project_context::resolve(); diff --git a/crates/pcr-core/src/commands/log.rs b/crates/pcr-core/src/commands/log.rs index 176f865..db1c9d2 100644 --- a/crates/pcr-core/src/commands/log.rs +++ b/crates/pcr-core/src/commands/log.rs @@ -32,8 +32,14 @@ fn short_sha(sha: &str) -> String { if sha.starts_with("manual-") { return "[manual]".to_string(); } - if sha.len() >= 7 { - return sha[..7].to_string(); + // Real SHAs are hex (ASCII), but `short_sha` is called on whatever + // sha string the store handed us — including the "manual-*" + // sentinels and any future non-ASCII tags. `chars().take(7)` + // matches the truncate-on-char-boundary fix in `util::text` and + // never panics on multi-byte input. + let short: String = sha.chars().take(7).collect(); + if short.chars().count() == 7 { + return short; } sha.to_string() } diff --git a/crates/pcr-core/src/commands/start.rs b/crates/pcr-core/src/commands/start.rs index 93f6501..eca7ec7 100644 --- a/crates/pcr-core/src/commands/start.rs +++ b/crates/pcr-core/src/commands/start.rs @@ -1,8 +1,6 @@ //! `pcr start`. Mirrors `cli/cmd/start.go`. -use std::path::PathBuf; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; +use std::path::{Path, PathBuf}; use std::thread; use std::time::Duration; @@ -12,13 +10,14 @@ use crate::display; use crate::entry::StartArgs; use crate::exit::ExitCode; use crate::projects; +use crate::shutdown; use crate::sources; -pub fn pid_file_path() -> PathBuf { - config::pcr_dir().join("watcher.pid") +pub fn pid_file_path() -> anyhow::Result { + Ok(config::pcr_dir()?.join("watcher.pid")) } -pub fn read_existing_pid(pid_file: &PathBuf) -> Option { +pub fn read_existing_pid(pid_file: &Path) -> Option { let data = std::fs::read_to_string(pid_file).ok()?; let pid: i32 = data.trim().parse().ok()?; #[cfg(unix)] @@ -44,7 +43,27 @@ pub fn read_existing_pid(pid_file: &PathBuf) -> Option { } pub fn run(mode: OutputMode, args: StartArgs) -> ExitCode { - let pid_file = pid_file_path(); + // Install the Ctrl-C handler before doing any setup work (PID + // file write, watcher spawn, etc.). The default SIGINT handler + // would otherwise kill the process during that window — racing + // with PID file cleanup and exiting with `W_TERMSIG(SIGINT)` + // instead of the clean code 0 the integration test asserts on. + // + // `set_handler` returns Err if a handler is already installed + // (re-entry in tests via the same process); the existing + // handler already routes to `request_shutdown`, so ignore Err. + let _ = ctrlc::set_handler(shutdown::request_shutdown); + + let pid_file = match pid_file_path() { + Ok(p) => p, + Err(e) => { + // Surface the $HOME-missing case immediately rather + // than letting downstream singletons (`store::open()`, + // etc.) panic with the same error mid-watcher-spawn. + display::print_error("start", &format!("{e}")); + return ExitCode::GenericError; + } + }; if let Some(pid) = read_existing_pid(&pid_file) { if !agent::is_interactive_terminal() { @@ -208,12 +227,14 @@ fn spawn_all_sources() -> Vec> { } fn wait_for_shutdown() { - let flag = Arc::new(AtomicBool::new(false)); - let flag_handler = flag.clone(); - let _ = ctrlc::set_handler(move || { - flag_handler.store(true, Ordering::SeqCst); - }); - while !flag.load(Ordering::SeqCst) { + // The ctrlc handler is installed at the top of `run()` so the + // SIGINT-arrives-before-watchers-are-ready window doesn't kill + // the process with the default signal handler. Here we just + // park the main thread until the handler flips the flag (which + // every long-running scan loop in `crate::shutdown` is also + // polling). PID file cleanup runs via `PidFileGuard::drop` — + // `remove_file` is wrapped in `let _ =` so it's idempotent. + while !shutdown::is_shutting_down() { thread::sleep(Duration::from_millis(200)); } } diff --git a/crates/pcr-core/src/config.rs b/crates/pcr-core/src/config.rs index 8c38976..66503cd 100644 --- a/crates/pcr-core/src/config.rs +++ b/crates/pcr-core/src/config.rs @@ -1,5 +1,7 @@ //! Compile-time constants. Mirrors `cli/internal/config/constants.go`. +use std::path::PathBuf; + /// Supabase project URL used for all RPC calls. pub const SUPABASE_URL: &str = "https://icbsvwffcykzimjjonad.supabase.co"; @@ -12,9 +14,90 @@ pub const APP_URL: &str = "https://pcr.dev"; /// Name of the per-user data directory under `$HOME`. pub const PCR_DIR: &str = ".pcr-dev"; -/// Returns the absolute path of `$HOME/.pcr-dev`, creating nothing on disk. -pub fn pcr_dir() -> std::path::PathBuf { - dirs::home_dir() - .unwrap_or_else(std::env::temp_dir) - .join(PCR_DIR) +/// Returns the absolute path of `$HOME/.pcr-dev`, creating nothing on +/// disk. +/// +/// Previously this fell back to `std::env::temp_dir()` when +/// `dirs::home_dir()` returned `None` — a silent failure mode that +/// wrote auth credentials and the SQLite store to `/tmp`, where they +/// vanished on reboot. The new contract: callers see the error and +/// can decide whether to fail loudly (CLI entry points) or skip +/// soft-state writes (background loops). No silent fallback ever. +/// +/// On well-configured Unix systems and on Windows with `USERPROFILE` +/// set this returns `Ok(...)` essentially unconditionally; the +/// `Err` path only fires inside sandboxes / containers / cron jobs +/// where neither `HOME` nor `USERPROFILE` is set. +pub fn pcr_dir() -> anyhow::Result { + let home = dirs::home_dir().ok_or_else(|| { + anyhow::anyhow!( + "could not determine $HOME (or %USERPROFILE% on Windows). PCR refuses to \ + silently fall back to a temp directory because that would lose auth + \ + local drafts on reboot. Set HOME and re-run." + ) + })?; + Ok(home.join(PCR_DIR)) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// `dirs::home_dir()` reads `HOME` (Unix) / `USERPROFILE` (Win) + /// from the env at call time, so we can exercise the error + /// path by removing both before calling. Restore them before + /// the test ends so the rest of the test binary stays sane. + /// + /// Skipped on platforms where `dirs::home_dir()` falls back to + /// something else (e.g. `getpwuid_r` on Unix): if the + /// unset-everything-snapshot still resolves, we can't drive + /// the error branch from a test, but the production code is + /// still strictly safer than the previous `unwrap_or_else( + /// env::temp_dir)`. + #[test] + fn pcr_dir_returns_err_when_home_is_unresolvable() { + let prev_home = std::env::var_os("HOME"); + let prev_userprofile = std::env::var_os("USERPROFILE"); + // SAFETY: this test process is single-threaded inside its + // own binary (the suite spawns each #[test] sequentially + // by default), and we restore the env before returning. + unsafe { + std::env::remove_var("HOME"); + std::env::remove_var("USERPROFILE"); + } + + let result = pcr_dir(); + + // Restore before any assertion can panic. + unsafe { + if let Some(v) = prev_home { + std::env::set_var("HOME", v); + } + if let Some(v) = prev_userprofile { + std::env::set_var("USERPROFILE", v); + } + } + + if dirs::home_dir().is_some() { + eprintln!( + "skipping: dirs::home_dir() still resolves with HOME/USERPROFILE unset \ + (likely getpwuid_r on Unix); error branch isn't reachable from a test \ + on this platform" + ); + return; + } + let err = match result { + Ok(p) => panic!( + "expected an error when HOME is unset; got Ok({}). The previous \ + implementation would silently fall back to /tmp here — that's exactly \ + the regression we're guarding against.", + p.display(), + ), + Err(e) => e.to_string(), + }; + assert!( + err.contains("$HOME") || err.to_lowercase().contains("home"), + "error message must mention $HOME so users know what to fix; got: {err}" + ); + } } diff --git a/crates/pcr-core/src/lib.rs b/crates/pcr-core/src/lib.rs index eae4f84..1a9bf42 100644 --- a/crates/pcr-core/src/lib.rs +++ b/crates/pcr-core/src/lib.rs @@ -35,6 +35,7 @@ pub mod exit; pub mod help; pub mod mcp; pub mod projects; +pub mod shutdown; pub mod sources; pub mod store; pub mod supabase; diff --git a/crates/pcr-core/src/projects.rs b/crates/pcr-core/src/projects.rs index 71ac544..9d38ad5 100644 --- a/crates/pcr-core/src/projects.rs +++ b/crates/pcr-core/src/projects.rs @@ -36,15 +36,24 @@ struct Registry { projects: Vec, } -fn file_path() -> PathBuf { - config::pcr_dir().join("projects.json") +fn file_path() -> anyhow::Result { + Ok(config::pcr_dir()?.join("projects.json")) } /// Registered project list. Cached per-process; the cache is reused /// while `projects.json`'s mtime is unchanged so the high-frequency /// watchers don't re-parse it on every poll. +/// +/// If `$HOME` can't be resolved we return an empty list — this is a +/// hot path (each watcher poll calls it) and silently degrading to +/// "no projects registered" matches the existing best-effort +/// behaviour of read_from_disk on a missing file. The user-facing +/// commands that actually need projects (`init`, `pcr start` setup, +/// etc.) re-check via `file_path()?` and surface the error. pub fn load() -> Vec { - let path = file_path(); + let Ok(path) = file_path() else { + return Vec::new(); + }; let mtime = mtime_of(&path); if let Some(cached) = lookup_cache(mtime) { return cached; @@ -65,7 +74,7 @@ fn read_from_disk(path: &Path) -> Vec { } fn save(projects: &[Project]) -> anyhow::Result<()> { - let path = file_path(); + let path = file_path()?; if let Some(parent) = path.parent() { fs::create_dir_all(parent)?; } diff --git a/crates/pcr-core/src/shutdown.rs b/crates/pcr-core/src/shutdown.rs new file mode 100644 index 0000000..f082100 --- /dev/null +++ b/crates/pcr-core/src/shutdown.rs @@ -0,0 +1,104 @@ +//! Process-wide cooperative shutdown signal. +//! +//! `pcr start` installs a Ctrl-C handler in +//! [`crate::commands::start::wait_for_shutdown`] that calls +//! [`request_shutdown`]. Long-running scan loops in the source watchers +//! poll [`is_shutting_down`] at the top of each iteration so they unwind +//! cooperatively instead of being torn down mid-scan when the process +//! exits. The PID file is then cleaned up via the `PidFileGuard` Drop +//! impl, which is idempotent (a `let _ =` on `remove_file` — no panic +//! if the file is already gone, e.g. because a prior `pcr start` was +//! replaced). +//! +//! Long sleeps (e.g. the 20 s cursor poll loop) should call +//! [`sleep_unless_shutdown`] instead of `thread::sleep` so Ctrl-C is +//! observable within ~200 ms rather than only at the next scan boundary. + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::thread; +use std::time::Duration; + +static SHUTDOWN: AtomicBool = AtomicBool::new(false); + +/// Returns true once `pcr start` has been asked to terminate. Cheap to +/// poll — single relaxed-ish atomic load. +pub fn is_shutting_down() -> bool { + SHUTDOWN.load(Ordering::SeqCst) +} + +/// Flip the shutdown flag. Called by the Ctrl-C handler. Idempotent — +/// callers can invoke this multiple times safely. +pub fn request_shutdown() { + SHUTDOWN.store(true, Ordering::SeqCst); +} + +/// Sleep for up to `duration`, broken into 200 ms slices so Ctrl-C is +/// noticed promptly. Returns `false` the moment the shutdown flag +/// flips so callers can break out of their loop without first +/// completing the next scan iteration. +pub fn sleep_unless_shutdown(duration: Duration) -> bool { + let slice = Duration::from_millis(200); + let mut remaining = duration; + while remaining > Duration::ZERO { + if is_shutting_down() { + return false; + } + let step = remaining.min(slice); + thread::sleep(step); + remaining = remaining.saturating_sub(step); + } + !is_shutting_down() +} + +#[cfg(test)] +mod tests { + use super::*; + + // The flag is a process-wide static. Tests inside the same binary + // share it, so we serialize via a mutex and reset between cases + // — otherwise an interleaved run would see the flag set by a + // previous test. + fn lock() -> std::sync::MutexGuard<'static, ()> { + static M: std::sync::Mutex<()> = std::sync::Mutex::new(()); + M.lock().unwrap_or_else(|e| e.into_inner()) + } + + fn reset() { + SHUTDOWN.store(false, Ordering::SeqCst); + } + + #[test] + fn sleep_returns_quickly_when_shutdown_requested() { + let _g = lock(); + reset(); + let handle = thread::spawn(|| { + thread::sleep(Duration::from_millis(50)); + request_shutdown(); + }); + let start = std::time::Instant::now(); + let completed_full = sleep_unless_shutdown(Duration::from_secs(10)); + let elapsed = start.elapsed(); + handle.join().unwrap(); + + assert!( + !completed_full, + "must report early-exit when shutdown was requested" + ); + assert!( + elapsed < Duration::from_millis(500), + "must break out within ~one slice of the request, not wait \ + the full 10 s; elapsed = {elapsed:?}" + ); + } + + #[test] + fn sleep_completes_when_no_shutdown() { + let _g = lock(); + reset(); + let completed_full = sleep_unless_shutdown(Duration::from_millis(50)); + assert!( + completed_full, + "no shutdown signal → sleep must complete fully" + ); + } +} diff --git a/crates/pcr-core/src/sources/claudecode/watcher.rs b/crates/pcr-core/src/sources/claudecode/watcher.rs index 1e1798b..f30d667 100644 --- a/crates/pcr-core/src/sources/claudecode/watcher.rs +++ b/crates/pcr-core/src/sources/claudecode/watcher.rs @@ -185,12 +185,19 @@ pub fn process_file( if !force_full_scan && line_count <= prev_count { return; } - state.set(file_path, line_count); let session = parse_claude_code_session(&content, &project_name, file_path); if session.prompts.is_empty() { + // Parse extracted nothing usable. Leave the state cursor alone so + // the next scan re-tries this file — otherwise a transient parse + // failure (corrupt JSONL, truncated write, schema drift) would + // silently advance the line count and stay invisible until a + // full rescan, which never happens in the steady-state watcher. return; } + // Parse produced prompts. Now it's safe to advance the state cursor + // so the next scan only considers freshly appended lines. + state.set(file_path, line_count); let schema_v = versions::CAPTURE_SCHEMA_VERSION; let mut base_file_context = serde_json::Map::new(); diff --git a/crates/pcr-core/src/sources/cursor/diff_tracker.rs b/crates/pcr-core/src/sources/cursor/diff_tracker.rs index 69a93d0..d79d0f2 100644 --- a/crates/pcr-core/src/sources/cursor/diff_tracker.rs +++ b/crates/pcr-core/src/sources/cursor/diff_tracker.rs @@ -129,8 +129,14 @@ impl DiffTracker { // Discard any diff events older than our start time — they came from // a previous run. let _ = store::prune_diff_events(self.started_at); - loop { - std::thread::sleep(self.poll_interval); + // Cooperative shutdown: see `crate::shutdown` — the tick stays + // 3 s (or whatever was configured) but the sleep is sliced so + // Ctrl-C lands within ~200 ms instead of waiting for the next + // tick boundary. + while crate::shutdown::sleep_unless_shutdown(self.poll_interval) { + if crate::shutdown::is_shutting_down() { + break; + } self.poll(); } } @@ -138,12 +144,19 @@ impl DiffTracker { // ─── State persistence ─────────────────────────────────────────────────────── -fn state_path() -> PathBuf { - config::pcr_dir().join("diff-tracker-state.json") +fn state_path() -> Option { + // Soft-state path: if `$HOME` can't be resolved, we just skip + // persistence entirely. The diff tracker still works in-memory + // — losing the previous-state snapshot means the next session + // starts fresh, which is the same behaviour as a first run. + Some(config::pcr_dir().ok()?.join("diff-tracker-state.json")) } fn load_state(inner: &mut Inner) { - let Ok(bytes) = std::fs::read(state_path()) else { + let Some(path) = state_path() else { + return; + }; + let Ok(bytes) = std::fs::read(path) else { return; }; if let Ok(loaded) = serde_json::from_slice::>>(&bytes) { @@ -152,6 +165,9 @@ fn load_state(inner: &mut Inner) { } fn save_state(inner: &Arc>) { + let Some(path) = state_path() else { + return; + }; let Ok(guard) = inner.lock() else { return; }; @@ -160,10 +176,10 @@ fn save_state(inner: &Arc>) { let Ok(bytes) = serde_json::to_vec(&snapshot) else { return; }; - if let Some(parent) = state_path().parent() { + if let Some(parent) = path.parent() { let _ = std::fs::create_dir_all(parent); } - let _ = std::fs::write(state_path(), bytes); + let _ = std::fs::write(&path, bytes); } /// Relative paths that changed between two dirty-file snapshots. diff --git a/crates/pcr-core/src/sources/cursor/session_state_watcher.rs b/crates/pcr-core/src/sources/cursor/session_state_watcher.rs index 922fbf9..74708ab 100644 --- a/crates/pcr-core/src/sources/cursor/session_state_watcher.rs +++ b/crates/pcr-core/src/sources/cursor/session_state_watcher.rs @@ -30,8 +30,15 @@ impl SessionStateWatcher { /// Run the 2-second polling loop. Call in a dedicated thread. pub fn run_blocking(mut self) { - loop { - std::thread::sleep(Duration::from_secs(2)); + // `sleep_unless_shutdown` yields within ~200 ms of a Ctrl-C + // so the watcher thread exits before the main `pcr start` + // returns (and well before the OS tears the process down). + // Without this, the poll could be torn down mid-iteration + // and lose an in-flight `record_session_state_event` write. + while crate::shutdown::sleep_unless_shutdown(Duration::from_secs(2)) { + if crate::shutdown::is_shutting_down() { + break; + } self.poll(); } } @@ -61,7 +68,12 @@ impl SessionStateWatcher { ..Default::default() }); if let Some(prev) = prev { - let short = &row.composer_id[..row.composer_id.len().min(8)]; + // Composer IDs are UUIDs in practice (ASCII), but we treat + // them as untrusted external text — byte-slice on a non- + // self-allocated string panics if a char boundary falls in + // the middle, matching the truncate-on-char-boundary fix + // applied elsewhere (`util::text::truncate`). + let short: String = row.composer_id.chars().take(8).collect(); if prev.unified_mode != snap.unified_mode && !snap.unified_mode.is_empty() { display::print_verbose_event( "session", diff --git a/crates/pcr-core/src/sources/cursor/watcher.rs b/crates/pcr-core/src/sources/cursor/watcher.rs index b5a15d0..b8676bc 100644 --- a/crates/pcr-core/src/sources/cursor/watcher.rs +++ b/crates/pcr-core/src/sources/cursor/watcher.rs @@ -3,12 +3,12 @@ use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; use notify::{EventKind, RecursiveMode, Watcher as NotifyWatcher}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::path::{Path, PathBuf}; use std::sync::mpsc; use std::sync::{Arc, Mutex}; use std::thread; -use std::time::{Duration, Instant}; +use std::time::{Duration, Instant, SystemTime}; use walkdir::WalkDir; use crate::display; @@ -32,6 +32,25 @@ pub struct PromptScanner { diff_tracker: Option>, seen: Arc>>, initial_scan: Arc>, + /// Per-file mtime cache from the last completed walk. The + /// periodic scan path skips any file whose mtime is still + /// equal to the cached value — `process_session` is a no-op on + /// already-processed bubbles via the dedup hash, but for + /// hundreds of stable transcripts the open + walk + JSON + /// re-fetch cost dominates a `~/.cursor/projects/` tree. See + /// `collect_changed_transcripts` for the fast-path / per-file + /// skip logic. + mtime_cache: Arc>>, + /// Top-level dir mtime from the last walk. A new project + /// subdirectory always changes the parent's mtime, so we can + /// shortcut "no new projects + no notify activity" into a + /// no-op without walking anything. + last_dir_mtime: Arc>>, + /// Set by the `notify` fast-path the moment a transcript-shaped + /// path mutates; consumed (cleared) at the top of the next + /// scan so the periodic safety-net loop knows whether anything + /// happened since it last walked. + notify_event_pending: Arc>, } impl PromptScanner { @@ -42,6 +61,9 @@ impl PromptScanner { diff_tracker, seen: Arc::new(Mutex::new(HashSet::new())), initial_scan: Arc::new(Mutex::new(true)), + mtime_cache: Arc::new(Mutex::new(HashMap::new())), + last_dir_mtime: Arc::new(Mutex::new(None)), + notify_event_pending: Arc::new(Mutex::new(false)), } } @@ -52,25 +74,114 @@ impl PromptScanner { } else { display::print_watcher_ready("Cursor", &self.dir.display().to_string()); } - // Initial silent scan. - self.scan(); + // Initial silent scan. Force the walk past the fast-path + // skip — the caches start empty so we need a full + // enumeration to populate them anyway. + self.scan_inner(true); if let Ok(mut flag) = self.initial_scan.lock() { *flag = false; } // Kick off fsnotify fast-path in a thread. let s2 = self.clone(); thread::spawn(move || s2.watch_fsnotify()); - // Periodic 20-second scan. - loop { - thread::sleep(Duration::from_secs(20)); - self.scan(); + // Periodic safety-net walk. `notify` is the primary signal + // for new transcripts (sub-second after a debounce); this + // loop only catches notify-misses, so 60 s is plenty — + // bumped from the previous 20 s after the audit flagged + // the CPU/disk cost of full WalkDir traversals over + // `~/.cursor/projects/` every 20 s at scale. With the + // mtime cache the per-iteration cost is also much lower: + // a single `stat` on the top-level dir plus only + // `stat + parse` on files whose mtime advanced since the + // last walk. + while crate::shutdown::sleep_unless_shutdown(Duration::from_secs(60)) { + if crate::shutdown::is_shutting_down() { + break; + } + self.scan_inner(false); } } + /// Public entry — calls into the shared scan path, letting the + /// fast-path / per-file cache decide whether work needs to + /// happen. Used by `watch_fsnotify` and by `force_sync` so a + /// notify hit always walks (the event-pending flag is set + /// before calling). fn scan(&self) { + self.scan_inner(false); + } + + /// Inner scan with explicit force flag. `force = true` bypasses + /// the dir-mtime / event-pending fast-path skip (used on the + /// initial scan, when the caches are guaranteed empty). + fn scan_inner(&self, force: bool) { if let Some(dt) = &self.diff_tracker { dt.poll(); } + let Some(changed) = self.collect_changed_transcripts(force) else { + return; + }; + for path in changed { + let Some((project_slug, session_id)) = parse_transcript_path(&path) else { + continue; + }; + self.process_session(&project_slug, &session_id); + } + } + + /// Enumerate transcripts that need reprocessing this tick. Returns + /// `None` when the caller should skip the walk entirely (the + /// fast-path: top-level dir mtime unchanged AND no notify event + /// has landed since the last walk). Otherwise returns the subset + /// of transcripts whose per-file mtime moved since the last walk + /// — `process_session` is a no-op on unchanged content thanks to + /// the `seen` dedup, but the stat-loop savings dominate on + /// directories with hundreds of stable transcripts. + /// + /// Always populates `mtime_cache` with the full current snapshot + /// when a walk runs, so the next call has an accurate baseline. + fn collect_changed_transcripts(&self, force: bool) -> Option> { + let dir_mtime = std::fs::metadata(&self.dir) + .ok() + .and_then(|m| m.modified().ok()); + let prev_dir_mtime = self.last_dir_mtime.lock().ok().and_then(|g| *g); + + // Consume the notify-event flag — whether or not we end up + // walking, the flag should reset, since "walk performed" + // observes everything notify could have hinted at. + let notify_event = { + let mut guard = self.notify_event_pending.lock().ok()?; + let was_set = *guard; + *guard = false; + was_set + }; + + // Top-level mtime updates only when a new project subdir is + // created/removed. Within an existing project, new sessions + // don't touch the parent's mtime, so we need the notify + // signal to cover that path. When both signals are quiet + // and we're not forced, nothing can have changed since the + // last walk → skip. + let dir_changed = dir_mtime.is_some() && dir_mtime != prev_dir_mtime; + if !force && !dir_changed && !notify_event { + return None; + } + + if let Ok(mut guard) = self.last_dir_mtime.lock() { + *guard = dir_mtime; + } + + let mut new_cache: HashMap = HashMap::new(); + let mut changed: Vec = Vec::new(); + + // Snapshot the previous cache outside the walk so we hold + // the lock for as little time as possible. + let prev_cache: HashMap = self + .mtime_cache + .lock() + .map(|g| g.clone()) + .unwrap_or_default(); + for entry in WalkDir::new(&self.dir).into_iter().filter_map(|e| e.ok()) { if !entry.file_type().is_file() { continue; @@ -79,11 +190,22 @@ impl PromptScanner { if !is_agent_transcript(path) { continue; } - let Some((project_slug, session_id)) = parse_transcript_path(path) else { - continue; - }; - self.process_session(&project_slug, &session_id); + let mtime = entry.metadata().ok().and_then(|m| m.modified().ok()); + if let Some(m) = mtime { + new_cache.insert(path.to_path_buf(), m); + if prev_cache.get(path).copied() == Some(m) { + // Same mtime as last walk → no new bubbles to + // observe. Skip the per-session work entirely. + continue; + } + } + changed.push(path.to_path_buf()); } + + if let Ok(mut guard) = self.mtime_cache.lock() { + *guard = new_cache; + } + Some(changed) } fn process_session(&self, project_slug: &str, session_id: &str) { @@ -563,6 +685,15 @@ impl PromptScanner { } for p in &event.paths { if is_agent_transcript(p) { + // Signal the periodic scan path that a notify + // event has landed since the last walk, so the + // next periodic tick doesn't short-circuit + // even if the top-level dir mtime didn't move + // (new bubbles in an existing session don't + // change `~/.cursor/projects/`'s mtime). + if let Ok(mut guard) = self.notify_event_pending.lock() { + *guard = true; + } if let Ok(mut guard) = debounce_fire.lock() { *guard = Some(Instant::now() + Duration::from_millis(500)); } @@ -800,6 +931,156 @@ mod tests { use super::*; use chrono::TimeZone; use std::path::PathBuf; + use std::time::SystemTime; + + /// Build a temp `~/.cursor/projects/`-shaped tree with one or + /// more transcripts. Returns the temp dir (held for lifetime) + /// and the absolute paths of the created transcripts. + fn make_transcript_tree(specs: &[(&str, &str)]) -> (tempfile::TempDir, Vec) { + let tmp = tempfile::TempDir::new().expect("tempdir"); + let mut paths: Vec = Vec::new(); + for (slug, sid) in specs { + let dir = tmp.path().join(slug).join("agent-transcripts").join(sid); + std::fs::create_dir_all(&dir).expect("mkdir transcripts"); + let p = dir.join(format!("{sid}.jsonl")); + std::fs::write(&p, b"placeholder\n").expect("write transcript"); + paths.push(p); + } + (tmp, paths) + } + + /// Bump the mtime of `path` to "now + 1s" so coarse-resolution + /// filesystems (ext4 with 1 s mtime quantization, HFS+, etc.) + /// still distinguish "previous walk" from "after this poke". + fn poke_mtime(path: &Path) { + // Re-write so the kernel updates mtime — `filetime` would + // be cleaner but we don't ship it; this avoids a dep. + std::thread::sleep(std::time::Duration::from_millis(1100)); + let mut contents = std::fs::read(path).expect("read"); + contents.push(b'\n'); + std::fs::write(path, contents).expect("rewrite"); + } + + fn scanner_for(dir: PathBuf) -> PromptScanner { + PromptScanner::new(dir, String::new(), None) + } + + #[test] + fn collect_pending_initial_walk_returns_all_transcripts() { + let (tmp, paths) = make_transcript_tree(&[("proj-a", "sid-a"), ("proj-b", "sid-b")]); + let s = scanner_for(tmp.path().to_path_buf()); + + // Force=true mirrors `start()`'s initial scan. + let pending = s + .collect_changed_transcripts(true) + .expect("initial walk must run"); + assert_eq!(pending.len(), paths.len(), "every transcript pending"); + for p in &paths { + assert!(pending.contains(p), "missing path {p:?}"); + } + let cache_size = s.mtime_cache.lock().unwrap().len(); + assert_eq!(cache_size, paths.len(), "cache populated after walk"); + } + + #[test] + fn collect_pending_fast_path_skips_when_nothing_changed() { + let (tmp, _) = make_transcript_tree(&[("proj-x", "sid-x")]); + let s = scanner_for(tmp.path().to_path_buf()); + + // Populate caches. + s.collect_changed_transcripts(true).expect("initial walk"); + + // Second poll: dir mtime hasn't moved, no notify event has + // been signalled — fast-path returns None so the periodic + // safety-net doesn't redo work the notify path would have + // already handled. + let pending = s.collect_changed_transcripts(false); + assert!( + pending.is_none(), + "fast-path must skip when neither the dir mtime nor a notify event \ + indicates anything has changed; got {pending:?}" + ); + } + + #[test] + fn collect_pending_walks_when_notify_event_signalled() { + let (tmp, _) = make_transcript_tree(&[("proj-y", "sid-y")]); + let s = scanner_for(tmp.path().to_path_buf()); + + s.collect_changed_transcripts(true).expect("initial walk"); + + // Simulate the notify fast-path firing — bubble write inside + // an existing session doesn't bump the top-level dir mtime, + // so the only signal the periodic loop has is this flag. + *s.notify_event_pending.lock().unwrap() = true; + + let pending = s + .collect_changed_transcripts(false) + .expect("notify event must force a walk"); + // No mtimes changed → empty list (per-file fast-path), but the + // walk DID run (Some) and the event flag was consumed. + assert!( + pending.is_empty(), + "walk ran but no file mtimes moved → no pending work; got {pending:?}" + ); + assert!( + !*s.notify_event_pending.lock().unwrap(), + "notify event flag must be consumed by the walk" + ); + } + + #[test] + fn collect_pending_returns_only_files_whose_mtime_moved() { + let (tmp, paths) = make_transcript_tree(&[("proj-1", "sid-1"), ("proj-2", "sid-2")]); + let s = scanner_for(tmp.path().to_path_buf()); + + // Initial walk populates caches. + s.collect_changed_transcripts(true).expect("initial walk"); + + // Touch one file so its mtime moves. The audit's "pickup + // latency" test: drop / modify a transcript and verify the + // next walk returns it, while leaving the other transcript + // out of the change list. + poke_mtime(&paths[1]); + *s.notify_event_pending.lock().unwrap() = true; + + let pending = s + .collect_changed_transcripts(false) + .expect("walk triggered by notify"); + assert_eq!( + pending.len(), + 1, + "only the touched transcript should be pending; got {pending:?}" + ); + assert_eq!(pending[0], paths[1]); + } + + #[test] + fn collect_pending_force_runs_walk_even_without_signals() { + let (tmp, _) = make_transcript_tree(&[("proj-z", "sid-z")]); + let s = scanner_for(tmp.path().to_path_buf()); + + // Populate caches, then prove force=true bypasses the + // fast-path the next call would otherwise take. Matches + // the initial-scan invocation in `start()`. + s.collect_changed_transcripts(true).expect("initial walk"); + let pending = s + .collect_changed_transcripts(true) + .expect("force=true must always walk"); + assert!(pending.is_empty(), "no changes → empty list, but Some(_)"); + } + + /// Sanity check: a baseline-resolution mtime equality compare + /// is what gates the per-file skip. `SystemTime::eq` is exact; + /// document the contract so future readers don't accidentally + /// switch to a coarser comparison. + #[test] + fn systemtime_equality_is_exact() { + let now = SystemTime::now(); + assert_eq!(now, now); + let later = now + std::time::Duration::from_nanos(1); + assert_ne!(now, later); + } fn ev(id: i64, project_id: &str, files: &[&str]) -> DiffEvent { DiffEvent { diff --git a/crates/pcr-core/src/sources/shared/state.rs b/crates/pcr-core/src/sources/shared/state.rs index f25f698..570438e 100644 --- a/crates/pcr-core/src/sources/shared/state.rs +++ b/crates/pcr-core/src/sources/shared/state.rs @@ -20,8 +20,25 @@ struct FileStateInner { } impl FileState { + /// Build a per-source line-state tracker rooted at + /// `$HOME/.pcr-dev/-state.json`. Panics with a clear + /// message if `$HOME` can't be resolved — without a stable + /// state file the watcher would silently drop its cursor on + /// reboot (the audit's correctness concern) and re-emit every + /// prompt on the next start. Failing fast at watcher + /// construction is strictly better than the previous silent + /// `/tmp` fallback. Source-watcher entry points + /// (`vscode::watcher::run`, `claudecode::watcher::run`) are + /// the only callers; all of them are themselves invoked from + /// `pcr start` which has already validated the directory via + /// `pid_file_path()?`. pub fn new(name: &str) -> Self { - let file_path = config::pcr_dir().join(format!("{name}-state.json")); + let file_path = config::pcr_dir() + .expect( + "pcr: cannot determine $HOME — refusing to put the watcher state file \ + under /tmp (would reset capture cursors on every reboot)", + ) + .join(format!("{name}-state.json")); let state = Self { inner: Arc::new(Mutex::new(FileStateInner { data: HashMap::new(), diff --git a/crates/pcr-core/src/store/db.rs b/crates/pcr-core/src/store/db.rs index f60a4d3..ca14979 100644 --- a/crates/pcr-core/src/store/db.rs +++ b/crates/pcr-core/src/store/db.rs @@ -17,15 +17,21 @@ use crate::config; static CONN: OnceLock> = OnceLock::new(); -fn db_path() -> PathBuf { - config::pcr_dir().join("drafts.db") +fn db_path() -> anyhow::Result { + Ok(config::pcr_dir()?.join("drafts.db")) } /// Open (and lazily create) the singleton database. Returns a locked guard -/// around a `rusqlite::Connection`. +/// around a `rusqlite::Connection`. Panics with a clear message if `$HOME` +/// can't be resolved — the CLI can't function at all without its local +/// store, so failing fast here is strictly better than the previous +/// silent `/tmp` fallback (which evaporated drafts on reboot). pub fn open() -> MutexGuard<'static, Connection> { let mutex = CONN.get_or_init(|| { - let path = db_path(); + let path = db_path().expect( + "pcr: cannot determine $HOME — refusing to open the draft store under /tmp \ + (set HOME and re-run)", + ); if let Some(parent) = path.parent() { let _ = std::fs::create_dir_all(parent); } diff --git a/crates/pcr-core/src/store/drafts.rs b/crates/pcr-core/src/store/drafts.rs index 554cb1a..abadf9b 100644 --- a/crates/pcr-core/src/store/drafts.rs +++ b/crates/pcr-core/src/store/drafts.rs @@ -411,9 +411,44 @@ pub fn update_draft_response( return Ok(()); } let conn = open(); + // The (session_id, prompt_text) pair isn't unique — Claude Code + // and Cursor both let users re-send the same prompt text inside + // one session, and we keep both drafts (they get distinct + // content_hash values via captured_at). An unscoped UPDATE would + // overwrite the older row's response with the newer turn's, so + // pin to a single id first. + // + // The audit suggested SELECT-then-UPDATE over `LIMIT 1` in the + // UPDATE itself: rusqlite's bundled SQLite isn't built with + // SQLITE_ENABLE_UPDATE_DELETE_LIMIT (verified locally — `UPDATE + // ... LIMIT 1` errors with "near \"LIMIT\": syntax error"), and + // the two-step shape is more portable across SQLite builds. + let id: Option = conn + .query_row( + "SELECT id FROM drafts + WHERE session_id = ? AND prompt_text = ? AND status = 'draft' + ORDER BY captured_at DESC, id DESC + LIMIT 1", + params![session_id, prompt_text], + |r| r.get(0), + ) + .optional()?; + let Some(id) = id else { + return Ok(()); + }; + // Apply the don't-shrink guard at UPDATE time (not at SELECT + // time): if the newest matching row already has a longer or + // equal response, we skip rather than fall back to an older + // row that happens to qualify. Otherwise the callers in + // `claudecode/watcher.rs::process_file` (which always target + // "the newest draft for this (session, prompt)") would + // accidentally rewrite an older draft on the second + // enrichment pass. conn.execute( - "UPDATE drafts SET response_text = ? WHERE session_id = ? AND prompt_text = ? AND status = 'draft' AND (response_text IS NULL OR LENGTH(response_text) < LENGTH(?))", - params![response_text, session_id, prompt_text, response_text], + "UPDATE drafts SET response_text = ? + WHERE id = ? + AND (response_text IS NULL OR LENGTH(response_text) < LENGTH(?))", + params![response_text, id, response_text], )?; Ok(()) } diff --git a/crates/pcr-core/src/store/gc.rs b/crates/pcr-core/src/store/gc.rs index cfd5147..45c8e11 100644 --- a/crates/pcr-core/src/store/gc.rs +++ b/crates/pcr-core/src/store/gc.rs @@ -3,8 +3,10 @@ use anyhow::Result; use chrono::{Duration, Utc}; use rusqlite::params; +use std::collections::HashSet; +use std::io::Write; use std::path::Path; -use std::process::Command; +use std::process::{Command, Stdio}; use crate::store::db::open; @@ -75,20 +77,26 @@ pub fn gc_orphaned(project_path: &Path) -> Result { .collect(); rows }; - let mut orphans: Vec = Vec::new(); - for (id, sha) in rows { - let ok = Command::new("git") - .arg("cat-file") - .arg("-e") - .arg(&sha) - .current_dir(project_path) - .status() - .map(|s| s.success()) - .unwrap_or(false); - if !ok { - orphans.push(id); - } + if rows.is_empty() { + return Ok(0); } + // The previous shape ran one `git cat-file -e ` per commit + // — O(N) git processes per GC pass. Replaced with a single + // `git cat-file --batch-check` invocation that reads SHAs on + // stdin and emits one line per query. (The audit suggested + // `git for-each-ref refs/heads/`, but that's the wrong + // primitive for "does this exact SHA still exist": branches + // walk the reflog forward, not from the commit graph + // backward. `--batch-check` is git's purpose-built answer for + // this exact query.) Keeps the orphan-detection semantics + // byte-identical: a SHA is orphaned iff `git cat-file` can't + // resolve it (rebased away, branch deleted, repo re-cloned). + let missing_shas = shas_missing_from_repo(project_path, rows.iter().map(|(_, s)| s.as_str())); + let orphans: Vec = rows + .into_iter() + .filter(|(_, sha)| missing_shas.contains(sha)) + .map(|(id, _)| id) + .collect(); if orphans.is_empty() { return Ok(0); } @@ -121,6 +129,75 @@ pub fn gc_orphaned(project_path: &Path) -> Result { Ok(orphans.len() as i64) } +/// Ask git which of `shas` are NOT present as commit objects in +/// `repo`. Single subprocess regardless of input size — `git +/// cat-file --batch-check` reads SHAs on stdin (one per line) and +/// emits one line per query. We treat any non-resolvable line as +/// "missing" (matches the original per-SHA `cat-file -e` exit +/// status: success ⇒ present, failure ⇒ absent). +/// +/// If git fails to start at all (no git on PATH, repo gone), we +/// return an empty set — that means "nothing is orphaned by +/// git", i.e. we'd rather skip a GC cycle than nuke unpushed +/// commits because the user's git binary is mis-installed. +fn shas_missing_from_repo<'a>( + repo: &Path, + shas: impl IntoIterator, +) -> HashSet { + let shas: Vec = shas.into_iter().map(String::from).collect(); + if shas.is_empty() { + return HashSet::new(); + } + let mut child = match Command::new("git") + .arg("cat-file") + .arg("--batch-check=%(objectname) %(objecttype)") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()) + .current_dir(repo) + .spawn() + { + Ok(c) => c, + // git unavailable / repo broken — preserve unpushed work + // rather than wrongly classifying everything as orphaned. + Err(_) => return HashSet::new(), + }; + + if let Some(mut stdin) = child.stdin.take() { + // Best-effort: if writes start failing partway through, + // we still wait_with_output so we don't leak the child. + // The stdout we did receive will be parsed below; any + // unwritten SHAs are simply omitted from the missing set + // (treated as present), again erring on the side of NOT + // deleting unpushed history we're unsure about. + for sha in &shas { + if writeln!(stdin, "{sha}").is_err() { + break; + } + } + } + + let output = match child.wait_with_output() { + Ok(o) => o, + Err(_) => return HashSet::new(), + }; + let stdout = String::from_utf8_lossy(&output.stdout); + let mut missing: HashSet = HashSet::new(); + for line in stdout.lines() { + // `--batch-check` emits ` missing` for unknown + // objects and ` ` for resolvable ones. + // We only care about the "missing" sentinel. + let mut parts = line.split_whitespace(); + let Some(name) = parts.next() else { + continue; + }; + if parts.next() == Some("missing") { + missing.insert(name.to_string()); + } + } + missing +} + fn delete_commits(ids: &[String]) -> Result { if ids.is_empty() { return Ok(0); @@ -246,3 +323,116 @@ pub fn get_candidates_for_commit( } Ok((relevant, unrelated)) } + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper: initialise a tiny git repo with one commit, return + /// the temp dir (held for lifetime) and the commit's SHA so + /// tests can verify the existence-check semantics. + fn init_repo_with_one_commit() -> Option<(tempfile::TempDir, String)> { + // git might not be on PATH in some sandboxed CI envs. + // Probe once and bail out (skipping the test) if so — + // we don't want a missing git binary to flake the suite. + if Command::new("git") + .arg("--version") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .map(|s| !s.success()) + .unwrap_or(true) + { + return None; + } + + let tmp = tempfile::TempDir::new().ok()?; + let run = |args: &[&str]| { + Command::new("git") + .args(args) + .current_dir(tmp.path()) + .env("GIT_AUTHOR_NAME", "pcr-test") + .env("GIT_AUTHOR_EMAIL", "pcr-test@example.invalid") + .env("GIT_COMMITTER_NAME", "pcr-test") + .env("GIT_COMMITTER_EMAIL", "pcr-test@example.invalid") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .ok() + .filter(|s| s.success()) + }; + run(&["init", "--initial-branch=main"]).or_else(|| run(&["init"]))?; + std::fs::write(tmp.path().join("README"), b"hello\n").ok()?; + run(&["add", "."])?; + run(&["commit", "-m", "first"])?; + + let sha = Command::new("git") + .args(["rev-parse", "HEAD"]) + .current_dir(tmp.path()) + .output() + .ok()?; + if !sha.status.success() { + return None; + } + let sha = String::from_utf8_lossy(&sha.stdout).trim().to_string(); + if sha.is_empty() { + return None; + } + Some((tmp, sha)) + } + + #[test] + fn batch_check_marks_only_unknown_shas_as_missing() { + let Some((repo, head_sha)) = init_repo_with_one_commit() else { + eprintln!( + "skipping batch-check test: git unavailable or unable to init a fixture repo" + ); + return; + }; + // 40 zeros: well-formed SHA1, but cannot resolve to any + // commit object in this fresh repo. + let bogus_sha = "0".repeat(40); + + let missing = shas_missing_from_repo(repo.path(), [head_sha.as_str(), bogus_sha.as_str()]); + + assert!( + !missing.contains(&head_sha), + "live HEAD sha {head_sha} must NOT be reported missing" + ); + assert!( + missing.contains(&bogus_sha), + "unknown sha {bogus_sha} must be reported missing" + ); + assert_eq!( + missing.len(), + 1, + "exactly one of the two SHAs is unknown; got {missing:?}" + ); + } + + #[test] + fn batch_check_returns_empty_when_input_is_empty() { + let tmp = tempfile::TempDir::new().expect("tempdir"); + // Doesn't even need to be a git repo — empty input + // short-circuits before spawning git. + let missing = shas_missing_from_repo(tmp.path(), std::iter::empty::<&str>()); + assert!(missing.is_empty()); + } + + #[test] + fn batch_check_returns_empty_on_git_failure() { + let tmp = tempfile::TempDir::new().expect("tempdir"); + // Not a git repo at all. `git cat-file --batch-check` + // will fail to start (or exit non-zero immediately). + // We expect an empty `missing` set — the conservative + // choice that protects unpushed work from being GC'd + // because git happens to be broken / mis-configured on + // this machine. + let missing = shas_missing_from_repo(tmp.path(), ["deadbeef".repeat(5).as_str()]); + assert!( + missing.is_empty(), + "git failure must NOT classify SHAs as missing (would wrongly GC unpushed work); \ + got {missing:?}" + ); + } +} diff --git a/crates/pcr-core/tests/claudecode_state_ordering.rs b/crates/pcr-core/tests/claudecode_state_ordering.rs new file mode 100644 index 0000000..0728722 --- /dev/null +++ b/crates/pcr-core/tests/claudecode_state_ordering.rs @@ -0,0 +1,127 @@ +//! State-cursor ordering for `claudecode::watcher::process_file`. +//! +//! Regression test for the audit's task 6: `state.set(file_path, lines)` +//! used to run BEFORE `parse_claude_code_session`. If the parser +//! happened to fail (or even just extract zero prompts) the watcher +//! still advanced its line-count cursor — so on the next scan, none of +//! the previously-unprocessed lines would ever be re-examined. +//! +//! The fix is to delay `state.set` until parse produced at least one +//! prompt. This test exercises that contract with a real on-disk +//! JSONL file, the real `process_file` entry point, and a real +//! `$HOME/.pcr-dev` store — no mocks. +//! +//! Single test per file: the in-process SQLite singleton in +//! `crates/pcr-core/src/store/db.rs` survives across tests inside the +//! same integration binary, which would otherwise contaminate state +//! between cases. + +use pcr_core::projects; +use pcr_core::sources::claudecode::watcher::process_file; +use pcr_core::sources::shared::{Deduplicator, FileState}; +use tempfile::TempDir; + +#[test] +fn parse_failure_leaves_state_cursor_unchanged() { + let home = TempDir::new().expect("home tempdir"); + // SAFETY: this integration test binary contains exactly one #[test]; + // cargo runs each integration test binary in its own process; no + // other thread can observe the env mutation. + unsafe { + std::env::set_var("HOME", home.path()); + std::env::set_var("USERPROFILE", home.path()); + } + std::fs::create_dir_all(home.path().join(".pcr-dev")).expect("mkdir pcr-dev"); + + // Register a project whose claude_slug matches the synthetic + // `~/.claude/projects//` directory we're about to create. + // `process_file` early-returns if the slug isn't registered, so + // without this step the function would skip everything before + // even touching the state cursor. + let project_path = home.path().join("workspace-state-ordering"); + std::fs::create_dir_all(&project_path).expect("mkdir project"); + let registered = projects::register(&project_path.to_string_lossy()); + assert!( + !registered.claude_slug.is_empty(), + "register must populate claude_slug" + ); + + // Build the synthetic transcript path the watcher would have seen. + let claude_dir = home + .path() + .join(".claude") + .join("projects") + .join(®istered.claude_slug); + std::fs::create_dir_all(&claude_dir).expect("mkdir claude transcripts dir"); + let file_path = claude_dir.join("session-corrupt.jsonl"); + + // Corrupt JSONL: real bytes (so the line count is non-zero), but + // every line fails `serde_json::from_str` → parser returns an + // empty session. Three lines so `count_non_empty_lines` returns 3. + std::fs::write( + &file_path, + b"not json line 1\nnot json line 2\nnot json line 3\n", + ) + .expect("write corrupt transcript"); + + let state = FileState::new("claude-code-state-ordering-test"); + let dedup = Deduplicator::new(); + + // Sanity-check the precondition: cursor is at zero before the + // first call. + let key = file_path.to_string_lossy().into_owned(); + assert_eq!(state.get(&key), 0, "fresh state starts at 0"); + + process_file(&key, "", &state, &dedup, false); + + assert_eq!( + state.get(&key), + 0, + "parse extracted no prompts → state cursor must NOT advance, \ + otherwise a transient parse failure silently drops lines" + ); + + // Replace the corrupt file with a parseable transcript — one user + // message that the parser will surface as a prompt. After this + // call, `process_file` must advance the cursor because there's + // now something legitimate to attribute. + let session_id = "claude-session-state-ordering-001"; + let valid_line = serde_json::json!({ + "type": "user", + "sessionId": session_id, + "timestamp": "2026-05-18T00:00:00.000Z", + "gitBranch": "main", + "message": { + "role": "user", + "content": "do the thing", + }, + }); + let valid_assistant = serde_json::json!({ + "type": "assistant", + "sessionId": session_id, + "timestamp": "2026-05-18T00:00:01.000Z", + "message": { + "role": "assistant", + "model": "claude-sonnet-4-5", + "content": [{"type": "text", "text": "ok"}], + }, + }); + let body = format!("{}\n{}\n", valid_line, valid_assistant); + std::fs::write(&file_path, body.as_bytes()).expect("rewrite transcript"); + + process_file(&key, "", &state, &dedup, false); + assert!( + state.get(&key) > 0, + "after a parse that yielded prompts, state cursor must advance" + ); + + // Sanity: a forced re-scan over the same path is still idempotent + // — the cursor stays at the line count, doesn't somehow regress. + let advanced = state.get(&key); + process_file(&key, "", &state, &dedup, false); + assert_eq!( + state.get(&key), + advanced, + "re-running on identical content must not move the cursor backwards" + ); +} diff --git a/crates/pcr-core/tests/drafts_update_response_scoped.rs b/crates/pcr-core/tests/drafts_update_response_scoped.rs new file mode 100644 index 0000000..c423564 --- /dev/null +++ b/crates/pcr-core/tests/drafts_update_response_scoped.rs @@ -0,0 +1,108 @@ +//! Regression test for the audit's task 2: `update_draft_response` +//! used to issue an unscoped `UPDATE … WHERE session_id = ? AND +//! prompt_text = ?` which would silently overwrite EVERY draft with +//! the same (session, prompt) tuple. In practice Claude Code and +//! Cursor both let users re-send identical prompt text inside one +//! session (e.g. "go", "continue", "yes please"); the two distinct +//! drafts share `session_id` + `prompt_text` but live as separate +//! rows (distinct `content_hash` via the v2 hash, which folds in +//! `captured_at`). +//! +//! The fix selects the most recent matching id first, then updates +//! that single row. This test proves the older row is left intact. + +use pcr_core::store::{self, DraftRecord, DraftStatus}; +use pcr_core::supabase::{prompt_content_hash_v2, prompt_id_v2, PromptRecord}; +use tempfile::TempDir; + +fn save(session_id: &str, prompt_text: &str, captured_at: &str) -> String { + let hash = prompt_content_hash_v2(session_id, prompt_text, captured_at); + let rec = PromptRecord { + id: prompt_id_v2(session_id, prompt_text, captured_at), + content_hash: hash.clone(), + session_id: session_id.to_string(), + project_id: "p-scoped-update".into(), + project_name: "p-scoped-update".into(), + prompt_text: prompt_text.to_string(), + source: "claude-code".into(), + capture_method: "test".into(), + captured_at: captured_at.to_string(), + ..Default::default() + }; + store::save_draft(&rec, &[], "", "").expect("save_draft"); + hash +} + +fn find(captured_at: &str, drafts: &[DraftRecord]) -> DraftRecord { + drafts + .iter() + .find(|d| d.captured_at == captured_at) + .cloned() + .unwrap_or_else(|| panic!("no draft at {captured_at} in {drafts:?}")) +} + +#[test] +fn update_draft_response_only_touches_the_most_recent_match() { + let home = TempDir::new().expect("home tempdir"); + // SAFETY: this integration test binary contains exactly one + // #[test]; cargo runs each binary in its own process. + unsafe { + std::env::set_var("HOME", home.path()); + std::env::set_var("USERPROFILE", home.path()); + } + std::fs::create_dir_all(home.path().join(".pcr-dev")).expect("mkdir pcr-dev"); + + let sid = "claude-session-dup-prompt-001"; + let prompt = "continue"; + + // Two drafts with identical (session_id, prompt_text). The v2 + // hash folds `captured_at` in so the rows survive the + // `content_hash UNIQUE` constraint independently. + save(sid, prompt, "2026-05-18T00:00:00.000Z"); + save(sid, prompt, "2026-05-18T00:05:00.000Z"); + + let before = store::get_drafts_by_status(DraftStatus::Draft, &[], &[]).expect("query"); + assert_eq!(before.len(), 2, "both drafts should be stored"); + let older_id = find("2026-05-18T00:00:00.000Z", &before).id.clone(); + let newer_id = find("2026-05-18T00:05:00.000Z", &before).id.clone(); + assert_ne!(older_id, newer_id, "rows must be distinct"); + + store::update_draft_response(sid, prompt, "newer reply").expect("update"); + + let after = store::get_drafts_by_status(DraftStatus::Draft, &[], &[]).expect("query"); + assert_eq!(after.len(), 2, "row count unchanged"); + let older = find("2026-05-18T00:00:00.000Z", &after); + let newer = find("2026-05-18T00:05:00.000Z", &after); + + assert_eq!( + newer.response_text, "newer reply", + "most recent draft must receive the response" + ); + assert_eq!( + older.response_text, "", + "older draft must be UNTOUCHED — the previous unscoped UPDATE \ + overwrote it, which is the regression we're guarding against" + ); + + // Second call must be idempotent: the existing response is no + // shorter than what we'd write, so the LEN-filter selects no row + // and we exit without an UPDATE. Older row stays empty either way. + store::update_draft_response(sid, prompt, "newer reply").expect("idempotent update"); + let after2 = store::get_drafts_by_status(DraftStatus::Draft, &[], &[]).expect("query"); + let older2 = find("2026-05-18T00:00:00.000Z", &after2); + let newer2 = find("2026-05-18T00:05:00.000Z", &after2); + assert_eq!(newer2.response_text, "newer reply"); + assert_eq!(older2.response_text, ""); + + // A *longer* response targeted at the same (session, prompt) + // still scopes to the newest row, not the older one. + store::update_draft_response(sid, prompt, "newer reply — extended").expect("update v2"); + let after3 = store::get_drafts_by_status(DraftStatus::Draft, &[], &[]).expect("query"); + let older3 = find("2026-05-18T00:00:00.000Z", &after3); + let newer3 = find("2026-05-18T00:05:00.000Z", &after3); + assert_eq!(newer3.response_text, "newer reply — extended"); + assert_eq!( + older3.response_text, "", + "older row must remain untouched even when the response grows" + ); +}