diff --git a/README.md b/README.md index 827ea70..81071b9 100644 --- a/README.md +++ b/README.md @@ -243,28 +243,29 @@ result = ( ### Dynamic Policy (policy_fn) Inspect syscall events at runtime and adjust permissions on the fly. -Each event includes rich metadata: path, host, port, argv, category, -parent PID. The callback returns a verdict to allow, deny, or audit. +Events carry syscall name, category, PID, network destination (for +`connect`/`sendto`/`bind`), and `argv` (for `execve`). The callback +returns a verdict to allow, deny, or audit. ```python from sandlock import Sandbox, Policy import errno def on_event(event, ctx): - # Block download tools + # Block download tools by argv if event.syscall == "execve" and event.argv_contains("curl"): return True # deny - # Custom errno for sensitive files - if event.category == "file" and event.path_contains("/secret"): + # Deny connections to a specific IP + if event.syscall == "connect" and event.host == "10.0.0.5": return errno.EACCES - # Restrict network after setup phase - if event.syscall == "execve" and event.path_contains("untrusted"): - ctx.restrict_network([]) - ctx.deny_path("/etc/shadow") + # Lock down once the program has finished starting up + if event.syscall == "execve": + ctx.restrict_network([]) # block all network + ctx.deny_path("/etc/shadow") # dynamic fs deny - # Audit file access (allow but flag) + # Audit every file access (allow but flag) if event.category == "file": return "audit" @@ -281,7 +282,24 @@ result = Sandbox(policy, policy_fn=on_event).run(["python3", "agent.py"]) positive int = deny with errno, `"audit"`/`-2` = allow + flag. **Event fields:** `syscall`, `category` (file/network/process/memory), -`pid`, `parent_pid`, `path`, `host`, `port`, `argv`, `denied`. +`pid`, `parent_pid`, `host`, `port`, `argv`, `denied`. + +> **TOCTOU NOTE ** Per `seccomp_unotify(2)`, the kernel +> re-reads user-memory pointers after `Continue`. Sandlock handles this +> in two places: +> +> - **Path strings are not exposed on events.** Path-based access control +> belongs in static Landlock rules (`fs_readable` / `fs_writable` / +> `fs_denied`) — kernel-enforced and TOCTOU-immune. Use +> `ctx.deny_path()` for runtime additions. +> - **`event.argv` is exposed and TOCTOU-safe.** Before returning +> `Continue` for an `execve`, the supervisor `PTRACE_SEIZE` + +> `PTRACE_INTERRUPT`s every sibling thread of the calling tid so the +> kernel's re-read happens with no other writer running. The pause +> has no observable cost: `execve`'s `de_thread` step kills sibling +> threads anyway. If the freeze cannot be established (e.g., YAMA +> blocks ptrace), the execve is denied with `EPERM` — the safety +> invariant is never silently relaxed. **Context methods:** - `ctx.restrict_network(ips)` / `ctx.grant_network(ips)` — network control diff --git a/crates/sandlock-core/src/lib.rs b/crates/sandlock-core/src/lib.rs index dd37de9..9a25967 100644 --- a/crates/sandlock-core/src/lib.rs +++ b/crates/sandlock-core/src/lib.rs @@ -15,6 +15,7 @@ pub(crate) mod random; pub(crate) mod time; pub(crate) mod cow; pub(crate) mod checkpoint; +pub(crate) mod sibling_freeze; pub mod netlink; pub(crate) mod procfs; pub(crate) mod port_remap; diff --git a/crates/sandlock-core/src/policy_fn.rs b/crates/sandlock-core/src/policy_fn.rs index e36ee4d..4759b76 100644 --- a/crates/sandlock-core/src/policy_fn.rs +++ b/crates/sandlock-core/src/policy_fn.rs @@ -8,12 +8,24 @@ //! .fs_read("/usr").fs_read("/lib") //! .net_allow_host("127.0.0.1") //! .policy_fn(|event, ctx| { -//! if event.syscall == "execve" && event.path_contains("untrusted") { -//! ctx.restrict_network(&[]); // block all network +//! if event.syscall == "connect" && event.host == Some("10.0.0.5".parse().unwrap()) { +//! return Verdict::Deny; //! } +//! Verdict::Allow //! }) //! .build()?; //! ``` +//! +//! # TOCTOU and string-typed fields +//! +//! Path and argv strings the kernel will re-read after a `Continue` +//! response (per `seccomp_unotify(2)`) are not exposed on this event. +//! Path-based access control belongs in static Landlock rules +//! (`fs_read`/`fs_write`/`fs_deny`), which the kernel enforces directly +//! and which are not subject to user-memory races. Network fields +//! (`host`, `port`) are TOCTOU-safe because the supervisor performs +//! `connect`/`sendto`/`bind` on-behalf via `pidfd_getfd` and the kernel +//! never re-reads child memory for those syscalls. use std::collections::{HashMap, HashSet}; use std::net::IpAddr; @@ -41,6 +53,26 @@ pub enum SyscallCategory { // ============================================================ /// An intercepted syscall event observed by the seccomp supervisor. +/// +/// # TOCTOU and string-typed fields +/// +/// Path strings are deliberately absent. Per `seccomp_unotify(2)`, the +/// kernel re-reads user-memory pointers after a `Continue` response, so +/// any path-string-based decision is racy in a multi-threaded child. +/// Path-based access control belongs in static Landlock rules +/// (`fs_read` / `fs_write` / `fs_deny`); see issue #27. +/// +/// `argv` *is* exposed for `execve`/`execveat` and is TOCTOU-safe by +/// construction: before the supervisor returns `Continue` for an +/// execve, it `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every sibling thread +/// of the calling tid so the kernel's post-Continue re-read sees the +/// same memory the supervisor inspected. Siblings are killed by the +/// kernel during execve's `de_thread` step anyway, so the pause has +/// no observable cost. See `crate::sibling_freeze`. +/// +/// Network fields (`host`, `port`) are TOCTOU-safe because the +/// supervisor performs `connect`/`sendto`/`bind` on-behalf via +/// `pidfd_getfd` and the kernel never re-reads child memory for those. #[derive(Debug, Clone)] pub struct SyscallEvent { /// Syscall name (e.g., "connect", "openat", "execve", "clone"). @@ -51,27 +83,22 @@ pub struct SyscallEvent { pub pid: u32, /// Parent PID (read from /proc/{pid}/stat). pub parent_pid: Option, - /// Resolved filesystem path (for openat, execve, etc.). - pub path: Option, - /// Destination IP address (for connect, sendto). + /// Destination IP address (for connect, sendto). TOCTOU-safe. pub host: Option, - /// Destination port (for connect, sendto, bind). + /// Destination port (for connect, sendto, bind). TOCTOU-safe. pub port: Option, /// Size argument (for mmap, brk). pub size: Option, - /// Command arguments (for execve/execveat). + /// Command arguments for execve/execveat. TOCTOU-safe: sibling + /// threads are frozen before the kernel re-reads. pub argv: Option>, /// Whether the supervisor denied this syscall. pub denied: bool, } impl SyscallEvent { - /// Check if the path contains a substring. - pub fn path_contains(&self, s: &str) -> bool { - self.path.as_ref().map_or(false, |p| p.contains(s)) - } - - /// Check if any argv element contains a substring. + /// Returns true if any argv element contains the given substring. + /// Only meaningful for execve/execveat events (where argv is populated). pub fn argv_contains(&self, s: &str) -> bool { self.argv.as_ref().map_or(false, |args| args.iter().any(|a| a.contains(s))) } @@ -434,13 +461,12 @@ mod tests { } #[test] - fn test_event_path_contains() { + fn test_event_argv_contains() { let event = SyscallEvent { syscall: "execve".to_string(), category: SyscallCategory::Process, pid: 1, parent_pid: Some(0), - path: Some("/usr/bin/python3".to_string()), host: None, port: None, size: None, @@ -451,7 +477,21 @@ mod tests { assert!(event.argv_contains("-c")); assert!(!event.argv_contains("ruby")); assert_eq!(event.category, SyscallCategory::Process); - assert!(event.path_contains("python")); - assert!(!event.path_contains("ruby")); + } + + #[test] + fn test_event_argv_contains_none() { + let event = SyscallEvent { + syscall: "openat".to_string(), + category: SyscallCategory::File, + pid: 1, + parent_pid: None, + host: None, + port: None, + size: None, + argv: None, + denied: false, + }; + assert!(!event.argv_contains("anything")); } } diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index 71c4e34..da170f2 100644 --- a/crates/sandlock-core/src/seccomp/notif.rs +++ b/crates/sandlock-core/src/seccomp/notif.rs @@ -729,15 +729,15 @@ fn read_sockaddr_for_event(notif: &SeccompNotif, addr: u64, len: usize, notif_fd (ip, if port > 0 { Some(port) } else { None }) } -/// Read argv (array of string pointers) from child memory for execve. -/// execve(path, argv, envp): argv is a NULL-terminated array of char* pointers. +/// Read argv (NULL-terminated array of char* in child memory) for execve. +/// Capped at 64 entries × 256 bytes/entry as a safety bound. fn read_argv_for_event(notif: &SeccompNotif, argv_ptr: u64, notif_fd: RawFd) -> Option> { if argv_ptr == 0 { return None; } let mut args = Vec::new(); let ptr_size = std::mem::size_of::(); - for i in 0..64 { // safety limit - let ptr_addr = argv_ptr + (i * ptr_size) as u64; + for i in 0..64u64 { + let ptr_addr = argv_ptr + i * ptr_size as u64; let ptr_bytes = read_child_mem(notif_fd, notif.id, notif.pid, ptr_addr, ptr_size).ok()?; let str_ptr = u64::from_ne_bytes(ptr_bytes[..8].try_into().ok()?); if str_ptr == 0 { break; } // NULL terminator @@ -773,27 +773,34 @@ async fn emit_policy_event( let category = syscall_category(nr); let parent_pid = read_ppid(notif.pid); - // Extract metadata based on syscall type - let mut path = None; + // Extract metadata based on syscall type. + // + // Path strings are deliberately NOT extracted: the kernel re-reads + // user-memory pointers after Continue, so any path-string-based + // decision is racy (issue #27). Path-based access control belongs + // in static Landlock rules. + // + // argv IS extracted for execve/execveat: the supervisor freezes + // sibling threads before returning Continue (sibling_freeze module), + // so the post-Continue re-read sees the same memory we read here. + // + // Network fields are TOCTOU-safe because connect/sendto/bind are + // performed on-behalf via pidfd_getfd; the kernel never re-reads + // child memory for those syscalls. let mut host = None; let mut port = None; let mut size = None; let mut argv = None; - if nr == libc::SYS_openat || Some(nr) == arch::SYS_OPEN || nr == libc::SYS_execve || nr == libc::SYS_execveat { - // openat(dirfd, pathname, ...): args[1] = path ptr - // execve(pathname, argv, envp): args[0] = path ptr, args[1] = argv ptr - let path_ptr = if nr == libc::SYS_openat { - notif.data.args[1] + if nr == libc::SYS_execve || nr == libc::SYS_execveat { + // execve(pathname, argv, envp): args[1] = argv ptr + // execveat(dirfd, pathname, argv, ..): args[2] = argv ptr + let argv_ptr = if nr == libc::SYS_execveat { + notif.data.args[2] } else { - notif.data.args[0] + notif.data.args[1] }; - path = read_path_for_event(notif, path_ptr, notif_fd); - - // Extract argv for execve/execveat - if nr == libc::SYS_execve || nr == libc::SYS_execveat { - argv = read_argv_for_event(notif, notif.data.args[1], notif_fd); - } + argv = read_argv_for_event(notif, argv_ptr, notif_fd); } if nr == libc::SYS_connect || nr == libc::SYS_sendto || nr == libc::SYS_bind { @@ -815,7 +822,6 @@ async fn emit_policy_event( category, pid: notif.pid, parent_pid, - path, host, port, size, @@ -910,6 +916,33 @@ async fn handle_notification( } } + // TOCTOU-close for execve (issue #27): freeze sibling threads of + // the calling tid before the kernel re-reads pathname/argv from + // child memory. Cheap because the kernel's de_thread step in + // execve kills the siblings anyway — we're just stopping them + // moments earlier, closing the race window for the supervisor's + // argv inspection in policy_fn. + // + // Only relevant when we're sending Continue: a denial response + // (Errno) means the kernel never re-reads, so no freeze needed. + // + // Strict on failure: if we cannot freeze the siblings, we cannot + // uphold the argv-safety invariant, so we deny the execve with + // EPERM rather than letting it through unprotected. + let nr = notif.data.nr as i64; + if matches!(action, NotifAction::Continue) + && crate::sibling_freeze::requires_freeze_on_continue(nr) + { + if let Err(e) = crate::sibling_freeze::freeze_siblings_for_execve(notif.pid as i32) { + eprintln!( + "sandlock: argv-safety freeze failed for pid {}: {} \ + — denying execve to preserve TOCTOU invariant", + notif.pid, e + ); + action = NotifAction::Errno(libc::EPERM); + } + } + // Ignore error — child may have exited between recv and response. let _ = send_response(fd, notif.id, action); } diff --git a/crates/sandlock-core/src/sibling_freeze.rs b/crates/sandlock-core/src/sibling_freeze.rs new file mode 100644 index 0000000..42365c0 --- /dev/null +++ b/crates/sandlock-core/src/sibling_freeze.rs @@ -0,0 +1,182 @@ +//! Freeze sibling threads of an execve caller before returning Continue. +//! +//! # Why +//! +//! Per `seccomp_unotify(2)`, after the supervisor responds with +//! `Continue`, the kernel re-reads the syscall's user-memory pointers +//! before executing the syscall. For execve, that means the kernel +//! re-reads `pathname` and the argv array from child memory. A racing +//! sibling thread of the calling tid can mutate that memory in the +//! window between the supervisor's response and the kernel's re-read, +//! defeating any decision policy_fn made on the values it inspected. +//! +//! This module closes the window for execve specifically. Before the +//! supervisor returns Continue, every sibling tid is `PTRACE_SEIZE`d +//! and `PTRACE_INTERRUPT`ed (which puts it in group-stop). The kernel +//! re-reads while no sibling is running. Then the supervisor releases +//! its hold on the seccomp notification. +//! +//! # Why this is essentially free for execve +//! +//! `execve(2)` already terminates all sibling threads as part of +//! `de_thread`. Freezing them moments earlier doesn't change observable +//! behavior — the kernel kills them anyway. We don't need to detach +//! explicitly; the siblings die with the rest of the thread group +//! during execve, and ptrace records associated with them are reaped +//! by the kernel. +//! +//! # Failure modes (strict) +//! +//! The freeze is an invariant: if the supervisor exposed argv to +//! `policy_fn` and the callback returned Allow, the kernel must re-read +//! the same memory the supervisor inspected. We refuse to silently +//! degrade — if the freeze cannot be established, the supervisor +//! denies the execve with `EPERM` rather than letting it proceed +//! without TOCTOU protection. +//! +//! - `PTRACE_SEIZE` returns `ESRCH` for a sibling that exited between +//! enumeration and seize. Treated as success: there is no thread to +//! race. +//! - Any other ptrace failure (YAMA `ptrace_scope` >= 2 outside the +//! parent chain, another tracer attached, kernel resource limits) +//! produces an error; siblings already frozen during the partial +//! attempt are detached so they resume normally; the caller fails +//! the syscall closed. + +use std::fs; +use std::io; + +/// Enumerate sibling tids of `caller_tid` from `/proc//task/`. +/// `caller_tid` is excluded from the result. +fn list_siblings(caller_tid: i32) -> io::Result> { + let dir = fs::read_dir(format!("/proc/{}/task", caller_tid))?; + let mut tids = Vec::new(); + for entry in dir { + let entry = match entry { + Ok(e) => e, + Err(_) => continue, + }; + let name = entry.file_name(); + let name_str = match name.to_str() { + Some(s) => s, + None => continue, + }; + let tid: i32 = match name_str.parse() { + Ok(t) => t, + Err(_) => continue, + }; + if tid != caller_tid { + tids.push(tid); + } + } + Ok(tids) +} + +/// `PTRACE_SEIZE` + `PTRACE_INTERRUPT` a single tid and wait for the +/// resulting group-stop. Returns `Ok(true)` if the tid is now stopped, +/// `Ok(false)` if the tid had already exited (ESRCH; nothing to do), +/// or an error if ptrace refused. +/// +/// On a partial-progress failure (PTRACE_SEIZE succeeded but +/// PTRACE_INTERRUPT did not), the function detaches itself before +/// returning so the caller doesn't have to track partial state. +fn seize_and_interrupt(tid: i32) -> io::Result { + let ret = unsafe { + libc::ptrace(libc::PTRACE_SEIZE as libc::c_uint, tid, 0, 0) + }; + if ret < 0 { + let err = io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::ESRCH) { + return Ok(false); // already exited — nothing to freeze + } + return Err(err); + } + // PTRACE_SEIZE succeeded; from here, any error path must DETACH + // before returning so we don't leave the sibling traced-but-running. + + let ret = unsafe { + libc::ptrace(libc::PTRACE_INTERRUPT as libc::c_uint, tid, 0, 0) + }; + if ret < 0 { + let err = io::Error::last_os_error(); + let _ = unsafe { libc::ptrace(libc::PTRACE_DETACH, tid, 0, 0) }; + if err.raw_os_error() == Some(libc::ESRCH) { + return Ok(false); + } + return Err(err); + } + + // Wait for the ptrace-stop. WNOHANG would race; we want to block + // until the sibling is genuinely stopped. __WALL is needed because + // siblings are threads (not children of the supervisor in the + // traditional fork sense) and waitpid(2) by default ignores them. + let mut status: i32 = 0; + let _ = unsafe { libc::waitpid(tid, &mut status, libc::__WALL) }; + Ok(true) +} + +/// Detach a previously-frozen sibling. Used to roll back partial +/// progress when a later sibling refuses to be frozen. +fn detach(tid: i32) { + let _ = unsafe { libc::ptrace(libc::PTRACE_DETACH, tid, 0, 0) }; +} + +/// Freeze all sibling threads of `caller_tid`. +/// +/// Strict semantics: if any sibling refuses to be frozen, all +/// successfully-frozen siblings are detached (so they resume normally) +/// and the error is propagated. The caller is expected to deny the +/// execve with EPERM, preserving the invariant that exposed argv is +/// always TOCTOU-safe. +/// +/// On success, returns the number of siblings frozen. The supervisor +/// does not actively detach on the success path — siblings die during +/// execve's `de_thread`, and the kernel reaps the ptrace state. +pub(crate) fn freeze_siblings_for_execve(caller_tid: i32) -> io::Result { + let siblings = list_siblings(caller_tid)?; + let mut frozen: Vec = Vec::with_capacity(siblings.len()); + for tid in siblings { + match seize_and_interrupt(tid) { + Ok(true) => frozen.push(tid), + Ok(false) => continue, // already exited — fine + Err(e) => { + // Roll back: detach everything we already froze so they + // resume normally, then fail. + for ftid in &frozen { + detach(*ftid); + } + return Err(e); + } + } + } + Ok(frozen.len()) +} + +/// Helper called from the dispatch hot path. Returns true if the +/// notification is for an execve-class syscall whose Continue response +/// requires freezing siblings. +pub(crate) fn requires_freeze_on_continue(syscall_nr: i64) -> bool { + syscall_nr == libc::SYS_execve || syscall_nr == libc::SYS_execveat +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn list_siblings_excludes_self() { + // Our own /proc/self/task always exists; just check we don't + // see our own tid in the list. + let our_tid = unsafe { libc::syscall(libc::SYS_gettid) } as i32; + let siblings = list_siblings(our_tid).unwrap(); + assert!(!siblings.contains(&our_tid)); + } + + #[test] + fn requires_freeze_only_for_exec() { + assert!(requires_freeze_on_continue(libc::SYS_execve)); + assert!(requires_freeze_on_continue(libc::SYS_execveat)); + assert!(!requires_freeze_on_continue(libc::SYS_openat)); + assert!(!requires_freeze_on_continue(libc::SYS_connect)); + } +} diff --git a/crates/sandlock-core/tests/integration/test_policy_fn.rs b/crates/sandlock-core/tests/integration/test_policy_fn.rs index 2b7960d..b8a168b 100644 --- a/crates/sandlock-core/tests/integration/test_policy_fn.rs +++ b/crates/sandlock-core/tests/integration/test_policy_fn.rs @@ -17,15 +17,12 @@ fn base_policy() -> sandlock_core::PolicyBuilder { /// Test that the policy callback receives events with metadata. #[tokio::test] async fn test_policy_fn_receives_events_with_metadata() { - let events: Arc)>>> = Arc::new(Mutex::new(Vec::new())); + let events: Arc>> = Arc::new(Mutex::new(Vec::new())); let events_clone = events.clone(); let policy = base_policy() .policy_fn(move |event, _ctx| { - events_clone.lock().unwrap().push(( - event.syscall.clone(), - event.path.clone(), - )); + events_clone.lock().unwrap().push(event.syscall.clone()); Verdict::Allow }) .build() @@ -39,9 +36,11 @@ async fn test_policy_fn_receives_events_with_metadata() { let captured = events.lock().unwrap(); assert!(!captured.is_empty(), "should receive events"); - // Should have at least one openat with a path - let has_path = captured.iter().any(|(name, path)| name == "openat" && path.is_some()); - assert!(has_path, "openat events should include path, got: {:?}", &captured[..captured.len().min(5)]); + // Should have at least one openat (file syscall) and one execve. + assert!(captured.iter().any(|n| n == "openat"), + "should include openat, got: {:?}", &captured[..captured.len().min(5)]); + assert!(captured.iter().any(|n| n == "execve"), + "should include execve, got: {:?}", &captured[..captured.len().min(5)]); } /// Test that Verdict::Deny blocks a connect syscall. @@ -181,7 +180,7 @@ async fn test_policy_fn_passthrough() { assert!(count > 0, "callback should have been called at least once, got {}", count); } -/// Test execve events include argv. +/// Test execve events include argv (TOCTOU-safe via sibling freeze). #[tokio::test] async fn test_policy_fn_execve_argv() { let argvs: Arc>>> = Arc::new(Mutex::new(Vec::new())); @@ -206,19 +205,17 @@ async fn test_policy_fn_execve_argv() { let captured = argvs.lock().unwrap(); assert!(!captured.is_empty(), "should have captured execve argv"); - // At least one argv should contain "python3" let has_python = captured.iter().any(|args| args.iter().any(|a| a.contains("python3"))); assert!(has_python, "argv should contain python3, got: {:?}", *captured); } -/// Test argv_contains helper. +/// Test argv_contains-based denial. The supervisor freezes sibling +/// threads of the calling tid before Continue, so the policy_fn's +/// argv inspection binds to what the kernel will run. #[tokio::test] async fn test_policy_fn_deny_by_argv() { - let _out = temp_file("deny-argv"); - let policy = base_policy() .policy_fn(move |event, _ctx| { - // Deny execve if argv contains "malicious" if event.syscall == "execve" && event.argv_contains("malicious") { return Verdict::Deny; } @@ -227,11 +224,9 @@ async fn test_policy_fn_deny_by_argv() { .build() .unwrap(); - // Run something with "malicious" in argv — should be denied let result = Sandbox::run_interactive( &policy, &["echo", "malicious"], ).await.unwrap(); - // echo's execve has "malicious" in argv[1], should be denied assert!(!result.success(), "execve with 'malicious' in argv should be denied"); } @@ -376,41 +371,6 @@ async fn test_policy_fn_deny_with_eacces() { let _ = std::fs::remove_file(&out); } -/// Test Verdict::DenyWith(ENOENT) on openat. -#[tokio::test] -async fn test_policy_fn_deny_with_enoent() { - let out = temp_file("deny-enoent"); - - let policy = base_policy() - .policy_fn(move |event, _ctx| { - // Deny opening /etc/hostname with ENOENT (pretend it doesn't exist) - if event.syscall == "openat" && event.path_contains("/etc/hostname") { - return Verdict::DenyWith(libc::ENOENT); - } - Verdict::Allow - }) - .build() - .unwrap(); - - let script = format!(concat!( - "try:\n", - " open('/etc/hostname').read()\n", - " open('{out}', 'w').write('READ')\n", - "except FileNotFoundError:\n", - " open('{out}', 'w').write('ENOENT')\n", - "except PermissionError:\n", - " open('{out}', 'w').write('EPERM')\n", - ), out = out.display()); - - let result = Sandbox::run_interactive(&policy, &["python3", "-c", &script]).await.unwrap(); - assert!(result.success()); - - let content = std::fs::read_to_string(&out).unwrap_or_default(); - assert_eq!(content, "ENOENT", "should get FileNotFoundError, got: {}", content); - - let _ = std::fs::remove_file(&out); -} - // ============================================================ // Audit tests // ============================================================ @@ -424,7 +384,7 @@ async fn test_policy_fn_audit() { let policy = base_policy() .policy_fn(move |event, _ctx| { if event.category == SyscallCategory::File { - audited_clone.lock().unwrap().push(event.path.clone().unwrap_or_default()); + audited_clone.lock().unwrap().push(event.syscall.clone()); return Verdict::Audit; } Verdict::Allow @@ -450,13 +410,11 @@ async fn test_policy_fn_restrict_pid_network_without_allowlist() { // but policy_fn can still restrict specific PIDs. let policy = base_policy() .policy_fn(move |event, ctx| { - // On execve of the connect script, restrict that PID to no IPs + // On any execve, restrict that PID's network to nothing. + // (Previously gated on path-substring; path strings were + // dropped from events for TOCTOU reasons — issue #27.) if event.syscall == "execve" { - if let Some(ref path) = event.path { - if path.contains("connect_test") { - ctx.restrict_pid_network(event.pid, &[]); - } - } + ctx.restrict_pid_network(event.pid, &[]); } Verdict::Allow }) diff --git a/crates/sandlock-ffi/src/lib.rs b/crates/sandlock-ffi/src/lib.rs index 667aed8..a93b856 100644 --- a/crates/sandlock-ffi/src/lib.rs +++ b/crates/sandlock-ffi/src/lib.rs @@ -1310,6 +1310,10 @@ pub unsafe extern "C" fn sandlock_gather_free(g: *mut sandlock_gather_t) { // ---------------------------------------------------------------- /// C-compatible syscall event passed to the policy callback. +/// +/// Path strings are intentionally absent (issue #27); use static Landlock +/// rules for path-based access control. argv is exposed for execve only +/// and is TOCTOU-safe via sibling-thread freeze before Continue. #[repr(C)] pub struct sandlock_event_t { pub syscall: *const c_char, @@ -1317,7 +1321,6 @@ pub struct sandlock_event_t { pub category: u8, pub pid: u32, pub parent_pid: u32, // 0 if unknown - pub path: *const c_char, // NULL if not applicable pub host: *const c_char, // NULL if not applicable pub port: u16, pub denied: bool, @@ -1357,10 +1360,9 @@ pub unsafe extern "C" fn sandlock_policy_builder_policy_fn( let cb_fn = move |event: sandlock_core::policy_fn::SyscallEvent, ctx: &mut sandlock_core::policy_fn::PolicyContext| { let syscall_c = CString::new(event.syscall.as_str()).unwrap_or_default(); - let path_c = event.path.as_deref().and_then(|s| CString::new(s).ok()); let host_c = event.host.map(|ip| CString::new(ip.to_string()).unwrap_or_default()); - // Convert argv to C string array + // Convert argv to C string array. CStrings live until end of closure. let argv_c: Vec = event.argv.as_ref() .map(|args| args.iter().filter_map(|s| CString::new(s.as_str()).ok()).collect()) .unwrap_or_default(); @@ -1379,7 +1381,6 @@ pub unsafe extern "C" fn sandlock_policy_builder_policy_fn( category, pid: event.pid, parent_pid: event.parent_pid.unwrap_or(0), - path: path_c.as_ref().map_or(ptr::null(), |c| c.as_ptr()), host: host_c.as_ref().map_or(ptr::null(), |c| c.as_ptr()), port: event.port.unwrap_or(0), denied: event.denied, diff --git a/python/src/sandlock/_sdk.py b/python/src/sandlock/_sdk.py index ce3350b..8232839 100644 --- a/python/src/sandlock/_sdk.py +++ b/python/src/sandlock/_sdk.py @@ -115,14 +115,15 @@ def _builder_fn(name, *extra_args): _b_hostname = _builder_fn("sandlock_policy_builder_hostname", ctypes.c_char_p) _b_cpu_cores = _builder_fn("sandlock_policy_builder_cpu_cores", ctypes.POINTER(ctypes.c_uint32), ctypes.c_uint32) -# Policy callback (policy_fn) +# Policy callback (policy_fn). +# Path strings absent (issue #27 — path-based control belongs in Landlock). +# argv is populated for execve only; TOCTOU-safe via sibling freeze. class _CEvent(ctypes.Structure): _fields_ = [ ("syscall", ctypes.c_char_p), ("category", ctypes.c_uint8), ("pid", ctypes.c_uint32), ("parent_pid", ctypes.c_uint32), - ("path", ctypes.c_char_p), ("host", ctypes.c_char_p), ("port", ctypes.c_uint16), ("denied", ctypes.c_bool), @@ -395,21 +396,34 @@ def confine(policy: "PolicyDataclass") -> None: @dataclass(frozen=True) class SyscallEvent: - """An intercepted syscall event.""" + """An intercepted syscall event. + + Path strings are intentionally absent: the kernel re-reads user-memory + pointers after a Continue response, so any path-string-based decision + is racy (issue #27). Path-based access control belongs in static + Landlock rules (``fs_readable``, ``fs_writable``, ``fs_denied``). + + ``argv`` *is* exposed for execve/execveat events and is TOCTOU-safe: + the supervisor freezes the calling process's sibling threads via + PTRACE_INTERRUPT before returning Continue, so the kernel's re-read + sees the same memory the supervisor inspected. Siblings die during + execve's de_thread step regardless, so the freeze has no observable + cost. + """ syscall: str category: str # "file", "network", "process", "memory" pid: int parent_pid: int = 0 - path: str | None = None host: str | None = None port: int = 0 argv: tuple[str, ...] | None = None denied: bool = False - def path_contains(self, s: str) -> bool: - return self.path is not None and s in self.path - def argv_contains(self, s: str) -> bool: + """Returns True if any argv element contains ``s``. + + Only meaningful for execve/execveat events. + """ return self.argv is not None and any(s in a for a in self.argv) @@ -898,7 +912,6 @@ def from_dataclass(cls, policy: PolicyDataclass, policy_fn=None, override_hostna if policy_fn is not None: def _c_callback(event_p, ctx_p): ev = event_p.contents - # Extract argv py_argv = None if ev.argv and ev.argc > 0: py_argv = tuple( @@ -912,7 +925,6 @@ def _c_callback(event_p, ctx_p): category=_CATEGORIES.get(ev.category, "file"), pid=ev.pid, parent_pid=ev.parent_pid, - path=ev.path.decode("utf-8") if ev.path else None, host=ev.host.decode("utf-8") if ev.host else None, port=ev.port, argv=py_argv, diff --git a/python/tests/test_policy_fn.py b/python/tests/test_policy_fn.py index 0b84238..c47097e 100644 --- a/python/tests/test_policy_fn.py +++ b/python/tests/test_policy_fn.py @@ -43,20 +43,24 @@ def test_frozen(self): def test_defaults(self): e = SyscallEvent(syscall="clone", category="process", pid=1) - assert e.path is None assert e.host is None assert e.port == 0 assert e.parent_pid == 0 + assert e.argv is None assert e.denied is False - def test_path_contains(self): - e = SyscallEvent(syscall="openat", category="file", pid=1, path="/usr/bin/python3") - assert e.path_contains("python") - assert not e.path_contains("ruby") + def test_argv_contains(self): + e = SyscallEvent( + syscall="execve", category="process", pid=1, + argv=("python3", "-c", "print(1)"), + ) + assert e.argv_contains("python3") + assert e.argv_contains("-c") + assert not e.argv_contains("ruby") - def test_path_contains_none(self): - e = SyscallEvent(syscall="clone", category="process", pid=1) - assert not e.path_contains("anything") + def test_argv_contains_none(self): + e = SyscallEvent(syscall="openat", category="file", pid=1) + assert not e.argv_contains("anything") # ---------------------------------------------------------------------------