Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 29 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -243,28 +243,29 @@ result = (
### Dynamic Policy (policy_fn)

Inspect syscall events at runtime and adjust permissions on the fly.
Each event includes rich metadata: path, host, port, argv, category,
parent PID. The callback returns a verdict to allow, deny, or audit.
Events carry syscall name, category, PID, network destination (for
`connect`/`sendto`/`bind`), and `argv` (for `execve`). The callback
returns a verdict to allow, deny, or audit.

```python
from sandlock import Sandbox, Policy
import errno

def on_event(event, ctx):
# Block download tools
# Block download tools by argv
if event.syscall == "execve" and event.argv_contains("curl"):
return True # deny

# Custom errno for sensitive files
if event.category == "file" and event.path_contains("/secret"):
# Deny connections to a specific IP
if event.syscall == "connect" and event.host == "10.0.0.5":
return errno.EACCES

# Restrict network after setup phase
if event.syscall == "execve" and event.path_contains("untrusted"):
ctx.restrict_network([])
ctx.deny_path("/etc/shadow")
# Lock down once the program has finished starting up
if event.syscall == "execve":
ctx.restrict_network([]) # block all network
ctx.deny_path("/etc/shadow") # dynamic fs deny

# Audit file access (allow but flag)
# Audit every file access (allow but flag)
if event.category == "file":
return "audit"

Expand All @@ -281,7 +282,24 @@ result = Sandbox(policy, policy_fn=on_event).run(["python3", "agent.py"])
positive int = deny with errno, `"audit"`/`-2` = allow + flag.

**Event fields:** `syscall`, `category` (file/network/process/memory),
`pid`, `parent_pid`, `path`, `host`, `port`, `argv`, `denied`.
`pid`, `parent_pid`, `host`, `port`, `argv`, `denied`.

> **TOCTOU note:** Per `seccomp_unotify(2)`, the kernel
> re-reads user-memory pointers after `Continue`. Sandlock handles this
> in two places:
>
> - **Path strings are not exposed on events.** Path-based access control
> belongs in static Landlock rules (`fs_readable` / `fs_writable` /
> `fs_denied`) — kernel-enforced and TOCTOU-immune. Use
> `ctx.deny_path()` for runtime additions.
> - **`event.argv` is exposed and TOCTOU-safe.** Before returning
> `Continue` for an `execve`, the supervisor `PTRACE_SEIZE` +
> `PTRACE_INTERRUPT`s every sibling thread of the calling tid so the
> kernel's re-read happens with no other writer running. The pause
> has no observable cost: `execve`'s `de_thread` step kills sibling
> threads anyway. If the freeze cannot be established (e.g., YAMA
> blocks ptrace), the execve is denied with `EPERM` — the safety
> invariant is never silently relaxed.

**Context methods:**
- `ctx.restrict_network(ips)` / `ctx.grant_network(ips)` — network control
Expand Down
1 change: 1 addition & 0 deletions crates/sandlock-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ pub(crate) mod random;
pub(crate) mod time;
pub(crate) mod cow;
pub(crate) mod checkpoint;
pub(crate) mod sibling_freeze;
pub mod netlink;
pub(crate) mod procfs;
pub(crate) mod port_remap;
Expand Down
74 changes: 57 additions & 17 deletions crates/sandlock-core/src/policy_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,24 @@
//! .fs_read("/usr").fs_read("/lib")
//! .net_allow_host("127.0.0.1")
//! .policy_fn(|event, ctx| {
//! if event.syscall == "execve" && event.path_contains("untrusted") {
//! ctx.restrict_network(&[]); // block all network
//! if event.syscall == "connect" && event.host == Some("10.0.0.5".parse().unwrap()) {
//! return Verdict::Deny;
//! }
//! Verdict::Allow
//! })
//! .build()?;
//! ```
//!
//! # TOCTOU and string-typed fields
//!
//! Path strings, which the kernel will re-read after a `Continue`
//! response (per `seccomp_unotify(2)`), are not exposed on this event.
//! Path-based access control belongs in static Landlock rules
//! (`fs_read`/`fs_write`/`fs_deny`), which the kernel enforces directly
//! and which are not subject to user-memory races. Network fields
//! (`host`, `port`) are TOCTOU-safe because the supervisor performs
//! `connect`/`sendto`/`bind` on-behalf via `pidfd_getfd` and the kernel
//! never re-reads child memory for those syscalls. `argv` is exposed
//! for `execve`/`execveat` only: sibling threads of the caller are
//! frozen before `Continue` is returned (see `crate::sibling_freeze`).

use std::collections::{HashMap, HashSet};
use std::net::IpAddr;
Expand Down Expand Up @@ -41,6 +53,26 @@ pub enum SyscallCategory {
// ============================================================

/// An intercepted syscall event observed by the seccomp supervisor.
///
/// # TOCTOU and string-typed fields
///
/// Path strings are deliberately absent. Per `seccomp_unotify(2)`, the
/// kernel re-reads user-memory pointers after a `Continue` response, so
/// any path-string-based decision is racy in a multi-threaded child.
/// Path-based access control belongs in static Landlock rules
/// (`fs_read` / `fs_write` / `fs_deny`); see issue #27.
///
/// `argv` *is* exposed for `execve`/`execveat` and is TOCTOU-safe by
/// construction: before the supervisor returns `Continue` for an
/// execve, it `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every sibling thread
/// of the calling tid so the kernel's post-Continue re-read sees the
/// same memory the supervisor inspected. Siblings are killed by the
/// kernel during execve's `de_thread` step anyway, so the pause has
/// no observable cost. See `crate::sibling_freeze`.
///
/// Network fields (`host`, `port`) are TOCTOU-safe because the
/// supervisor performs `connect`/`sendto`/`bind` on-behalf via
/// `pidfd_getfd` and the kernel never re-reads child memory for those.
#[derive(Debug, Clone)]
pub struct SyscallEvent {
/// Syscall name (e.g., "connect", "openat", "execve", "clone").
Expand All @@ -51,27 +83,22 @@ pub struct SyscallEvent {
pub pid: u32,
/// Parent PID (read from /proc/{pid}/stat).
pub parent_pid: Option<u32>,
/// Resolved filesystem path (for openat, execve, etc.).
pub path: Option<String>,
/// Destination IP address (for connect, sendto).
/// Destination IP address (for connect, sendto). TOCTOU-safe.
pub host: Option<IpAddr>,
/// Destination port (for connect, sendto, bind).
/// Destination port (for connect, sendto, bind). TOCTOU-safe.
pub port: Option<u16>,
/// Size argument (for mmap, brk).
pub size: Option<u64>,
/// Command arguments (for execve/execveat).
/// Command arguments for execve/execveat. TOCTOU-safe: sibling
/// threads are frozen before the kernel re-reads.
pub argv: Option<Vec<String>>,
/// Whether the supervisor denied this syscall.
pub denied: bool,
}

impl SyscallEvent {
/// Check if the path contains a substring.
pub fn path_contains(&self, s: &str) -> bool {
self.path.as_ref().map_or(false, |p| p.contains(s))
}

/// Check if any argv element contains a substring.
/// Returns true if any argv element contains the given substring.
/// Only meaningful for execve/execveat events (where argv is populated).
pub fn argv_contains(&self, s: &str) -> bool {
    // Events without an argv never match.
    match self.argv.as_deref() {
        Some(args) => args.iter().any(|arg| arg.contains(s)),
        None => false,
    }
}
Expand Down Expand Up @@ -434,13 +461,12 @@ mod tests {
}

#[test]
fn test_event_path_contains() {
fn test_event_argv_contains() {
let event = SyscallEvent {
syscall: "execve".to_string(),
category: SyscallCategory::Process,
pid: 1,
parent_pid: Some(0),
path: Some("/usr/bin/python3".to_string()),
host: None,
port: None,
size: None,
Expand All @@ -451,7 +477,21 @@ mod tests {
assert!(event.argv_contains("-c"));
assert!(!event.argv_contains("ruby"));
assert_eq!(event.category, SyscallCategory::Process);
assert!(event.path_contains("python"));
assert!(!event.path_contains("ruby"));
}

// An event whose `argv` is `None` must never report a substring match.
#[test]
fn test_event_argv_contains_none() {
// Build an `openat` file event with no argv attached.
let event = SyscallEvent {
syscall: "openat".to_string(),
category: SyscallCategory::File,
pid: 1,
parent_pid: None,
host: None,
port: None,
size: None,
argv: None,
denied: false,
};
// With `argv` absent, `argv_contains` takes its `false` fallback.
assert!(!event.argv_contains("anything"));
}
}
71 changes: 52 additions & 19 deletions crates/sandlock-core/src/seccomp/notif.rs
Original file line number Diff line number Diff line change
Expand Up @@ -729,15 +729,15 @@ fn read_sockaddr_for_event(notif: &SeccompNotif, addr: u64, len: usize, notif_fd
(ip, if port > 0 { Some(port) } else { None })
}

/// Read argv (array of string pointers) from child memory for execve.
/// execve(path, argv, envp): argv is a NULL-terminated array of char* pointers.
/// Read argv (NULL-terminated array of char* in child memory) for execve.
/// Capped at 64 entries × 256 bytes/entry as a safety bound.
fn read_argv_for_event(notif: &SeccompNotif, argv_ptr: u64, notif_fd: RawFd) -> Option<Vec<String>> {
if argv_ptr == 0 { return None; }
let mut args = Vec::new();
let ptr_size = std::mem::size_of::<u64>();

for i in 0..64 { // safety limit
let ptr_addr = argv_ptr + (i * ptr_size) as u64;
for i in 0..64u64 {
let ptr_addr = argv_ptr + i * ptr_size as u64;
let ptr_bytes = read_child_mem(notif_fd, notif.id, notif.pid, ptr_addr, ptr_size).ok()?;
let str_ptr = u64::from_ne_bytes(ptr_bytes[..8].try_into().ok()?);
if str_ptr == 0 { break; } // NULL terminator
Expand Down Expand Up @@ -773,27 +773,34 @@ async fn emit_policy_event(
let category = syscall_category(nr);
let parent_pid = read_ppid(notif.pid);

// Extract metadata based on syscall type
let mut path = None;
// Extract metadata based on syscall type.
//
// Path strings are deliberately NOT extracted: the kernel re-reads
// user-memory pointers after Continue, so any path-string-based
// decision is racy (issue #27). Path-based access control belongs
// in static Landlock rules.
//
// argv IS extracted for execve/execveat: the supervisor freezes
// sibling threads before returning Continue (sibling_freeze module),
// so the post-Continue re-read sees the same memory we read here.
//
// Network fields are TOCTOU-safe because connect/sendto/bind are
// performed on-behalf via pidfd_getfd; the kernel never re-reads
// child memory for those syscalls.
let mut host = None;
let mut port = None;
let mut size = None;
let mut argv = None;

if nr == libc::SYS_openat || Some(nr) == arch::SYS_OPEN || nr == libc::SYS_execve || nr == libc::SYS_execveat {
// openat(dirfd, pathname, ...): args[1] = path ptr
// execve(pathname, argv, envp): args[0] = path ptr, args[1] = argv ptr
let path_ptr = if nr == libc::SYS_openat {
notif.data.args[1]
if nr == libc::SYS_execve || nr == libc::SYS_execveat {
// execve(pathname, argv, envp): args[1] = argv ptr
// execveat(dirfd, pathname, argv, ..): args[2] = argv ptr
let argv_ptr = if nr == libc::SYS_execveat {
notif.data.args[2]
} else {
notif.data.args[0]
notif.data.args[1]
};
path = read_path_for_event(notif, path_ptr, notif_fd);

// Extract argv for execve/execveat
if nr == libc::SYS_execve || nr == libc::SYS_execveat {
argv = read_argv_for_event(notif, notif.data.args[1], notif_fd);
}
argv = read_argv_for_event(notif, argv_ptr, notif_fd);
}

if nr == libc::SYS_connect || nr == libc::SYS_sendto || nr == libc::SYS_bind {
Expand All @@ -815,7 +822,6 @@ async fn emit_policy_event(
category,
pid: notif.pid,
parent_pid,
path,
host,
port,
size,
Expand Down Expand Up @@ -910,6 +916,33 @@ async fn handle_notification(
}
}

// TOCTOU-close for execve (issue #27): freeze sibling threads of
// the calling tid before the kernel re-reads pathname/argv from
// child memory. Cheap because the kernel's de_thread step in
// execve kills the siblings anyway — we're just stopping them
// moments earlier, closing the race window for the supervisor's
// argv inspection in policy_fn.
//
// Only relevant when we're sending Continue: a denial response
// (Errno) means the kernel never re-reads, so no freeze needed.
//
// Strict on failure: if we cannot freeze the siblings, we cannot
// uphold the argv-safety invariant, so we deny the execve with
// EPERM rather than letting it through unprotected.
let nr = notif.data.nr as i64;
if matches!(action, NotifAction::Continue)
&& crate::sibling_freeze::requires_freeze_on_continue(nr)
{
if let Err(e) = crate::sibling_freeze::freeze_siblings_for_execve(notif.pid as i32) {
eprintln!(
"sandlock: argv-safety freeze failed for pid {}: {} \
— denying execve to preserve TOCTOU invariant",
notif.pid, e
);
action = NotifAction::Errno(libc::EPERM);
}
}

// Ignore error — child may have exited between recv and response.
let _ = send_response(fd, notif.id, action);
}
Expand Down
Loading
Loading