Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
309 changes: 154 additions & 155 deletions crates/sandlock-core/src/cow/dispatch.rs

Large diffs are not rendered by default.

65 changes: 36 additions & 29 deletions crates/sandlock-core/src/procfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use std::sync::Arc;
use tokio::sync::Mutex;

use crate::seccomp::notif::{read_child_cstr, write_child_mem, NotifAction, NotifPolicy};
use crate::seccomp::state::{NetworkState, ProcfsState};
use crate::seccomp::state::{NetworkState, ProcessIndex};
use crate::sys::structs::{SeccompNotif, EACCES};
use crate::sys::syscall;

Expand Down Expand Up @@ -380,7 +380,7 @@ fn read_path(notif: &SeccompNotif, addr: u64, notif_fd: RawFd) -> Option<String>
/// - Lets everything else through.
pub(crate) async fn handle_proc_open(
notif: &SeccompNotif,
procfs: &Arc<Mutex<ProcfsState>>,
processes: &Arc<ProcessIndex>,
resource: &Arc<Mutex<crate::seccomp::state::ResourceState>>,
network: &Arc<Mutex<NetworkState>>,
policy: &NotifPolicy,
Expand All @@ -404,8 +404,7 @@ pub(crate) async fn handle_proc_open(
// already hide non-sandbox PIDs, but without this check a process
// could still open /proc/{ppid}/cmdline (or any guessed PID) directly.
if let Some(pid) = extract_proc_pid(&path) {
let pfs = procfs.lock().await;
if !pfs.proc_pids.contains(&pid) {
if !processes.contains(pid) {
return NotifAction::Errno(EACCES);
}
}
Expand Down Expand Up @@ -435,11 +434,10 @@ pub(crate) async fn handle_proc_open(

// Virtualize /proc/loadavg when proc virtualization is active.
if path == "/proc/loadavg" {
let pfs = procfs.lock().await;
let total = processes.len() as u32;
let last_pid = processes.max_pid().unwrap_or(0);
let rs = resource.lock().await;
let total = pfs.proc_pids.len() as u32;
let running = rs.proc_count;
let last_pid = pfs.proc_pids.iter().max().copied().unwrap_or(0);
let content = generate_loadavg(&rs.load_avg, running, total, last_pid);
return inject_memfd(&content);
}
Expand Down Expand Up @@ -612,7 +610,7 @@ pub(crate) fn handle_etc_hosts_open(
/// regardless of filesystem internals.
pub(crate) async fn handle_sorted_getdents(
notif: &SeccompNotif,
procfs: &Arc<Mutex<ProcfsState>>,
processes: &Arc<ProcessIndex>,
notif_fd: RawFd,
) -> NotifAction {
let pid = notif.pid;
Expand All @@ -625,16 +623,17 @@ pub(crate) async fn handle_sorted_getdents(
Ok(t) => t,
Err(_) => return NotifAction::Continue,
};
let cache_key = (
pid as i32,
child_fd,
dir_path.to_string_lossy().into_owned(),
);
let mut pfs = procfs.lock().await;

let entry = match processes.entry_for(pid as i32) {
Some(e) => e,
None => return NotifAction::Continue,
};
let cache_key = (child_fd, dir_path.to_string_lossy().into_owned());
let mut perproc = entry.1.lock().await;

// Build and cache sorted entries on first call for this open directory.
// Remove an empty cache on EOF so later fd reuse can rebuild entries.
if !pfs.getdents_cache.contains_key(&cache_key) {
if !perproc.procfs_dir_cache.contains_key(&cache_key) {
let dir = match std::fs::read_dir(&dir_path) {
Ok(d) => d,
Err(_) => return NotifAction::Continue,
Expand Down Expand Up @@ -679,17 +678,17 @@ pub(crate) async fn handle_sorted_getdents(
})
.collect();

pfs.getdents_cache.insert(cache_key.clone(), entries);
perproc.procfs_dir_cache.insert(cache_key.clone(), entries);
}

let entries = match pfs.getdents_cache.get_mut(&cache_key) {
let entries = match perproc.procfs_dir_cache.get_mut(&cache_key) {
Some(e) => e,
None => return NotifAction::Continue,
};

// Empty cache = already fully drained on a prior call → return 0 (EOF).
if entries.is_empty() {
pfs.getdents_cache.remove(&cache_key);
perproc.procfs_dir_cache.remove(&cache_key);
return NotifAction::ReturnValue(0);
}

Expand All @@ -708,7 +707,7 @@ pub(crate) async fn handle_sorted_getdents(
entries.drain(..consumed);
}

drop(pfs);
drop(perproc);

if !result.is_empty() {
if write_child_mem(notif_fd, notif.id, pid, buf_addr, &result).is_err() {
Expand Down Expand Up @@ -795,7 +794,7 @@ fn build_filtered_dirents(sandbox_pids: &HashSet<i32>) -> Vec<Vec<u8>> {
/// set of entries that hides PIDs not belonging to the sandbox.
pub(crate) async fn handle_getdents(
notif: &SeccompNotif,
procfs: &Arc<Mutex<ProcfsState>>,
processes: &Arc<ProcessIndex>,
_policy: &NotifPolicy,
notif_fd: RawFd,
) -> NotifAction {
Expand All @@ -814,16 +813,24 @@ pub(crate) async fn handle_getdents(
return NotifAction::Continue;
}

let cache_key = (pid as i32, child_fd, target.to_string_lossy().into_owned());
let mut pfs = procfs.lock().await;
let entry = match processes.entry_for(pid as i32) {
Some(e) => e,
None => return NotifAction::Continue,
};
let cache_key = (child_fd, target.to_string_lossy().into_owned());
let mut perproc = entry.1.lock().await;

// Build and cache entries on first call for this (pid, fd) pair.
if !pfs.getdents_cache.contains_key(&cache_key) {
let entries = build_filtered_dirents(&pfs.proc_pids);
pfs.getdents_cache.insert(cache_key.clone(), entries);
// Build and cache entries on first call for this (fd, target) pair.
if !perproc.procfs_dir_cache.contains_key(&cache_key) {
// Snapshot sandbox PIDs while the per-process lock is still held.
// This does not extend the hold meaningfully: pids_snapshot only
// takes the ProcessIndex read lock briefly.
let snapshot = processes.pids_snapshot();
let entries = build_filtered_dirents(&snapshot);
perproc.procfs_dir_cache.insert(cache_key.clone(), entries);
}

let entries = match pfs.getdents_cache.get_mut(&cache_key) {
let entries = match perproc.procfs_dir_cache.get_mut(&cache_key) {
Some(e) => e,
None => return NotifAction::Continue,
};
Expand All @@ -841,15 +848,15 @@ pub(crate) async fn handle_getdents(

// Empty cache = already fully drained on a prior call → return 0 (EOF).
if entries.is_empty() {
pfs.getdents_cache.remove(&cache_key);
perproc.procfs_dir_cache.remove(&cache_key);
return NotifAction::ReturnValue(0);
}

if consumed > 0 {
entries.drain(..consumed);
}

drop(pfs);
drop(perproc);

// Write the result into the child's buffer and return the byte count.
if !result.is_empty() {
Expand Down
76 changes: 59 additions & 17 deletions crates/sandlock-core/src/resource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
use std::sync::Arc;
use tokio::sync::Mutex;

use crate::seccomp::notif::{NotifAction, NotifPolicy};
use crate::seccomp::state::{ProcfsState, ResourceState};
use crate::seccomp::ctx::SupervisorCtx;
use crate::seccomp::notif::{spawn_pid_watcher, NotifAction, NotifPolicy};
use crate::seccomp::state::ResourceState;
use crate::sys::structs::{
SeccompNotif, CLONE_NS_FLAGS, EAGAIN, EPERM,
};
Expand All @@ -17,13 +18,17 @@ const MAP_ANONYMOUS: u64 = 0x20;

/// Handle fork/clone/vfork notifications.
///
/// Enforces namespace creation ban, process limits, and checkpoint hold.
/// Needs both `ResourceState` (for proc_count, hold_forks, etc.) and
/// `ProcfsState` (for proc_pids).
/// Enforces namespace creation ban and process limits, then bumps
/// `proc_count` for the pending child. It does not register the child
/// in `ProcessIndex` or spawn a pidfd watcher itself.
///
/// Note: `notif.pid` here is the *parent* (the task issuing
/// clone/fork). The kernel hasn't run the syscall yet, so we don't
/// know the child's pid. The child is discovered and registered later,
/// on its first own seccomp notification, via `register_child_if_new`.
pub(crate) async fn handle_fork(
notif: &SeccompNotif,
resource: &Arc<Mutex<ResourceState>>,
procfs: &Arc<Mutex<ProcfsState>>,
_policy: &NotifPolicy,
) -> NotifAction {
let nr = notif.data.nr as i64;
Expand Down Expand Up @@ -55,12 +60,39 @@ pub(crate) async fn handle_fork(
}

rs.proc_count += 1;
drop(rs);
NotifAction::Continue
}

/// If `notif.pid` is not yet tracked in the ProcessIndex, register
/// it: open a pidfd, record the canonical PidKey, and spawn the exit
/// watcher. Called from the supervisor's notification dispatcher
/// before per-syscall handlers run, so handlers can rely on
/// `ProcessIndex::key_for(notif.pid)` returning a fresh PidKey.
///
/// The fast path is a single `RwLock` read: if the pid is already
/// tracked, we trust the entry. PID-identity correctness comes from
/// the per-child pidfd watcher — a process can't issue notifications
/// after it has exited, and the kernel won't recycle a PID until the
/// parent has waited (which we observe), so a stale entry has no
/// window in which to be hit. We deliberately do *not* re-stat
/// /proc/<pid>/stat on every notification.
pub(crate) async fn register_child_if_new(ctx: &Arc<SupervisorCtx>, pid: i32) {
if ctx.processes.contains(pid) {
return;
}

let mut pfs = procfs.lock().await;
pfs.proc_pids.insert(notif.pid as i32);
let pidfd = match crate::sys::syscall::pidfd_open(pid as u32, 0) {
Ok(fd) => fd,
Err(_) => return, // old kernel or process gone — GC backstop will clean up
};

NotifAction::Continue
let key = match ctx.processes.register(pid) {
Some(k) => k,
None => return, // process exited between pidfd_open and stat read
};

// Hand the pidfd to the watcher; it owns the fd's lifetime now.
spawn_pid_watcher(Arc::clone(ctx), key, pidfd);
}

/// Handle wait4/waitid notifications — decrement the concurrent process count.
Expand All @@ -82,14 +114,14 @@ pub(crate) async fn handle_wait(
/// Tracks anonymous memory usage and enforces the configured memory limit.
pub(crate) async fn handle_memory(
notif: &SeccompNotif,
resource: &Arc<Mutex<ResourceState>>,
ctx: &Arc<SupervisorCtx>,
policy: &NotifPolicy,
) -> NotifAction {
let nr = notif.data.nr as i64;
let args = &notif.data.args;
let limit = policy.max_memory_bytes;

let mut st = resource.lock().await;
let mut st = ctx.resource.lock().await;

let kill = NotifAction::Kill { sig: libc::SIGKILL, pgid: notif.pid as i32 };

Expand All @@ -110,26 +142,36 @@ pub(crate) async fn handle_memory(
} else if nr == libc::SYS_brk {
// args[0] = new_brk
let new_brk = args[0];
let pid = notif.pid as i32;

if new_brk == 0 {
// Query: return Continue, kernel handles it.
return NotifAction::Continue;
}

let base = *st.brk_bases.entry(pid).or_insert(new_brk);

// Per-process brk base is in PerProcessState. Drop the global
// ResourceState lock first to avoid lock ordering issues with
// the per-process lock acquired below (per-process first,
// then global, when both are needed).
drop(st);
let entry = match ctx.processes.entry_for(notif.pid as i32) {
Some(e) => e,
None => return NotifAction::Continue,
};
let mut perproc = entry.1.lock().await;
let mut st = ctx.resource.lock().await;

let base = *perproc.brk_base.get_or_insert(new_brk);
if new_brk > base {
let delta = new_brk - base;
if st.mem_used.saturating_add(delta) > limit {
return kill;
}
st.mem_used += delta;
st.brk_bases.insert(pid, new_brk);
perproc.brk_base = Some(new_brk);
} else if new_brk < base {
let delta = base - new_brk;
st.mem_used = st.mem_used.saturating_sub(delta);
st.brk_bases.insert(pid, new_brk);
perproc.brk_base = Some(new_brk);
}
} else if nr == libc::SYS_mremap {
// args[1] = old_len, args[2] = new_len
Expand Down
9 changes: 6 additions & 3 deletions crates/sandlock-core/src/sandbox.rs
Original file line number Diff line number Diff line change
Expand Up @@ -957,9 +957,8 @@ impl Sandbox {
net_state.port_map.on_bind = Some(cb);
}

// ProcfsState
let mut procfs_state = ProcfsState::new();
procfs_state.proc_pids.insert(pid);
// ProcfsState (sandbox membership lives in ProcessIndex now).
let procfs_state = ProcfsState::new();

// ResourceState
let mut res_state = ResourceState::new(
Expand Down Expand Up @@ -1029,6 +1028,9 @@ impl Sandbox {
let time_random_state = Arc::new(Mutex::new(time_random_state));
let policy_fn_state = Arc::new(Mutex::new(policy_fn_state));
let chroot_state = Arc::new(Mutex::new(chroot_state));
// Root child is registered (with watcher) on its first
// notification, the same path grandchildren take.
let processes = Arc::new(crate::seccomp::state::ProcessIndex::new());

let ctx = Arc::new(SupervisorCtx {
resource: Arc::clone(&res_state),
Expand All @@ -1039,6 +1041,7 @@ impl Sandbox {
policy_fn: Arc::clone(&policy_fn_state),
chroot: Arc::clone(&chroot_state),
netlink: Arc::new(crate::netlink::NetlinkState::new()),
processes: Arc::clone(&processes),
policy: Arc::new(notif_policy),
child_pidfd: child_pidfd_raw,
notif_fd: notif_raw_fd,
Expand Down
10 changes: 9 additions & 1 deletion crates/sandlock-core/src/seccomp/ctx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ use std::sync::Arc;
use tokio::sync::Mutex;

use super::notif::NotifPolicy;
use super::state::{ChrootState, CowState, NetworkState, PolicyFnState, ProcfsState, ResourceState, TimeRandomState};
use super::state::{
ChrootState, CowState, NetworkState, PolicyFnState, ProcessIndex, ProcfsState, ResourceState,
TimeRandomState,
};

/// Holds all supervisor state and policy. Passed to every handler.
pub struct SupervisorCtx {
Expand All @@ -23,6 +26,11 @@ pub struct SupervisorCtx {
pub chroot: Arc<Mutex<ChrootState>>,
/// NETLINK_ROUTE virtualization state.
pub netlink: Arc<crate::netlink::NetlinkState>,
/// Per-process registry: pid → PidKey. Source of truth for
/// "which processes are in the sandbox" and the anchor for
/// unified per-process state cleanup. Wraps an internal RwLock,
/// so handlers can query it synchronously without `.await`.
pub processes: Arc<ProcessIndex>,
/// Immutable policy — no lock needed.
pub policy: Arc<NotifPolicy>,
/// pidfd for the child process (immutable after spawn).
Expand Down
Loading
Loading