diff --git a/Cargo.lock b/Cargo.lock
index 3be086f..5b1c65a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -247,6 +247,7 @@ dependencies = [
  "env_logger",
  "humantime",
  "log",
+ "murmur2",
  "rustix",
  "safe-fork",
  "serde",
@@ -256,6 +257,7 @@ dependencies = [
  "tokio-stream",
  "udev",
  "walkdir",
+ "zerocopy",
 ]
 
 [[package]]
@@ -483,6 +485,12 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "murmur2"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb585ade2549a017db2e35978b77c319214fa4b37cede841e27954dd6e8f3ca8"
+
 [[package]]
 name = "object"
 version = "0.36.7"
@@ -1012,3 +1020,23 @@ name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "zerocopy"
+version = "0.8.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
diff --git a/Cargo.toml b/Cargo.toml
index 8b11d53..1e68cc9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,13 +17,15 @@ tokio = { version = "1", features = ["full"] }
 tokio-stream = "0.1"
 async-stream = "0.3"
 udev = "0.9"
-rustix = { version = "1", features = ["fs", "stdio", "process", "thread", "pipe", "mount"] }
+rustix = { version = "1", features = ["fs", "stdio", "process", "thread", "pipe", "mount", "net"] }
 bitflags = "2"
 humantime = "2"
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 safe-fork = "0.1.1"
 aya = "0.13"
+murmur2 = "0.1.0"
+zerocopy = { version = "0.8.24", features = ["derive"] }
 
 [build-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
diff --git a/src/hotplug/kobject_uevent.rs b/src/hotplug/kobject_uevent.rs
new file mode 100644
index 0000000..8c2bda5
--- /dev/null
+++ b/src/hotplug/kobject_uevent.rs
@@ -0,0 +1,123 @@
+use std::io::{IoSlice, Write};
+use std::os::fd::OwnedFd;
+
+use anyhow::Result;
+use rustix::net::{AddressFamily, SendFlags, SocketType, netlink::SocketAddrNetlink};
+use zerocopy::{Immutable, IntoBytes};
+
+use crate::util::namespace::NetNamespace;
+
+// This needs to be compatible with
+// https://github.com/systemd/systemd/blob/main/src/libsystemd/sd-device/device-monitor.c.
+#[repr(C)]
+#[derive(Immutable, IntoBytes)]
+struct MonitorNetlinkHeader {
+    /// "libudev" prefix to distinguish libudev and kernel messages.
+    prefix: [u8; 8],
+    /// Magic to protect against daemon <-> Library message format mismatch
+    /// Used in the kernel from socket filter rules; needs to be stored in network order.
+    magic: u32,
+    /// Total length of header structure known to the sender.
+    header_size: u32,
+    /// Properties string buffer
+    properties_off: u32,
+    properties_len: u32,
+    /// Hashes of primary device properties strings, to let libudev subscribers
+    /// use in-kernel socket filters; values need to be stored in network order.
+    filter_subsystem_hash: u32,
+    filter_devtype_hash: u32,
+    filter_tag_bloom_hi: u32,
+    filter_tag_bloom_lo: u32,
+}
+
+/// Udev netlink event sender.
+///
+/// When a device is added/removed, after processing rules, `systemd-udevd` will send a netlink
+/// message to `kobject_uevent` netlink socket. This is picked up by libudev monitor users.
+///
+/// This netlink socket is namespaced, so udevd-sent messages are not observed by the container.
+/// This sender takes the place of udevd and ensures that libudev users inside the container may
+/// see the device add/removal event after being processed by container-hotplug.
+pub struct UdevSender {
+    socket: OwnedFd,
+    seq_num: u64,
+    ns: NetNamespace,
+}
+
+impl UdevSender {
+    /// Open a `kobject_uevent` netlink socket while inside the namespace `ns`.
+    pub fn new(ns: NetNamespace) -> Result<Self> {
+        let socket = ns.with(|| {
+            rustix::net::socket(
+                AddressFamily::NETLINK,
+                SocketType::DGRAM,
+                Some(rustix::net::netlink::KOBJECT_UEVENT),
+            )
+        })??;
+
+        Ok(Self {
+            socket,
+            seq_num: 0,
+            ns,
+        })
+    }
+
+    /// Send a libudev-format uevent for `device`, with `event` (e.g. "add", "remove") as action.
+    ///
+    /// `ACTION` and `SEQNUM` are generated here; every other property is copied from `device`.
+    pub fn send(&mut self, device: &udev::Device, event: &str) -> Result<()> {
+        self.seq_num += 1;
+
+        let mut properties = Vec::new();
+        write!(properties, "ACTION={event}\0SEQNUM={}\0", self.seq_num)?;
+        for property in device.properties() {
+            // These properties are specially handled.
+            if property.name() == "ACTION" || property.name() == "SEQNUM" {
+                continue;
+            }
+            properties.extend_from_slice(property.name().as_encoded_bytes());
+            properties.push(b'=');
+            properties.extend_from_slice(property.value().as_encoded_bytes());
+            properties.push(0);
+        }
+        let header = MonitorNetlinkHeader {
+            prefix: *b"libudev\0",
+            magic: 0xFEEDCAFEu32.to_be(),
+            header_size: std::mem::size_of::<MonitorNetlinkHeader>() as u32,
+            properties_off: std::mem::size_of::<MonitorNetlinkHeader>() as u32,
+            properties_len: u32::try_from(properties.len())?,
+            filter_subsystem_hash: device
+                .subsystem()
+                .map(|x| murmur2::murmur2ne(x.as_encoded_bytes(), 0).to_be())
+                .unwrap_or_default(),
+            filter_devtype_hash: device
+                .devtype()
+                .map(|x| murmur2::murmur2ne(x.as_encoded_bytes(), 0).to_be())
+                .unwrap_or_default(),
+            // Don't bother computing the value in the same way as systemd,
+            // just be conservative and always make it match -- this is an optimisation anyway.
+            filter_tag_bloom_hi: 0xFFFFFFFF,
+            filter_tag_bloom_lo: 0xFFFFFFFF,
+        };
+
+        // We re-enter the namespace to obtain root UID/GID so it'll be trusted by libudev.
+        // Otherwise, when userns is used, we're the global root which is mapped to nobody in the
+        // container. libudev will use SCM credentials to check for the sender and identify if the
+        // message is to be trusted.
+        //
+        // Technically just changing UID/GID is sufficient and network namespace re-entering isn't
+        // necessary -- but there's no harm in doing so and it makes code simpler.
+        self.ns.with(|| {
+            rustix::net::sendmsg_addr(
+                &self.socket,
+                // pid 0 with group mask 2: the multicast group libudev monitors subscribe to for
+                // udevd-processed events (`MONITOR_GROUP_UDEV` in systemd's device-monitor.c).
+                &SocketAddrNetlink::new(0, 2),
+                &[IoSlice::new(header.as_bytes()), IoSlice::new(&properties)],
+                &mut Default::default(),
+                SendFlags::empty(),
+            )
+        })??;
+        Ok(())
+    }
+}
diff --git a/src/hotplug/mod.rs b/src/hotplug/mod.rs
index 048f94e..236e4f5 100644
--- a/src/hotplug/mod.rs
+++ b/src/hotplug/mod.rs
@@ -1,5 +1,7 @@
 mod attached_device;
+mod kobject_uevent;
 pub use attached_device::AttachedDevice;
+pub use kobject_uevent::UdevSender;
 
 use std::collections::HashMap;
 use std::path::PathBuf;
@@ -20,6 +22,7 @@ pub struct HotPlug {
     symlinks: Vec,
     monitor: DeviceMonitor,
     devices: HashMap,
+    udev_sender: UdevSender,
 }
 
 impl HotPlug {
@@ -31,11 +34,16 @@ impl HotPlug {
         let monitor = DeviceMonitor::new(hub_path.clone())?;
         let devices = Default::default();
 
+        let udev_sender = UdevSender::new(crate::util::namespace::NetNamespace::of_pid(
+            container.pid(),
+        )?)?;
+
         Ok(Self {
             container,
             symlinks,
             monitor,
             devices,
+            udev_sender,
         })
     }
 
@@ -80,6 +88,8 @@ impl HotPlug {
             self.container.symlink(&devnode.path, symlink).await?;
         }
 
+        self.udev_sender.send(device.udev(), "add")?;
+
         let syspath = device.syspath().to_owned();
         let device = AttachedDevice { device, symlinks };
         self.devices.insert(syspath, device.clone());
@@ -100,6 +110,8 @@ impl HotPlug {
             self.container.rm(symlink).await?;
         }
 
+        self.udev_sender.send(device.udev(), "remove")?;
+
         Ok(Some(Event::Detach(device)))
     }
 }
diff --git a/src/runc/container.rs b/src/runc/container.rs
index af6f764..eadc23c 100644
--- a/src/runc/container.rs
+++ b/src/runc/container.rs
@@ -125,6 +125,10 @@ impl Container {
         Ok(container)
     }
 
+    pub fn pid(&self) -> Pid {
+        self.pid
+    }
+
     /// Remount /dev inside the init namespace.
     ///
     /// When user namespace is used, the /dev created by runc will be mounted inside the user namespace,
diff --git a/src/util/namespace.rs b/src/util/namespace.rs
index ba45c78..42cf7a3 100644
--- a/src/util/namespace.rs
+++ b/src/util/namespace.rs
@@ -153,3 +153,51 @@ impl MntNamespace {
         })
     }
 }
+
+/// Handle to the network namespace of a process, together with its user namespace.
+pub struct NetNamespace {
+    net_fd: File,
+    user_ns: UserNamespace,
+}
+
+impl NetNamespace {
+    /// Open the network namespace of a process.
+    pub fn of_pid(pid: Pid) -> Result<Self> {
+        let net_fd = File::open(format!("/proc/{}/ns/net", pid.as_raw_nonzero()))?;
+        let user_ns = UserNamespace::of_pid(pid)?;
+        Ok(NetNamespace { net_fd, user_ns })
+    }
+
+    /// Enter the network namespace.
+    ///
+    /// This operation is not reversible.
+    pub fn enter(&self) -> Result<()> {
+        // Switch this particular thread to the container's network namespace.
+        rustix::thread::move_into_link_name_space(
+            self.net_fd.as_fd(),
+            Some(LinkNameSpaceType::Network),
+        )?;
+
+        // Similar to mount namespace, we also want to behave as container root.
+        // This is so that SCM credentials are seen properly.
+        self.user_ns.enter()?;
+        Ok(())
+    }
+
+    /// Execute `f` inside the network namespace and return its result.
+    ///
+    /// Fails if the namespace cannot be entered or if the worker thread panics.
+    pub fn with<T: Send, F: FnOnce() -> T + Send>(&self, f: F) -> Result<T> {
+        // To avoid messing with rest of the process, we do everything in a new thread.
+        // Use scoped thread to avoid 'static bound (we need to access fd).
+        std::thread::scope(|scope| {
+            scope
+                .spawn(|| -> Result<T> {
+                    self.enter()?;
+                    Ok(f())
+                })
+                .join()
+                .map_err(|_| anyhow::anyhow!("work thread panicked"))?
+        })
+    }
+}