From 11c0114a9532c35e282054ba7b690e4626447de6 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Wed, 9 Apr 2025 12:03:34 +0100 Subject: [PATCH] Propagate udev messages into the container By default, in terms of kobject uevent messages, containers will: * Without userns: see kernel messages but not udev messages * With userns: see no messages at all And this creates trouble for libraries such as libusb, which relies on these netlink messages to detect device updates. We fix this by filling the gap for systemd and sending out our own udev messages. --- Cargo.lock | 28 ++++++++ Cargo.toml | 4 +- src/hotplug/kobject_uevent.rs | 117 ++++++++++++++++++++++++++++++++++ src/hotplug/mod.rs | 12 ++++ src/runc/container.rs | 4 ++ src/util/namespace.rs | 45 +++++++++++++ 6 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 src/hotplug/kobject_uevent.rs diff --git a/Cargo.lock b/Cargo.lock index 3be086f..5b1c65a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,6 +247,7 @@ dependencies = [ "env_logger", "humantime", "log", + "murmur2", "rustix", "safe-fork", "serde", @@ -256,6 +257,7 @@ dependencies = [ "tokio-stream", "udev", "walkdir", + "zerocopy", ] [[package]] @@ -483,6 +485,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "murmur2" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb585ade2549a017db2e35978b77c319214fa4b37cede841e27954dd6e8f3ca8" + [[package]] name = "object" version = "0.36.7" @@ -1012,3 +1020,23 @@ name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = 
"0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index 8b11d53..1e68cc9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,13 +17,15 @@ tokio = { version = "1", features = ["full"] } tokio-stream = "0.1" async-stream = "0.3" udev = "0.9" -rustix = { version = "1", features = ["fs", "stdio", "process", "thread", "pipe", "mount"] } +rustix = { version = "1", features = ["fs", "stdio", "process", "thread", "pipe", "mount", "net"] } bitflags = "2" humantime = "2" serde = { version = "1", features = ["derive"] } serde_json = "1" safe-fork = "0.1.1" aya = "0.13" +murmur2 = "0.1.0" +zerocopy = { version = "0.8.24", features = ["derive"] } [build-dependencies] anyhow = { version = "1", features = ["backtrace"] } diff --git a/src/hotplug/kobject_uevent.rs b/src/hotplug/kobject_uevent.rs new file mode 100644 index 0000000..8c2bda5 --- /dev/null +++ b/src/hotplug/kobject_uevent.rs @@ -0,0 +1,117 @@ +use std::io::{IoSlice, Write}; +use std::os::fd::OwnedFd; + +use anyhow::Result; +use rustix::net::{AddressFamily, SendFlags, SocketType, netlink::SocketAddrNetlink}; +use zerocopy::{Immutable, IntoBytes}; + +use crate::util::namespace::NetNamespace; + +// This needs to be compatible with +// https://github.com/systemd/systemd/blob/main/src/libsystemd/sd-device/device-monitor.c. +#[repr(C)] +#[derive(Immutable, IntoBytes)] +struct MonitorNetlinkHeader { + /// "libudev" prefix to distinguish libudev and kernel messages. + prefix: [u8; 8], + /// Magic to protect against daemon <-> Library message format mismatch + /// Used in the kernel from socket filter rules; needs to be stored in network order. + magic: u32, + /// Total length of header structure known to the sender. 
+ header_size: u32, + /// Properties string buffer: its offset (`properties_off`) and length (`properties_len`) + properties_off: u32, + properties_len: u32, + /// Hashes of primary device properties strings, to let libudev subscribers + /// use in-kernel socket filters; values need to be stored in network order. + filter_subsystem_hash: u32, + filter_devtype_hash: u32, + filter_tag_bloom_hi: u32, + filter_tag_bloom_lo: u32, +} + +/// Udev netlink event sender. +/// +/// When a device is added/removed, after processing rules, `systemd-udevd` will send a netlink +/// message to the `kobject_uevent` netlink socket. This is picked up by libudev monitor users. +/// +/// This netlink socket is namespaced, so udevd-sent messages are not observed by the container. +/// This sender takes the place of udevd and ensures that libudev users inside the container may +/// see the device add/removal event after it has been processed by container-hotplug. +pub struct UdevSender { + socket: OwnedFd, + seq_num: u64, + ns: NetNamespace, +} + +impl UdevSender { + pub fn new(ns: NetNamespace) -> Result { + let socket = ns.with(|| { + rustix::net::socket( + AddressFamily::NETLINK, + SocketType::DGRAM, + Some(rustix::net::netlink::KOBJECT_UEVENT), + ) + })??; + + Ok(Self { + socket, + seq_num: 0, + ns, + }) + } + + pub fn send(&mut self, device: &udev::Device, event: &str) -> Result<()> { + self.seq_num += 1; + + let mut properties = Vec::new(); + write!(properties, "ACTION={event}\0SEQNUM={}\0", self.seq_num)?; + for property in device.properties() { + // These properties are specially handled: they were already written above. 
+ if property.name() == "ACTION" || property.name() == "SEQNUM" { + continue; + } + properties.extend_from_slice(property.name().as_encoded_bytes()); + properties.push(b'='); + properties.extend_from_slice(property.value().as_encoded_bytes()); + properties.push(0); + } + let header = MonitorNetlinkHeader { + prefix: *b"libudev\0", + magic: 0xFEEDCAFEu32.to_be(), + header_size: std::mem::size_of::() as u32, + properties_off: std::mem::size_of::() as u32, + properties_len: properties.len() as u32, + filter_subsystem_hash: device + .subsystem() + .map(|x| murmur2::murmur2ne(x.as_encoded_bytes(), 0).to_be()) + .unwrap_or_default(), + filter_devtype_hash: device + .devtype() + .map(|x| murmur2::murmur2ne(x.as_encoded_bytes(), 0).to_be()) + .unwrap_or_default(), + // Don't bother computing the tag bloom filter in the same way as systemd, + // just be conservative and always make it match -- this is an optimisation anyway. + filter_tag_bloom_hi: 0xFFFFFFFF, + filter_tag_bloom_lo: 0xFFFFFFFF, + }; + + // We re-enter the namespace to obtain root UID/GID so it'll be trusted by libudev. + // Otherwise, when userns is used, we're the global root which is mapped to nobody in the + // container. libudev will use SCM credentials to check for the sender and identify if the + // message is to be trusted. + // + // Technically just changing UID/GID is sufficient and network namespace re-entering isn't + // necessary -- but there's no harm in doing so and it makes the code simpler. 
+ self.ns.with(|| { + rustix::net::sendmsg_addr( + &self.socket, + &SocketAddrNetlink::new(0, 2), + &[IoSlice::new(header.as_bytes()), IoSlice::new(&properties)], + &mut Default::default(), + SendFlags::empty(), + ) + })??; + Ok(()) + } +} diff --git a/src/hotplug/mod.rs b/src/hotplug/mod.rs index 048f94e..236e4f5 100644 --- a/src/hotplug/mod.rs +++ b/src/hotplug/mod.rs @@ -1,5 +1,7 @@ mod attached_device; +mod kobject_uevent; pub use attached_device::AttachedDevice; +pub use kobject_uevent::UdevSender; use std::collections::HashMap; use std::path::PathBuf; @@ -20,6 +22,7 @@ pub struct HotPlug { symlinks: Vec, monitor: DeviceMonitor, devices: HashMap, + udev_sender: UdevSender, } impl HotPlug { @@ -31,11 +34,16 @@ impl HotPlug { let monitor = DeviceMonitor::new(hub_path.clone())?; let devices = Default::default(); + let udev_sender = UdevSender::new(crate::util::namespace::NetNamespace::of_pid( + container.pid(), + )?)?; + Ok(Self { container, symlinks, monitor, devices, + udev_sender, }) } @@ -80,6 +88,8 @@ impl HotPlug { self.container.symlink(&devnode.path, symlink).await?; } + self.udev_sender.send(device.udev(), "add")?; + let syspath = device.syspath().to_owned(); let device = AttachedDevice { device, symlinks }; self.devices.insert(syspath, device.clone()); @@ -100,6 +110,8 @@ impl HotPlug { self.container.rm(symlink).await?; } + self.udev_sender.send(device.udev(), "remove")?; + Ok(Some(Event::Detach(device))) } } diff --git a/src/runc/container.rs b/src/runc/container.rs index af6f764..eadc23c 100644 --- a/src/runc/container.rs +++ b/src/runc/container.rs @@ -125,6 +125,10 @@ impl Container { Ok(container) } + pub fn pid(&self) -> Pid { + self.pid + } + /// Remount /dev inside the init namespace. 
/// /// When user namespace is used, the /dev created by runc will be mounted inside the user namespace, diff --git a/src/util/namespace.rs b/src/util/namespace.rs index ba45c78..42cf7a3 100644 --- a/src/util/namespace.rs +++ b/src/util/namespace.rs @@ -153,3 +153,48 @@ impl MntNamespace { }) } } + +pub struct NetNamespace { + net_fd: File, + user_ns: UserNamespace, +} + +impl NetNamespace { + /// Open the network namespace (and the user namespace) of a process. + pub fn of_pid(pid: Pid) -> Result { + let net_fd = File::open(format!("/proc/{}/ns/net", pid.as_raw_nonzero()))?; + let user_ns = UserNamespace::of_pid(pid)?; + Ok(NetNamespace { net_fd, user_ns }) + } + + /// Enter the network namespace, then the associated user namespace. + /// + /// This operation is not reversible. + pub fn enter(&self) -> Result<()> { + // Switch this particular thread to the container's network namespace. + rustix::thread::move_into_link_name_space( + self.net_fd.as_fd(), + Some(LinkNameSpaceType::Network), + )?; + + // Similar to the mount namespace, we also want to behave as container root. + // This is so that SCM credentials are seen properly. + self.user_ns.enter()?; + Ok(()) + } + + /// Execute `f` inside the network namespace. + pub fn with T + Send>(&self, f: F) -> Result { + // To avoid messing with the rest of the process, we do everything in a new thread. + // Use scoped thread to avoid 'static bound (we need to access fd). + std::thread::scope(|scope| { + scope + .spawn(|| -> Result { + self.enter()?; + Ok(f()) + }) + .join() + .map_err(|_| anyhow::anyhow!("work thread panicked"))? + }) + } +}