Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@ tokio = { version = "1", features = ["full"] }
tokio-stream = "0.1"
async-stream = "0.3"
udev = "0.9"
rustix = { version = "1", features = ["fs", "stdio", "process", "thread", "pipe", "mount"] }
rustix = { version = "1", features = ["fs", "stdio", "process", "thread", "pipe", "mount", "net"] }
bitflags = "2"
humantime = "2"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
safe-fork = "0.1.1"
aya = "0.13"
murmur2 = "0.1.0"
zerocopy = { version = "0.8.24", features = ["derive"] }

[build-dependencies]
anyhow = { version = "1", features = ["backtrace"] }
Expand Down
117 changes: 117 additions & 0 deletions src/hotplug/kobject_uevent.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
use std::io::{IoSlice, Write};
use std::os::fd::OwnedFd;

use anyhow::Result;
use rustix::net::{AddressFamily, SendFlags, SocketType, netlink::SocketAddrNetlink};
use zerocopy::{Immutable, IntoBytes};

use crate::util::namespace::NetNamespace;

// This needs to be compatible with
// https://github.com/systemd/systemd/blob/main/src/libsystemd/sd-device/device-monitor.c.
//
// `#[repr(C)]` keeps the fields in declaration order, and the zerocopy derives allow the header
// to be viewed as raw bytes (`as_bytes()`) when the message is sent.
#[repr(C)]
#[derive(Immutable, IntoBytes)]
struct MonitorNetlinkHeader {
/// "libudev" prefix to distinguish libudev and kernel messages.
/// The sender fills this with the NUL-terminated byte string `libudev\0`.
prefix: [u8; 8],
/// Magic to protect against daemon <-> library message format mismatch.
/// Used in the kernel from socket filter rules; needs to be stored in network order.
magic: u32,
/// Total length of header structure known to the sender.
header_size: u32,
/// Byte offset of the properties string buffer. The sender sets this to the header size,
/// because the properties are sent immediately after the header.
properties_off: u32,
/// Length in bytes of the properties string buffer (NUL-terminated `KEY=value` entries).
properties_len: u32,
/// Hashes of primary device properties strings, to let libudev subscribers
/// use in-kernel socket filters; values need to be stored in network order.
filter_subsystem_hash: u32,
filter_devtype_hash: u32,
/// High and low words of the tag bloom filter. The sender sets every bit in both words so
/// that the message is never rejected by a tag filter.
filter_tag_bloom_hi: u32,
filter_tag_bloom_lo: u32,
}

/// Udev netlink event sender.
///
/// When a device is added/removed, after processing rules, `systemd-udevd` will send a netlink
/// message to the `kobject_uevent` netlink socket. This is picked up by libudev monitor users.
///
/// This netlink socket is namespaced, so udevd-sent messages are not observed by the container.
/// This sender takes the place of udevd and ensures that libudev users inside the container may
/// see the device add/remove events after they have been processed by container-hotplug.
pub struct UdevSender {
/// Netlink datagram socket (`KOBJECT_UEVENT` protocol) created inside the namespace `ns`.
socket: OwnedFd,
/// Counter incremented before every message and emitted as the `SEQNUM` property.
seq_num: u64,
/// Namespace that is entered both to create the socket and to send each message.
ns: NetNamespace,
}

impl UdevSender {
pub fn new(ns: NetNamespace) -> Result<Self> {
let socket = ns.with(|| {
rustix::net::socket(
AddressFamily::NETLINK,
SocketType::DGRAM,
Some(rustix::net::netlink::KOBJECT_UEVENT),
)
})??;

Ok(Self {
socket,
seq_num: 0,
ns,
})
}

pub fn send(&mut self, device: &udev::Device, event: &str) -> Result<()> {
self.seq_num += 1;

let mut properties = Vec::new();
write!(properties, "ACTION={event}\0SEQNUM={}\0", self.seq_num)?;
for property in device.properties() {
// These properties are specially handled.
if property.name() == "ACTION" || property.name() == "SEQNUM" {
continue;
}
properties.extend_from_slice(property.name().as_encoded_bytes());
properties.push(b'=');
properties.extend_from_slice(property.value().as_encoded_bytes());
properties.push(0);
}
let header = MonitorNetlinkHeader {
prefix: *b"libudev\0",
magic: 0xFEEDCAFEu32.to_be(),
header_size: std::mem::size_of::<MonitorNetlinkHeader>() as u32,
properties_off: std::mem::size_of::<MonitorNetlinkHeader>() as u32,
properties_len: properties.len() as u32,
filter_subsystem_hash: device
.subsystem()
.map(|x| murmur2::murmur2ne(x.as_encoded_bytes(), 0).to_be())
.unwrap_or_default(),
filter_devtype_hash: device
.devtype()
.map(|x| murmur2::murmur2ne(x.as_encoded_bytes(), 0).to_be())
.unwrap_or_default(),
// Don't bother computing the value in the same way as systemd,
// just be conservative and always make it match -- this is an optimisation anyway.
filter_tag_bloom_hi: 0xFFFFFFFF,
filter_tag_bloom_lo: 0xFFFFFFFF,
};

// We re-enter the namespace to obtain root UID/GID so it'll be trusted by libudev.
// Otherwise, when userns is used, we're the global root which is mapped to nobody in the
// container. libudev will use SCM credentials to check for the sender and identify if the
// message is to be trusted.
//
// Technically just changing UID/GID is sufficient and network namespace re-entering isn't
// necessary -- but there's no harm in doing so and it makes code simpler.
self.ns.with(|| {
rustix::net::sendmsg_addr(
&self.socket,
&SocketAddrNetlink::new(0, 2),
&[IoSlice::new(header.as_bytes()), IoSlice::new(&properties)],
&mut Default::default(),
SendFlags::empty(),
)
})??;
Ok(())
}
}
12 changes: 12 additions & 0 deletions src/hotplug/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
mod attached_device;
mod kobject_uevent;
pub use attached_device::AttachedDevice;
pub use kobject_uevent::UdevSender;

use std::collections::HashMap;
use std::path::PathBuf;
Expand All @@ -20,6 +22,7 @@ pub struct HotPlug {
symlinks: Vec<cli::Symlink>,
monitor: DeviceMonitor,
devices: HashMap<PathBuf, AttachedDevice>,
udev_sender: UdevSender,
}

impl HotPlug {
Expand All @@ -31,11 +34,16 @@ impl HotPlug {
let monitor = DeviceMonitor::new(hub_path.clone())?;
let devices = Default::default();

let udev_sender = UdevSender::new(crate::util::namespace::NetNamespace::of_pid(
container.pid(),
)?)?;

Ok(Self {
container,
symlinks,
monitor,
devices,
udev_sender,
})
}

Expand Down Expand Up @@ -80,6 +88,8 @@ impl HotPlug {
self.container.symlink(&devnode.path, symlink).await?;
}

self.udev_sender.send(device.udev(), "add")?;

let syspath = device.syspath().to_owned();
let device = AttachedDevice { device, symlinks };
self.devices.insert(syspath, device.clone());
Expand All @@ -100,6 +110,8 @@ impl HotPlug {
self.container.rm(symlink).await?;
}

self.udev_sender.send(device.udev(), "remove")?;

Ok(Some(Event::Detach(device)))
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/runc/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ impl Container {
Ok(container)
}

/// Returns the process ID stored for this container.
///
/// NOTE(review): presumably the PID of the container's init process -- confirm against the
/// code that sets `self.pid`.
pub fn pid(&self) -> Pid {
self.pid
}

/// Remount /dev inside the init namespace.
///
/// When user namespace is used, the /dev created by runc will be mounted inside the user namespace,
Expand Down
45 changes: 45 additions & 0 deletions src/util/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,48 @@ impl MntNamespace {
})
}
}

/// Handle to the network namespace of a process, paired with that process's user namespace.
pub struct NetNamespace {
/// Open file for the process's `/proc/<pid>/ns/net` entry.
net_fd: File,
/// User namespace of the same process; entered right after the network namespace.
user_ns: UserNamespace,
}

impl NetNamespace {
    /// Open the network namespace of a process.
    ///
    /// Opens `/proc/<pid>/ns/net` and also captures the user namespace of the same process.
    pub fn of_pid(pid: Pid) -> Result<NetNamespace> {
        let ns_path = format!("/proc/{}/ns/net", pid.as_raw_nonzero());
        let net_fd = File::open(ns_path)?;
        let user_ns = UserNamespace::of_pid(pid)?;
        Ok(Self { net_fd, user_ns })
    }

    /// Enter the network namespace, then the associated user namespace.
    ///
    /// This operation is not reversible.
    pub fn enter(&self) -> Result<()> {
        // Only the calling thread is moved into the container's network namespace.
        let net_ns = self.net_fd.as_fd();
        rustix::thread::move_into_link_name_space(net_ns, Some(LinkNameSpaceType::Network))?;

        // As with the mount namespace, act as the container's root afterwards so that
        // SCM credentials are seen properly.
        self.user_ns.enter()?;
        Ok(())
    }

    /// Execute `f` inside the network namespace and return its result.
    ///
    /// Returns an error if the namespace cannot be entered (in which case `f` is not run) or
    /// if the worker thread panics.
    pub fn with<T: Send, F: FnOnce() -> T + Send>(&self, f: F) -> Result<T> {
        // Entering a namespace cannot be undone, so the work runs on a dedicated thread to
        // leave the rest of the process untouched. A scoped thread lets the closure borrow
        // `self` (for the fds) without requiring a 'static bound.
        std::thread::scope(|scope| {
            let worker = scope.spawn(|| -> Result<T> {
                self.enter()?;
                Ok(f())
            });

            match worker.join() {
                Ok(outcome) => outcome,
                Err(_) => Err(anyhow::anyhow!("work thread panicked")),
            }
        })
    }
}