From fda12ab1014e0ee23323cff6ec27ce7809c1e531 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Mon, 3 Apr 2023 15:52:35 +0200 Subject: [PATCH] Support idmap mounts on volumes This commit adds support for idmap mounts as specified in the runtime-spec. We open the idmap source paths and call mount_setattr() in runc PARENT, as we need privileges in the init userns for that, and then send the fds to the child process. For this fd passing we use the same mechanism used in other parts of the code, the _LIBCONTAINER_ env vars. The mount is finished (unix.MoveMount) from Go code, inside the userns, so we reuse all the prepareBindMount() security checks and the remount logic for some flags too. This commit only supports idmap mounts when userns are used AND the mappings are the same as those specified for the userns mapping. This limitation is to simplify the initial implementation, as all our users so far only need this, and we can avoid sending over netlink the mappings, creating a userns with this custom mapping, etc. Future PRs will remove this limitation. Co-authored-by: Francis Laniel Signed-off-by: Rodrigo Campos --- libcontainer/container_linux.go | 68 ++++++++++++++++ libcontainer/init_linux.go | 21 +++-- libcontainer/message_linux.go | 1 + libcontainer/nsenter/nsexec.c | 117 ++++++++++++++++++++++++++++ libcontainer/rootfs_linux.go | 49 +++++++++++- libcontainer/standard_init_linux.go | 7 +- 6 files changed, 249 insertions(+), 14 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 8b5709cf6a0..21c9905e69e 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -541,6 +541,38 @@ func (c *Container) shouldSendMountSources() bool { return false } +// shouldSendIdmapSources says whether the child process must setup idmap mounts with +// the mount_setattr already done in the host user namespace. 
+func (c *Container) shouldSendIdmapSources() bool { + // nsexec.c mount_setattr() requires CAP_SYS_ADMIN in: + // * the user namespace the filesystem was mounted in; + // * the user namespace we're trying to idmap the mount to; + // * the owning user namespace of the mount namespace you're currently located in. + // + // See the comment from Christian Brauner: + // https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972 + // + // Let's just rule out rootless, we don't have those permission in the + // rootless case. + if c.config.RootlessEUID { + return false + } + + // For the time being we require userns to be in use. + if !c.config.Namespaces.Contains(configs.NEWUSER) { + return false + } + + // We need to send sources if there are idmap bind-mounts. + for _, m := range c.config.Mounts { + if m.IsBind() && m.IsIDMapped() { + return true + } + } + + return false +} + func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error { if !c.shouldSendMountSources() { return nil @@ -551,6 +583,16 @@ func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) er }) } +func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error { + if !c.shouldSendIdmapSources() { + return nil + } + + return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool { + return m.IsBind() && m.IsIDMapped() + }) +} + func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error { // Elements on these slices will be paired with mounts (see StartInitialization() and // prepareRootfs()). These slices MUST have the same size as c.config.Mounts. 
@@ -592,6 +634,9 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l if err := c.sendMountSources(cmd, messageSockPair); err != nil { return nil, err } + if err := c.sendIdmapSources(cmd, messageSockPair); err != nil { + return nil, err + } init := &initProcess{ cmd: cmd, @@ -2256,6 +2301,29 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa }) } + // Idmap mount sources to open. + if it == initStandard && c.shouldSendIdmapSources() { + var mounts []byte + for _, m := range c.config.Mounts { + if m.IsBind() && m.IsIDMapped() { + // While other parts of the code check this too (like + // libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer + // users don't use those functions. + if strings.IndexByte(m.Source, 0) >= 0 { + return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source) + } + + mounts = append(mounts, []byte(m.Source)...) + } + mounts = append(mounts, byte(0)) + } + + r.AddData(&Bytemsg{ + Type: IdmapSourcesAttr, + Value: mounts, + }) + } + return bytes.NewReader(r.Serialize()), nil } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 4f6ed61c076..42cae1ccb65 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -48,12 +48,15 @@ type network struct { } type mountFds struct { - // Fds to use as source when mounting - // Size should be the same as container mounts, as it will be paired. + // sourceFds are the fds to use as source when mounting. + // The slice size should be the same as container mounts, as it will be + // paired with them. // The value -1 is used when no fd is needed for the mount. // Can't have a valid fd in the same position that other slices in this struct. // We need to use only one of these fds on any single mount. sourceFds []int + // Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount(). 
+ idmapFds []int } // initConfig is used for transferring parameters from Exec() to Init() @@ -142,6 +145,12 @@ func StartInitialization() (retErr error) { return err } + // Get idmap fds. + idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS") + if err != nil { + return err + } + // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() @@ -157,7 +166,7 @@ func StartInitialization() (retErr error) { }() // If init succeeds, it will not return, hence none of the defers will be called. - return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds}) + return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) } func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error { @@ -170,9 +179,9 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo } switch t { case initSetns: - // mountFds must be nil in this case. We don't mount while doing runc exec. - if mountFds.sourceFds != nil { - return errors.New("mount source fds must be nil; can't mount from exec") + // mount and idmap fds must be nil in this case. We don't mount while doing runc exec. 
+ if mountFds.sourceFds != nil || mountFds.idmapFds != nil { + return errors.New("mount and idmap fds must be nil; can't mount from exec") } i := &linuxSetnsInit{ diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 6d1107e875d..17db81a29f3 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -22,6 +22,7 @@ const ( UidmapPathAttr uint16 = 27288 GidmapPathAttr uint16 = 27289 MountSourcesAttr uint16 = 27290 + IdmapSourcesAttr uint16 = 27291 ) type Int32msg struct { diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 92583a995db..6297276f8b2 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -33,6 +33,9 @@ /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" +/* Get definitions for idmap sources */ +#include "idmap.h" + /* Synchronisation values. */ enum sync_t { SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ @@ -43,6 +46,8 @@ enum sync_t { SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */ SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */ + SYNC_MOUNT_IDMAP_PLS = 0x48, /* Tell parent to mount idmap sources. */ + SYNC_MOUNT_IDMAP_ACK = 0x49, /* All idmap mounts have been done. */ }; #define STAGE_SETUP -1 @@ -95,6 +100,10 @@ struct nlconfig_t { /* Mount sources opened outside the container userns. */ char *mountsources; size_t mountsources_len; + + /* Idmap sources opened outside the container userns which will be id mapped. 
*/ + char *idmapsources; + size_t idmapsources_len; }; /* @@ -112,6 +121,7 @@ struct nlconfig_t { #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 #define MOUNT_SOURCES_ATTR 27290 +#define IDMAP_SOURCES_ATTR 27291 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -431,6 +441,10 @@ static void nl_parse(int fd, struct nlconfig_t *config) config->mountsources = current; config->mountsources_len = payload_len; break; + case IDMAP_SOURCES_ATTR: + config->idmapsources = current; + config->idmapsources_len = payload_len; + break; default: bail("unknown netlink message type %d", nlattr->nla_type); } @@ -650,6 +664,83 @@ void try_unshare(int flags, const char *msg) bail("failed to unshare %s", msg); } +void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len) +{ + char proc_user_path[PATH_MAX]; + + /* Open the userns fd only once. + * Currently we only support idmap mounts that use the same mapping than + * the userns. This is validated in libcontainer/configs/validate/validator.go, + * so if we reached here, we know the mapping for the idmap is the same + * as the userns. This is why we just open the userns_fd once from the + * PID of the child process that has the userns already applied. 
+ */ + int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) { + sane_kill(pid, SIGKILL); + bail("failed to create userns path string"); + } + + int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY); + if (userns_fd < 0) { + sane_kill(pid, SIGKILL); + bail("failed to get user namespace fd"); + } + + char *idmap_end = idmap_src + idmap_src_len; + while (idmap_src < idmap_end) { + if (idmap_src[0] == '\0') { + idmap_src++; + continue; + } + + int fd_tree = sys_open_tree(-EBADF, idmap_src, + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | + AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT); + if (fd_tree < 0) { + sane_kill(pid, SIGKILL); + if (errno == EINVAL) + bail("failed to use open_tree(2) with path: %s, the kernel doesn't supports ID-mapped mounts", idmap_src); + else + bail("failed to use open_tree(2) with path: %s", idmap_src); + } + + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + .userns_fd = userns_fd, + }; + + ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr)); + if (ret < 0) { + sane_kill(pid, SIGKILL); + if (errno == EINVAL) + bail("failed to change mount attributes, maybe the filesystem doesn't supports ID-mapped mounts"); + else + bail("failed to change mount attributes"); + } + + write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path); + send_fd(sockfd, fd_tree); + + if (close(fd_tree) < 0) { + sane_kill(pid, SIGKILL); + bail("error closing fd_tree"); + } + + idmap_src += strlen(idmap_src) + 1; + } + + if (close(userns_fd) < 0) { + sane_kill(pid, SIGKILL); + bail("error closing userns fd"); + } +} + +void receive_idmapsources(int sockfd) +{ + receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS"); +} + void nsexec(void) { int pipenum; @@ -891,6 +982,17 @@ void nsexec(void) sane_kill(stage1_pid, SIGKILL); bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)"); } + break; + case 
SYNC_MOUNT_IDMAP_PLS: + write_log(DEBUG, "stage-1 requested to open idmap sources"); + send_idmapsources(syncfd, stage1_pid, config.idmapsources, + config.idmapsources_len); + s = SYNC_MOUNT_IDMAP_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + sane_kill(stage1_pid, SIGKILL); + bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)"); + } + break; case SYNC_CHILD_FINISH: write_log(DEBUG, "stage-1 complete"); @@ -1062,6 +1164,21 @@ void nsexec(void) bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s); } + if (config.idmapsources) { + write_log(DEBUG, "request stage-0 to send idmap sources"); + s = SYNC_MOUNT_IDMAP_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)"); + + /* Receive and install all idmap fds. */ + receive_idmapsources(syncfd); + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)"); + if (s != SYNC_MOUNT_IDMAP_ACK) + bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s); + } + /* * TODO: What about non-namespace clone flags that we're dropping here? * diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 9622798f0fd..edd3abd3c82 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -40,7 +40,8 @@ type mountConfig struct { // mountEntry contains mount data specific to a mount point. type mountEntry struct { *configs.Mount - srcFD string + srcFD string + idmapFD int } func (m *mountEntry) src() string { @@ -73,6 +74,10 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) ( return fmt.Errorf("malformed mountFds slice. 
Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds)) } + if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) { + return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds)) + } + mountConfig := &mountConfig{ root: config.Rootfs, label: config.MountLabel, @@ -81,13 +86,22 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) ( cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), } for i, m := range config.Mounts { - entry := mountEntry{Mount: m} + entry := mountEntry{Mount: m, idmapFD: -1} // Just before the loop we checked that if not empty, len(mountFds) == len(config.Mounts). // Therefore, we can access mountFds[i] without any concerns. if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 { entry.srcFD = "/proc/self/fd/" + strconv.Itoa(mountFds.sourceFds[i]) } + // We validated before we can access idmapFds[i]. + if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 { + entry.idmapFD = mountFds.idmapFds[i] + } + + if entry.idmapFD != -1 && entry.srcFD != "" { + return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i) + } + if err := mountToRootfs(mountConfig, entry); err != nil { return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) } @@ -466,8 +480,35 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { if err := prepareBindMount(m, rootfs); err != nil { return err } - if err := mountPropagate(m, rootfs, mountLabel); err != nil { - return err + + if m.IsBind() && m.IsIDMapped() { + if m.idmapFD == -1 { + return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m) + } + if err := unix.MoveMount(m.idmapFD, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil { + return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err) + } + + // In nsexec.c, we did not set the propagation field of mount_attr 
struct. + // So, let's deal with these flags right now! + if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error { + for _, pflag := range m.PropagationFlags { + // When using mount for setting propagations flags, the source, file + // system type and data arguments are ignored: + // https://man7.org/linux/man-pages/man2/mount.2.html + // We also ignore procfd because we want to act on dest. + if err := mountViaFDs("", "", dest, dstFD, "", uintptr(pflag), ""); err != nil { + return err + } + } + return nil + }); err != nil { + return fmt.Errorf("change mount propagation through procfd: %w", err) + } + } else { + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err + } } // bind mount won't change mount options, we need remount to make mount options effective. // first check that we have non-default options required before attempting a remount diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index b22b37ecdd6..f3d04282362 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -86,15 +86,14 @@ func (l *linuxStandardInit) Init() error { // initialises the labeling system selinux.GetEnabled() - // We don't need the mountFds.SourceFds after prepareRootfs() nor if it fails. + // We don't need the mount nor idmap fds after prepareRootfs() nor if it fails. err := prepareRootfs(l.pipe, l.config, l.mountFds) - for _, m := range l.mountFds.sourceFds { + for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) { if m == -1 { continue } - if err := unix.Close(m); err != nil { - return fmt.Errorf("unable to close mount sourceFds: %w", err) + return fmt.Errorf("unable to close mountFds fds: %w", err) } }