From bf310548a0bbd58aa0ea4faba871e05605cf5292 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 May 2021 11:41:38 +0200 Subject: [PATCH] conf: support idmapped lxc.mount.entry entries Signed-off-by: Christian Brauner --- src/lxc/af_unix.c | 25 +- src/lxc/af_unix.h | 19 +- src/lxc/cgroups/cgfsng.c | 5 +- src/lxc/conf.c | 523 +++++++++++++++++++++++++++++++++----- src/lxc/conf.h | 13 +- src/lxc/confile.c | 2 +- src/lxc/start.c | 104 +++++--- src/lxc/storage/storage.c | 1 - src/lxc/sync.c | 8 +- src/lxc/sync.h | 10 +- 10 files changed, 582 insertions(+), 128 deletions(-) diff --git a/src/lxc/af_unix.c b/src/lxc/af_unix.c index b491b95078..14d3160cdd 100644 --- a/src/lxc/af_unix.c +++ b/src/lxc/af_unix.c @@ -164,6 +164,16 @@ int lxc_unix_send_fds(int fd, int *sendfds, int num_sendfds, void *data, return lxc_abstract_unix_send_fds(fd, sendfds, num_sendfds, data, size); } +int __lxc_abstract_unix_send_two_fds(int fd, int fd_first, int fd_second, + void *data, size_t size) +{ + int fd_send[2] = { + fd_first, + fd_second, + }; + return lxc_abstract_unix_send_fds(fd, fd_send, 2, data, size); +} + static ssize_t lxc_abstract_unix_recv_fds_iov(int fd, struct unix_fds *ret_fds, struct iovec *ret_iov, @@ -355,13 +365,14 @@ ssize_t lxc_abstract_unix_recv_one_fd(int fd, int *ret_fd, void *ret_data, return ret; } -ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *ret_fd) +ssize_t __lxc_abstract_unix_recv_two_fds(int fd, int *fd_first, int *fd_second, + void *data, size_t size) { call_cleaner(put_unix_fds) struct unix_fds *fds = NULL; char buf[1] = {}; struct iovec iov = { - .iov_base = buf, - .iov_len = sizeof(buf), + .iov_base = data ?: buf, + .iov_len = size ?: sizeof(buf), }; ssize_t ret; @@ -377,11 +388,11 @@ ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *ret_fd) return ret_errno(ENODATA); if (fds->fd_count_ret != fds->fd_count_max) { - ret_fd[0] = -EBADF; - ret_fd[1] = -EBADF; + *fd_first = -EBADF; + *fd_second = -EBADF; } else { - ret_fd[0] = 
move_fd(fds->fd[0]); - ret_fd[1] = move_fd(fds->fd[1]); + *fd_first = move_fd(fds->fd[0]); + *fd_second = move_fd(fds->fd[1]); } return 0; diff --git a/src/lxc/af_unix.h b/src/lxc/af_unix.h index 7b97937434..77c115a3b5 100644 --- a/src/lxc/af_unix.h +++ b/src/lxc/af_unix.h @@ -125,7 +125,24 @@ __hidden extern ssize_t lxc_abstract_unix_recv_one_fd(int fd, int *ret_fd, size_t size_ret_data) __access_r(3, 4); -__hidden extern ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *ret_fd); +__hidden extern int __lxc_abstract_unix_send_two_fds(int fd, int fd_first, + int fd_second, void *data, + size_t size); + +static inline int lxc_abstract_unix_send_two_fds(int fd, int fd_first, + int fd_second) +{ + return __lxc_abstract_unix_send_two_fds(fd, fd_first, fd_second, NULL, 0); +} + +__hidden extern ssize_t __lxc_abstract_unix_recv_two_fds(int fd, int *fd_first, + int *fd_second, + void *data, size_t size); + +static inline ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *fd_first, int *fd_second) +{ + return __lxc_abstract_unix_recv_two_fds(fd, fd_first, fd_second, NULL, 0); +} __hidden extern int lxc_unix_send_fds(int fd, int *sendfds, int num_sendfds, void *data, size_t size); diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index 078d47609c..80fcbb93ca 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -2211,16 +2211,13 @@ static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf, int *sk_fd, pid_t pid) { __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; - int target_fds[2]; char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; size_t pidstr_len; ssize_t ret; - ret = lxc_abstract_unix_recv_two_fds(sk, target_fds); + ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1); if (ret < 0) return log_error_errno(-1, errno, "Failed to receive target cgroup fd"); - target_fd0 = target_fds[0]; - target_fd1 = target_fds[1]; pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid); diff --git a/src/lxc/conf.c 
b/src/lxc/conf.c index 4933f817e7..7900e6507d 100644 --- a/src/lxc/conf.c +++ b/src/lxc/conf.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,7 @@ #include "start.h" #include "storage/storage.h" #include "storage/overlay.h" +#include "sync.h" #include "syscall_wrappers.h" #include "terminal.h" #include "utils.h" @@ -126,6 +128,8 @@ char *lxchook_names[NUM_LXC_HOOKS] = { struct mount_opt { char *name; int clear; + bool recursive; + __u64 flag; int legacy_flag; }; @@ -140,45 +144,48 @@ struct limit_opt { }; static struct mount_opt mount_opt[] = { - { "async", 1, MS_SYNCHRONOUS }, - { "atime", 1, MS_NOATIME }, - { "bind", 0, MS_BIND }, - { "defaults", 0, 0 }, - { "dev", 1, MS_NODEV }, - { "diratime", 1, MS_NODIRATIME }, - { "dirsync", 0, MS_DIRSYNC }, - { "exec", 1, MS_NOEXEC }, - { "lazytime", 0, MS_LAZYTIME }, - { "mand", 0, MS_MANDLOCK }, - { "noatime", 0, MS_NOATIME }, - { "nodev", 0, MS_NODEV }, - { "nodiratime", 0, MS_NODIRATIME }, - { "noexec", 0, MS_NOEXEC }, - { "nomand", 1, MS_MANDLOCK }, - { "norelatime", 1, MS_RELATIME }, - { "nostrictatime", 1, MS_STRICTATIME }, - { "nosuid", 0, MS_NOSUID }, - { "rbind", 0, MS_BIND|MS_REC }, - { "relatime", 0, MS_RELATIME }, - { "remount", 0, MS_REMOUNT }, - { "ro", 0, MS_RDONLY }, - { "rw", 1, MS_RDONLY }, - { "strictatime", 0, MS_STRICTATIME }, - { "suid", 1, MS_NOSUID }, - { "sync", 0, MS_SYNCHRONOUS }, - { NULL, 0, 0 }, + { "atime", 1, false, MOUNT_ATTR_NOATIME, MS_NOATIME }, + { "dev", 1, false, MOUNT_ATTR_NODEV, MS_NODEV }, + { "diratime", 1, false, MOUNT_ATTR_NODIRATIME, MS_NODIRATIME }, + { "exec", 1, false, MOUNT_ATTR_NOEXEC, MS_NOEXEC }, + { "noatime", 0, false, MOUNT_ATTR_NOATIME, MS_NOATIME }, + { "nodev", 0, false, MOUNT_ATTR_NODEV, MS_NODEV }, + { "nodiratime", 0, false, MOUNT_ATTR_NODIRATIME, MS_NODIRATIME }, + { "noexec", 0, false, MOUNT_ATTR_NOEXEC, MS_NOEXEC }, + { "norelatime", 1, false, MOUNT_ATTR_RELATIME, MS_RELATIME }, + { "nostrictatime", 1, false, 
MOUNT_ATTR_STRICTATIME, MS_STRICTATIME }, + { "nosuid", 0, false, MOUNT_ATTR_NOSUID, MS_NOSUID }, + { "relatime", 0, false, MOUNT_ATTR_RELATIME, MS_RELATIME }, + { "ro", 0, false, MOUNT_ATTR_RDONLY, MS_RDONLY }, + { "rw", 1, false, MOUNT_ATTR_RDONLY, MS_RDONLY }, + { "strictatime", 0, false, MOUNT_ATTR_STRICTATIME, MS_STRICTATIME }, + { "suid", 1, false, MOUNT_ATTR_NOSUID, MS_NOSUID }, + + { "bind", 0, false, 0, MS_BIND }, + { "defaults", 0, false, 0, 0 }, + { "rbind", 0, true, 0, MS_BIND | MS_REC }, + + { "sync", 0, false, ~0, MS_SYNCHRONOUS }, + { "async", 1, false, ~0, MS_SYNCHRONOUS }, + { "dirsync", 0, false, ~0, MS_DIRSYNC }, + { "lazytime", 0, false, ~0, MS_LAZYTIME }, + { "mand", 0, false, ~0, MS_MANDLOCK }, + { "nomand", 1, false, ~0, MS_MANDLOCK }, + { "remount", 0, false, ~0, MS_REMOUNT }, + + { NULL, 0, false, ~0, ~0 }, }; static struct mount_opt propagation_opt[] = { - { "private", 0, MS_PRIVATE }, - { "shared", 0, MS_SHARED }, - { "slave", 0, MS_SLAVE }, - { "unbindable", 0, MS_UNBINDABLE }, - { "rprivate", 0, MS_PRIVATE|MS_REC }, - { "rshared", 0, MS_SHARED|MS_REC }, - { "rslave", 0, MS_SLAVE|MS_REC }, - { "runbindable", 0, MS_UNBINDABLE|MS_REC }, - { NULL, 0, 0 }, + { "private", 0, false, MS_PRIVATE, MS_PRIVATE }, + { "shared", 0, false, MS_SHARED, MS_SHARED }, + { "slave", 0, false, MS_SLAVE, MS_SLAVE }, + { "unbindable", 0, false, MS_UNBINDABLE, MS_UNBINDABLE }, + { "rprivate", 0, true, MS_PRIVATE, MS_PRIVATE | MS_REC }, + { "rshared", 0, true, MS_SHARED, MS_SHARED | MS_REC }, + { "rslave", 0, true, MS_SLAVE, MS_SLAVE | MS_REC }, + { "runbindable", 0, true, MS_UNBINDABLE, MS_UNBINDABLE | MS_REC }, + { NULL, 0, 0 }, }; static struct caps_opt caps_opt[] = { @@ -525,7 +532,7 @@ void lxc_storage_put(struct lxc_conf *conf) */ int lxc_rootfs_init(struct lxc_conf *conf, bool userns) { - __do_close int dfd_path = -EBADF, fd_pin = -EBADF, fd_userns = -EBADF; + __do_close int dfd_path = -EBADF, fd_pin = -EBADF; int ret; struct stat st; struct statfs stfs; 
@@ -600,7 +607,6 @@ int lxc_rootfs_init(struct lxc_conf *conf, bool userns) out: rootfs->fd_path_pin = move_fd(fd_pin); - rootfs->mnt_opts.userns_fd = move_fd(fd_userns); return 0; } @@ -627,7 +633,7 @@ int lxc_rootfs_prepare_parent(struct lxc_handler *handler) if (!can_use_bind_mounts()) return syserror_set(-EOPNOTSUPP, "Kernel does not support the new mount api"); - if (rootfs->mnt_opts.userns_self) + if (strequal(rootfs->mnt_opts.userns_path, "container")) fd_userns = dup_cloexec(handler->nsfd[LXC_NS_USER]); else fd_userns = open_at(-EBADF, rootfs->mnt_opts.userns_path, @@ -1109,7 +1115,7 @@ void lxc_delete_tty(struct lxc_tty_info *ttys) free_disarm(ttys->tty); } -static int lxc_send_ttys_to_parent(struct lxc_handler *handler) +static int __lxc_send_ttys_to_parent(struct lxc_handler *handler) { int i; int ret = -1; @@ -1154,12 +1160,6 @@ static int lxc_create_ttys(struct lxc_handler *handler) goto on_error; } - ret = lxc_send_ttys_to_parent(handler); - if (ret < 0) { - ERROR("Failed to send ttys to parent"); - goto on_error; - } - if (!conf->is_execute) { ret = lxc_setup_ttys(conf); if (ret < 0) { @@ -1170,15 +1170,26 @@ static int lxc_create_ttys(struct lxc_handler *handler) if (conf->ttys.tty_names) { ret = setenv("container_ttys", conf->ttys.tty_names, 1); - if (ret < 0) + if (ret < 0) { SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names); + goto on_error; + } } - ret = 0; + return 0; on_error: lxc_delete_tty(&conf->ttys); + return -1; +} + +int lxc_send_ttys_to_parent(struct lxc_handler *handler) +{ + int ret = -1; + + ret = __lxc_send_ttys_to_parent(handler); + lxc_delete_tty(&handler->conf->ttys); return ret; } @@ -1652,7 +1663,6 @@ static int lxc_setup_devpts_child(struct lxc_handler *handler) char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620"; struct lxc_conf *conf = handler->conf; struct lxc_rootfs *rootfs = &conf->rootfs; - int sock = handler->data_sock[0]; if (conf->pty_max <= 0) return log_debug(0, "No 
new devpts instance will be mounted since no pts devices are requested"); @@ -1699,14 +1709,9 @@ static int lxc_setup_devpts_child(struct lxc_handler *handler) if (devpts_fd < 0) { devpts_fd = -EBADF; TRACE("Failed to create detached devpts mount"); - ret = lxc_abstract_unix_send_fds(sock, NULL, 0, &devpts_fd, sizeof(int)); - } else { - ret = lxc_abstract_unix_send_fds(sock, &devpts_fd, 1, NULL, 0); } - if (ret < 0) - return log_error_errno(-1, errno, "Failed to send devpts fd to parent"); - TRACE("Sent devpts file descriptor %d to parent", devpts_fd); + handler->conf->devpts_fd = move_fd(devpts_fd); /* Remove any pre-existing /dev/ptmx file. */ ret = unlinkat(rootfs->dfd_dev, "ptmx", 0); @@ -1745,6 +1750,24 @@ static int lxc_setup_devpts_child(struct lxc_handler *handler) return 0; } +int lxc_send_devpts_to_parent(struct lxc_handler *handler) +{ + int ret; + + if (handler->conf->pty_max <= 0) + return log_debug(0, "No devpts file descriptor will be sent since no pts devices are requested"); + + ret = lxc_abstract_unix_send_fds(handler->data_sock[0], &handler->conf->devpts_fd, 1, NULL, 0); + if (ret < 0) + SYSERROR("Failed to send devpts file descriptor %d to parent", handler->conf->devpts_fd); + else + TRACE("Sent devpts file descriptor %d to parent", handler->conf->devpts_fd); + + close_prot_errno_disarm(handler->conf->devpts_fd); + + return 0; +} + static int setup_personality(personality_t persona) { int ret; @@ -2035,6 +2058,104 @@ int parse_mntopts_legacy(const char *mntopts, unsigned long *mntflags, char **mn return 0; } +static int parse_vfs_attr(struct lxc_mount_options *opts, char *opt, size_t size) +{ + /* + * If opt is found in mount_opt, set or clear flags. + * Otherwise append it to data. + */ + for (struct mount_opt *mo = &mount_opt[0]; mo->name != NULL; mo++) { + if (!strnequal(opt, mo->name, strlen(mo->name))) + continue; + + /* This is a recursive bind-mount. 
*/
+		if (strequal(mo->name, "rbind")) {
+			opts->recursive = 1;
+			opts->bind = 1;
+			return 0;
+		}
+
+		/* This is a bind-mount. */
+		if (strequal(mo->name, "bind")) {
+			opts->bind = 1;
+			return 0;
+		}
+
+		if (mo->flag == ~0)
+			return log_info(0, "Ignoring %s mount option", mo->name);
+
+		if (mo->clear) {
+			opts->attr.attr_clr |= mo->flag;
+			TRACE("Lowering %s", mo->name);
+		} else {
+			opts->attr.attr_set |= mo->flag;
+			TRACE("Raising %s", mo->name);
+		}
+
+		return 0;
+	}
+
+	for (struct mount_opt *mo = &propagation_opt[0]; mo->name != NULL; mo++) {
+		if (!strnequal(opt, mo->name, strlen(mo->name)))
+			continue;
+
+		/* TODO: Handle recursive propagation requests. */
+		opts->attr.propagation = mo->flag;
+		return 0;
+	}
+
+	return 0;
+}
+
+static int parse_mount_attrs(struct lxc_mount_options *opts, const char *mntopts)
+{
+	__do_free char *mntopts_new = NULL, *mntopts_dup = NULL;
+	char *mntopt_cur = NULL;
+	/* Append position in mntopts_new; must persist across loop iterations. */
+	char *end = NULL;
+	int ret;
+	size_t size;
+
+	if (!opts)
+		return ret_errno(EINVAL);
+
+	if (!mntopts)
+		return 0;
+
+	mntopts_dup = strdup(mntopts);
+	if (!mntopts_dup)
+		return ret_errno(ENOMEM);
+
+	size = strlen(mntopts_dup) + 1;
+	mntopts_new = zalloc(size);
+	if (!mntopts_new)
+		return ret_errno(ENOMEM);
+
+	lxc_iterate_parts(mntopt_cur, mntopts_dup, ",") {
+		/* This is a filesystem specific option. */
+		if (strchr(mntopt_cur, '=')) {
+			if (!end) {
+				end = stpcpy(mntopts_new, mntopt_cur);
+			} else {
+				end = stpcpy(end, ",");
+				end = stpcpy(end, mntopt_cur);
+			}
+
+			continue;
+		}
+
+		/* This is a generic vfs option.
*/ + ret = parse_vfs_attr(opts, mntopt_cur, size); + if (ret < 0) + return syserror("Failed to parse mount attributes: \"%s\"", mntopt_cur); + } + + if (*mntopts_new) + opts->data = move_ptr(mntopts_new); + + return 0; +} + static void parse_propagationopt(char *opt, unsigned long *flags) { struct mount_opt *mo; @@ -2208,7 +2329,7 @@ const char *lxc_mount_options_info[LXC_MOUNT_MAX] = { }; /* Remove "optional", "create=dir", and "create=file" from mntopt */ -int parse_lxc_mntopts(struct lxc_mount_options *opts, char *mnt_opts) +int parse_lxc_mount_attrs(struct lxc_mount_options *opts, char *mnt_opts) { for (size_t i = LXC_MOUNT_CREATE_DIR; i < LXC_MOUNT_MAX; i++) { __do_close int fd_userns = -EBADF; @@ -2242,17 +2363,15 @@ int parse_lxc_mntopts(struct lxc_mount_options *opts, char *mnt_opts) if (len >= sizeof(opts->userns_path)) return syserror_set(-EIO, "Excessive idmap path length for \"idmap=\" LXC specific mount option"); - memcpy(opts->userns_path, opt_next, len); + strlcpy(opts->userns_path, opt_next, len); if (is_empty_string(opts->userns_path)) return syserror_set(-EINVAL, "Missing idmap path for \"idmap=\" LXC specific mount option"); - if (strequal(opts->userns_path, "container")) { - opts->userns_self = 1; - } else { + if (!strequal(opts->userns_path, "container")) { fd_userns = open(opts->userns_path, O_RDONLY | O_NOCTTY | O_CLOEXEC); if (fd_userns < 0) - return syserror("Failed to open user namespace"); + return syserror("Failed to open user namespace %s", opts->userns_path); } TRACE("Parse LXC specific mount option %d->\"idmap=%s\"", fd_userns, opts->userns_path); @@ -2347,12 +2466,19 @@ static inline int mount_entry_on_generic(struct mntent *mntent, return -1; } - ret = parse_lxc_mntopts(&opts, mntent->mnt_opts); + ret = parse_lxc_mount_attrs(&opts, mntent->mnt_opts); if (ret < 0) return ret; + /* + * Idmapped mount entries will be setup by the parent for us. 
Note that + * we rely on mount_entry_create_dir_file() above to have already + * created the target path for us. So the parent can just open the + * target and send us the target fd. + */ + errno = EOPNOTSUPP; if (!is_empty_string(opts.userns_path)) - return syserror_set(-EINVAL, "Idmapped mount entries not yet supported"); + return systrace_ret(0, "Skipping idmapped mount entry"); ret = parse_propagationopts(mntent->mnt_opts, &pflags); if (ret < 0) @@ -2579,6 +2705,226 @@ static int setup_mount_entries(const struct lxc_conf *conf, return mount_file_entries(rootfs, f, lxc_name, lxc_path); } +static int __lxc_idmapped_mounts_child(struct lxc_handler *handler, FILE *f) +{ + struct lxc_conf *conf = handler->conf; + struct lxc_rootfs *rootfs = &conf->rootfs; + int ret; + char buf[PATH_MAX]; + struct mntent mntent; + + while (getmntent_r(f, &mntent, buf, sizeof(buf))) { + __do_close int fd_from = -EBADF, fd_to = -EBADF, + fd_userns = -EBADF; + __do_free char *__data = NULL; + struct lxc_mount_options opts = {}; + int dfd_from; + const char *source_relative, *target_relative; + + ret = parse_lxc_mount_attrs(&opts, mntent.mnt_opts); + if (ret < 0) + return syserror("Failed to parse LXC specific mount options"); + __data = opts.data; + + ret = parse_mount_attrs(&opts, mntent.mnt_opts); + if (ret < 0) + return syserror("Failed to parse mount options"); + + /* No idmapped mount entry so skip it. */ + if (is_empty_string(opts.userns_path)) + continue; + + if (!can_use_bind_mounts()) + return syserror_set(-EINVAL, "Kernel does not support idmapped mounts"); + + if (!opts.bind) + return syserror_set(-EINVAL, "Only bind mounts can currently be idmapped"); + + /* We don't support new filesystem mounts yet. */ + if (!is_empty_string(mntent.mnt_type) && + !strequal(mntent.mnt_type, "none")) + return syserror_set(-EINVAL, "Only bind mounts can currently be idmapped"); + + /* Someone specified additional mount options for a bind-mount. 
*/
+		if (!is_empty_string(opts.data))
+			return syserror_set(-EINVAL, "Bind mounts don't support non-generic mount options");
+
+		/*
+		 * The source path is supposed to be taken relative to the
+		 * container's rootfs mount or - if the container does not have
+		 * a separate rootfs - to the host's /.
+		 */
+		source_relative = deabs(mntent.mnt_fsname);
+		if (opts.relative || !rootfs->path)
+			dfd_from = rootfs->dfd_mnt;
+		else
+			dfd_from = rootfs->dfd_host;
+		fd_from = open_tree(dfd_from, source_relative,
+				    OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
+				    (opts.recursive ? AT_RECURSIVE : 0));
+		if (fd_from < 0)
+			return syserror("Failed to create detached %smount of %d/%s",
+					opts.recursive ? "recursive " : "",
+					dfd_from, source_relative);
+
+		if (strequal(opts.userns_path, "container"))
+			fd_userns = openat(dfd_from, "proc/self/ns/user", O_RDONLY | O_CLOEXEC);
+		else
+			fd_userns = open_at(-EBADF, opts.userns_path,
+					    PROTECT_OPEN_WITH_TRAILING_SYMLINKS, 0, 0);
+		if (fd_userns < 0) {
+			if (opts.optional) {
+				TRACE("Skipping optional idmapped mount");
+				continue;
+			}
+
+			return syserror("Failed to open user namespace \"%s\" for detached %smount of %d/%s",
+					opts.userns_path, opts.recursive ? "recursive " : "",
+					dfd_from, source_relative);
+		}
+
+		ret = __lxc_abstract_unix_send_two_fds(handler->data_sock[0],
+						       fd_from, fd_userns,
+						       &opts, sizeof(opts));
+		if (ret <= 0) {
+			if (opts.optional) {
+				TRACE("Skipping optional idmapped mount");
+				continue;
+			}
+
+			return syserror("Failed to send file descriptor %d for detached %smount of %d/%s and file descriptor %d of user namespace \"%s\" to parent",
+					fd_from, opts.recursive ? "recursive " : "",
+					dfd_from, source_relative, fd_userns,
+					opts.userns_path);
+		}
+
+		ret = lxc_abstract_unix_rcv_credential(handler->data_sock[0], NULL, 0);
+		if (ret <= 0) {
+			if (opts.optional) {
+				TRACE("Skipping optional idmapped mount");
+				continue;
+			}
+
+			return syserror("Failed to receive notification that parent idmapped detached %smount %d/%s to user namespace %d",
+					opts.recursive ? "recursive " : "",
+					dfd_from, source_relative, fd_userns);
+		}
+
+		/* Set remaining mount options. */
+		ret = mount_setattr(fd_from, "", AT_EMPTY_PATH |
+				    (opts.recursive ? AT_RECURSIVE : 0),
+				    &opts.attr, sizeof(opts.attr));
+		if (ret < 0) {
+			if (opts.optional) {
+				TRACE("Skipping optional idmapped mount");
+				continue;
+			}
+
+			return syserror("Failed to set remaining mount attributes on detached %smount %d/%s idmapped to user namespace %d",
+					opts.recursive ? "recursive " : "",
+					dfd_from, source_relative, fd_userns);
+		}
+
+		/*
+		 * In contrast to the legacy mount codepath we will simplify
+		 * our lives and just always treat the target mountpoint to be
+		 * relative to the container's rootfs mountpoint or - if the
+		 * container does not have a separate rootfs - to the host's /.
+		 */
+
+		target_relative = deabs(mntent.mnt_dir);
+		if (rootfs->path)
+			dfd_from = rootfs->dfd_mnt;
+		else
+			dfd_from = rootfs->dfd_host;
+		fd_to = open_at(dfd_from, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_WITH_SYMLINKS, 0);
+		if (fd_to < 0) {
+			if (opts.optional) {
+				TRACE("Skipping optional idmapped mount");
+				continue;
+			}
+
+			return syserror("Failed to open target mountpoint %d/%s for detached idmapped %smount %d:%d/%s",
+					dfd_from, target_relative,
+					opts.recursive ? "recursive " : "",
+					fd_userns, dfd_from, source_relative);
+		}
+
+		ret = move_detached_mount(fd_from, fd_to, "", 0, 0);
+		if (ret) {
+			if (opts.optional) {
+				TRACE("Skipping optional idmapped mount");
+				continue;
+			}
+
+			return syserror("Failed to attach detached idmapped %smount %d:%d/%s to target mountpoint %d/%s",
+					opts.recursive ? "recursive " : "",
+					fd_userns, dfd_from, source_relative, dfd_from, target_relative);
+		}
+
+		TRACE("Attached detached idmapped %smount %d:%d/%s to target mountpoint %d/%s",
+		      opts.recursive ? "recursive " : "", fd_userns, dfd_from,
+		      source_relative, dfd_from, target_relative);
+	}
+
+	if (!feof(f) || ferror(f))
+		return syserror_set(-EINVAL, "Failed to parse mount entries");
+
+	return 0;
+}
+
+static int lxc_idmapped_mounts_child(struct lxc_handler *handler)
+{
+	__do_fclose FILE *f_entries = NULL;
+	int fret = -1;
+	struct lxc_conf *conf = handler->conf;
+	const char *fstab = conf->fstab;
+	struct lxc_list *mount = &conf->mount_list;
+	int ret;
+
+	f_entries = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
+	if (!f_entries) {
+		SYSERROR("Failed to create anonymous mount file");
+		goto out;
+	}
+
+	ret = __lxc_idmapped_mounts_child(handler, f_entries);
+	if (ret) {
+		SYSERROR("Failed to setup idmapped mount entries");
+		goto out;
+	}
+
+	TRACE("Finished setting up idmapped mounts");
+
+	if (fstab) {
+		__do_endmntent FILE *f_fstab = NULL;
+
+		f_fstab = setmntent(fstab, "re");
+		if (!f_fstab) {
+			SYSERROR("Failed to open fstab format file \"%s\"", fstab);
+			goto out;
+		}
+
+		ret = __lxc_idmapped_mounts_child(handler, f_fstab);
+		if (ret) {
+			SYSERROR("Failed to setup idmapped mount entries specified in fstab");
+			goto out;
+		}
+
+		TRACE("Finished setting up idmapped mounts specified in fstab");
+	}
+
+	fret = 0;
+
+out:
+	ret = lxc_abstract_unix_send_credential(handler->data_sock[0], NULL, 0);
+	if (ret < 0)
+		return syserror("Failed to inform parent that we are done setting up mounts");
+	TRACE("Informed parent that we are done setting up idmapped mounts");
+
+	return
fret; +} + static int parse_cap(const char *cap) { size_t i; @@ -2845,7 +3191,6 @@ struct lxc_conf *lxc_conf_init(void) new->rootfs.dfd_host = -EBADF; new->rootfs.fd_path_pin = -EBADF; new->rootfs.dfd_idmapped = -EBADF; - new->rootfs.mnt_opts.userns_fd = -EBADF; new->logfd = -1; lxc_list_init(&new->cgroup); lxc_list_init(&new->cgroup2); @@ -3617,6 +3962,45 @@ static int lxc_rootfs_prepare_child(struct lxc_handler *handler) return 0; } +int lxc_idmapped_mounts_parent(struct lxc_handler *handler) +{ + for (;;) { + __do_close int fd_from = -EBADF, fd_userns = -EBADF; + struct lxc_mount_attr attr = {}; + struct lxc_mount_options opts = {}; + ssize_t ret; + + ret = __lxc_abstract_unix_recv_two_fds(handler->data_sock[1], + &fd_from, &fd_userns, + &opts, sizeof(opts)); + if (ret < 0) + return syserror("Failed to receive idmapped mount file descriptors from child"); + + if (fd_from < 0 || fd_userns < 0) + return log_trace(0, "Finished receiving idmapped mount file descriptors from child"); + + attr.attr_set = MOUNT_ATTR_IDMAP; + attr.userns_fd = fd_userns; + ret = mount_setattr(fd_from, "", + AT_EMPTY_PATH | + (opts.recursive ? AT_RECURSIVE : 0), + &attr, sizeof(attr)); + if (ret) + return syserror("Failed to idmap detached %smount %d to %d", + opts.recursive ? "recursive " : "", + fd_from, fd_userns); + + ret = lxc_abstract_unix_send_credential(handler->data_sock[1], NULL, 0); + if (ret < 0) + return syserror("Parent failed to notify child that detached %smount %d was idmapped to user namespace %d", + opts.recursive ? "recursive " : "", + fd_from, fd_userns); + + TRACE("Parent idmapped detached %smount %d to user namespace %d", + opts.recursive ? 
"recursive " : "", fd_from, fd_userns); + } +} + int lxc_setup(struct lxc_handler *handler) { int ret; @@ -3648,10 +4032,6 @@ int lxc_setup(struct lxc_handler *handler) &lxc_conf->network); if (ret < 0) return log_error(-1, "Failed to setup network"); - - ret = lxc_network_send_name_and_ifindex_to_parent(handler); - if (ret < 0) - return log_error(-1, "Failed to send network device names and ifindices to parent"); } if (lxc_conf->autodev > 0) { @@ -3678,6 +4058,13 @@ int lxc_setup(struct lxc_handler *handler) return log_error(-1, "Failed to setup mount entries"); } + if (!lxc_sync_wake_parent(handler, START_SYNC_IDMAPPED_MOUNTS)) + return -1; + + ret = lxc_idmapped_mounts_child(handler); + if (ret) + return syserror("Failed to attached detached idmapped mounts"); + lxc_conf->rootfs.dfd_dev = open_at(lxc_conf->rootfs.dfd_mnt, "dev", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); if (lxc_conf->rootfs.dfd_dev < 0 && errno != ENOENT) diff --git a/src/lxc/conf.h b/src/lxc/conf.h index da742bdd4b..a185b2023c 100644 --- a/src/lxc/conf.h +++ b/src/lxc/conf.h @@ -201,12 +201,13 @@ struct lxc_mount_options { int create_file : 1; int optional : 1; int relative : 1; - int userns_self : 1; + int recursive : 1; + int bind : 1; char userns_path[PATH_MAX]; - int userns_fd; unsigned long mnt_flags; unsigned long prop_flags; char *data; + struct lxc_mount_attr attr; }; /* Defines a structure to store the rootfs location, the @@ -512,9 +513,12 @@ __hidden extern int lxc_rootfs_prepare(struct lxc_conf *conf, bool userns); __hidden extern void lxc_storage_put(struct lxc_conf *conf); __hidden extern int lxc_rootfs_init(struct lxc_conf *conf, bool userns); __hidden extern int lxc_rootfs_prepare_parent(struct lxc_handler *handler); +__hidden extern int lxc_idmapped_mounts_parent(struct lxc_handler *handler); __hidden extern int lxc_map_ids(struct lxc_list *idmap, pid_t pid); __hidden extern int lxc_create_tty(const char *name, struct lxc_conf *conf); __hidden extern void 
lxc_delete_tty(struct lxc_tty_info *ttys); +__hidden extern int lxc_send_ttys_to_parent(struct lxc_handler *handler); +__hidden extern int lxc_send_devpts_to_parent(struct lxc_handler *handler); __hidden extern int lxc_clear_config_caps(struct lxc_conf *c); __hidden extern int lxc_clear_config_keepcaps(struct lxc_conf *c); __hidden extern int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version); @@ -541,7 +545,7 @@ __hidden extern int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), v const char *fn_name); __hidden extern int parse_mntopts_legacy(const char *mntopts, unsigned long *mntflags, char **mntdata); __hidden extern int parse_propagationopts(const char *mntopts, unsigned long *pflags); -__hidden extern int parse_lxc_mntopts(struct lxc_mount_options *opts, char *mnt_opts); +__hidden extern int parse_lxc_mount_attrs(struct lxc_mount_options *opts, char *mnt_opts); __hidden extern void tmp_proc_unmount(struct lxc_conf *lxc_conf); __hidden extern void suggest_default_idmap(void); __hidden extern FILE *make_anonymous_mount_file(struct lxc_list *mount, bool include_nesting_helpers); @@ -593,12 +597,10 @@ static inline void put_lxc_mount_options(struct lxc_mount_options *mnt_opts) mnt_opts->create_file = 0; mnt_opts->optional = 0; mnt_opts->relative = 0; - mnt_opts->userns_self = 0; mnt_opts->userns_path[0] = '\0'; mnt_opts->mnt_flags = 0; mnt_opts->prop_flags = 0; - close_prot_errno_disarm(mnt_opts->userns_fd); free_disarm(mnt_opts->data); } @@ -608,7 +610,6 @@ static inline void put_lxc_rootfs(struct lxc_rootfs *rootfs, bool unpin) close_prot_errno_disarm(rootfs->dfd_host); close_prot_errno_disarm(rootfs->dfd_mnt); close_prot_errno_disarm(rootfs->dfd_dev); - close_prot_errno_disarm(rootfs->mnt_opts.userns_fd); if (unpin) close_prot_errno_disarm(rootfs->fd_path_pin); close_prot_errno_disarm(rootfs->dfd_idmapped); diff --git a/src/lxc/confile.c b/src/lxc/confile.c index a7dbd70dd9..6623fb60e4 100644 --- a/src/lxc/confile.c +++ 
b/src/lxc/confile.c @@ -2595,7 +2595,7 @@ static int set_config_rootfs_options(const char *key, const char *value, if (!dup) return -ENOMEM; - ret = parse_lxc_mntopts(mnt_opts, dup); + ret = parse_lxc_mount_attrs(mnt_opts, dup); if (ret < 0) return ret; diff --git a/src/lxc/start.c b/src/lxc/start.c index e3ed26d0a0..21e70dce85 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -1291,12 +1291,6 @@ static int do_start(void *data) if (ret < 0) goto out_warn_father; - ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0); - if (ret < 0) { - SYSERROR("Failed to send seccomp notify fd to parent"); - goto out_warn_father; - } - ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL); if (ret < 0) { ERROR("Failed to run lxc.hook.start for container \"%s\"", @@ -1336,6 +1330,35 @@ static int do_start(void *data) if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP_LIMITS)) goto out_warn_father; + ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0); + if (ret < 0) { + SYSERROR("Failed to send seccomp notify fd to parent"); + goto out_warn_father; + } + + ret = lxc_send_devpts_to_parent(handler); + if (ret < 0) { + SYSERROR("Failed to send seccomp devpts fd to parent"); + goto out_warn_father; + } + + ret = lxc_send_ttys_to_parent(handler); + if (ret < 0) { + SYSERROR("Failed to send tty file descriptors to parent"); + goto out_warn_father; + } + + if (handler->ns_clone_flags & CLONE_NEWNET) { + ret = lxc_network_send_name_and_ifindex_to_parent(handler); + if (ret < 0) { + SYSERROR("Failed to send network device names and ifindices to parent"); + goto out_warn_father; + } + } + + if (!lxc_sync_wait_parent(handler, START_SYNC_READY_START)) + goto out_warn_father; + /* Reset the environment variables the user requested in a clear * environment. 
*/ @@ -1458,16 +1481,16 @@ static int lxc_recv_ttys_from_child(struct lxc_handler *handler) return -1; for (i = 0; i < conf->ttys.max; i++) { - int ttyfds[2]; + int ttyx = -EBADF, ttyy = -EBADF; - ret = lxc_abstract_unix_recv_two_fds(sock, ttyfds); + ret = lxc_abstract_unix_recv_two_fds(sock, &ttyx, &ttyy); if (ret < 0) break; tty = &ttys->tty[i]; tty->busy = -1; - tty->ptx = ttyfds[0]; - tty->pty = ttyfds[1]; + tty->ptx = ttyx; + tty->pty = ttyy; TRACE("Received pty with ptx fd %d and pty fd %d from child", tty->ptx, tty->pty); } @@ -1875,6 +1898,15 @@ static int lxc_spawn(struct lxc_handler *handler) if (!lxc_sync_barrier_child(handler, START_SYNC_CGROUP_UNSHARE)) goto out_delete_net; + ret = lxc_idmapped_mounts_parent(handler); + if (ret) { + ERROR("Failed to setup mount entries"); + goto out_delete_net; + } + + if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS)) + goto out_delete_net; + /* * With isolation the limiting devices cgroup was already setup, so * only setup devices here if we have no namespace directory. @@ -1924,21 +1956,13 @@ static int lxc_spawn(struct lxc_handler *handler) goto out_delete_net; } - /* Tell the child to complete its initialization and wait for it to exec - * or return an error. (The child will never return - * START_SYNC_READY_START+1. It will either close the sync pipe, causing - * lxc_sync_barrier_child to return success, or return a different - * value, causing us to error out). 
- */ - if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START)) + if (!lxc_sync_wake_child(handler, START_SYNC_FDS)) goto out_delete_net; - if (handler->ns_clone_flags & CLONE_NEWNET) { - ret = lxc_network_recv_name_and_ifindex_from_child(handler); - if (ret < 0) { - ERROR("Failed to receive names and ifindices for network devices from child"); - goto out_delete_net; - } + ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1); + if (ret < 0) { + SYSERROR("Failed to receive seccomp notify fd from child"); + goto out_delete_net; } ret = lxc_setup_devpts_parent(handler); @@ -1947,13 +1971,6 @@ static int lxc_spawn(struct lxc_handler *handler) goto out_delete_net; } - /* Now all networks are created, network devices are moved into place, - * and the correct names and ifindices in the respective namespaces have - * been recorded. The corresponding structs have now all been filled. So - * log them for debugging purposes. - */ - lxc_log_configured_netdevs(conf); - /* Read tty fds allocated by child. */ ret = lxc_recv_ttys_from_child(handler); if (ret < 0) { @@ -1961,12 +1978,31 @@ static int lxc_spawn(struct lxc_handler *handler) goto out_delete_net; } - ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1); - if (ret < 0) { - SYSERROR("Failed to receive seccomp notify fd from child"); - goto out_delete_net; + if (handler->ns_clone_flags & CLONE_NEWNET) { + ret = lxc_network_recv_name_and_ifindex_from_child(handler); + if (ret < 0) { + ERROR("Failed to receive names and ifindices for network devices from child"); + goto out_delete_net; + } } + /* + * Tell the child to complete its initialization and wait for it to + * exec or return an error. (The child will never return + * START_SYNC_READY_START+1. It will either close the sync pipe, + * causing lxc_sync_barrier_child to return success, or return a + * different value, causing us to error out). 
+ */ + if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START)) + goto out_delete_net; + + /* Now all networks are created, network devices are moved into place, + * and the correct names and ifindices in the respective namespaces have + * been recorded. The corresponding structs have now all been filled. So + * log them for debugging purposes. + */ + lxc_log_configured_netdevs(conf); + ret = handler->ops->post_start(handler, handler->data); if (ret < 0) goto out_abort; diff --git a/src/lxc/storage/storage.c b/src/lxc/storage/storage.c index 514bdae30f..f1bea3e30d 100644 --- a/src/lxc/storage/storage.c +++ b/src/lxc/storage/storage.c @@ -321,7 +321,6 @@ struct lxc_storage *storage_copy(struct lxc_container *c, const char *cname, .dfd_host = -EBADF, .fd_path_pin = -EBADF, .dfd_idmapped = -EBADF, - .mnt_opts.userns_fd = -EBADF, }; if (!src) { diff --git a/src/lxc/sync.c b/src/lxc/sync.c index 1d018387ec..f194e67761 100644 --- a/src/lxc/sync.c +++ b/src/lxc/sync.c @@ -74,6 +74,10 @@ static inline const char *start_sync_to_string(int state) return "cgroup-unshare"; case START_SYNC_CGROUP_LIMITS: return "cgroup-limits"; + case START_SYNC_IDMAPPED_MOUNTS: + return "idmapped-mounts"; + case START_SYNC_FDS: + return "fds"; case START_SYNC_READY_START: return "ready-start"; case START_SYNC_RESTART: @@ -109,13 +113,13 @@ bool lxc_sync_wake_parent(struct lxc_handler *handler, int sequence) bool lxc_sync_wait_parent(struct lxc_handler *handler, int sequence) { - TRACE("Parent waiting for child with sequence %s", start_sync_to_string(sequence)); + TRACE("Child waiting for parent with sequence %s", start_sync_to_string(sequence)); return sync_wait(handler->sync_sock[0], sequence); } bool lxc_sync_wait_child(struct lxc_handler *handler, int sequence) { - TRACE("Child waiting for parent with sequence %s", start_sync_to_string(sequence)); + TRACE("Parent waiting for child with sequence %s", start_sync_to_string(sequence)); return sync_wait(handler->sync_sock[1], sequence); } 
diff --git a/src/lxc/sync.h b/src/lxc/sync.h index 57191c1cbb..e7b3b4d374 100644 --- a/src/lxc/sync.h +++ b/src/lxc/sync.h @@ -19,10 +19,12 @@ enum /* start */ { START_SYNC_POST_CONFIGURE = 2, START_SYNC_CGROUP = 3, START_SYNC_CGROUP_UNSHARE = 4, - START_SYNC_CGROUP_LIMITS = 5, - START_SYNC_READY_START = 6, - START_SYNC_RESTART = 7, - START_SYNC_POST_RESTART = 8, + START_SYNC_IDMAPPED_MOUNTS = 5, + START_SYNC_CGROUP_LIMITS = 6, + START_SYNC_FDS = 7, + START_SYNC_READY_START = 8, + START_SYNC_RESTART = 9, + START_SYNC_POST_RESTART = 10, }; enum /* attach */ {