Skip to content

Commit

Permalink
Support idmap mounts on volumes
Browse files Browse the repository at this point in the history
This commit adds support for idmap mounts as specified in the runtime-spec.

We open the idmap source paths and call mount_setattr() in runc PARENT,
as we need privileges in the init userns for that, and then sends the
fds to the child process. For this fd passing we use the same mechanism
used in other parts of thecode, the _LIBCONTAINER_ env vars.

The mount is finished (unix.MoveMount) from go code, inside the userns,
so we reuse all the prepareBindMount() security checks and the remount
logic for some flags too.

This commit only supports idmap mounts when userns are used AND the mappings
are the same specified for the userns mapping. This limitation is to
simplify the initial implementation, as all our users so far only need
this, and we can avoid sending over netlink the mappings, creating a
userns with this custom mapping, etc. Future PRs will remove this
limitation.

Co-authored-by: Francis Laniel <flaniel@linux.microsoft.com>
Signed-off-by: Rodrigo Campos <rodrigoca@microsoft.com>
  • Loading branch information
rata and eiffel-fl committed Jul 17, 2023
1 parent fe4528b commit fda12ab
Show file tree
Hide file tree
Showing 6 changed files with 249 additions and 14 deletions.
68 changes: 68 additions & 0 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,38 @@ func (c *Container) shouldSendMountSources() bool {
return false
}

// shouldSendIdmapSources says whether the child process must setup idmap mounts with
// the mount_setattr already done in the host user namespace.
func (c *Container) shouldSendIdmapSources() bool {
// nsexec.c mount_setattr() requires CAP_SYS_ADMIN in:
// * the user namespace the filesystem was mounted in;
// * the user namespace we're trying to idmap the mount to;
// * the owning user namespace of the mount namespace you're currently located in.
//
// See the comment from Christian Brauner:
// https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972
//
// Let's just rule out rootless, we don't have those permission in the
// rootless case.
if c.config.RootlessEUID {
return false
}

// For the time being we require userns to be in use.
if !c.config.Namespaces.Contains(configs.NEWUSER) {
return false
}

// We need to send sources if there are idmap bind-mounts.
for _, m := range c.config.Mounts {
if m.IsBind() && m.IsIDMapped() {
return true
}
}

return false
}

func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error {
if !c.shouldSendMountSources() {
return nil
Expand All @@ -551,6 +583,16 @@ func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) er
})
}

func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error {
if !c.shouldSendIdmapSources() {
return nil
}

return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool {
return m.IsBind() && m.IsIDMapped()
})
}

func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error {
// Elements on these slices will be paired with mounts (see StartInitialization() and
// prepareRootfs()). These slices MUST have the same size as c.config.Mounts.
Expand Down Expand Up @@ -592,6 +634,9 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
if err := c.sendMountSources(cmd, messageSockPair); err != nil {
return nil, err
}
if err := c.sendIdmapSources(cmd, messageSockPair); err != nil {
return nil, err
}

init := &initProcess{
cmd: cmd,
Expand Down Expand Up @@ -2256,6 +2301,29 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
})
}

// Idmap mount sources to open.
if it == initStandard && c.shouldSendIdmapSources() {
var mounts []byte
for _, m := range c.config.Mounts {
if m.IsBind() && m.IsIDMapped() {
// While other parts of the code check this too (like
// libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer
// users don't use those functions.
if strings.IndexByte(m.Source, 0) >= 0 {
return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
}

mounts = append(mounts, []byte(m.Source)...)
}
mounts = append(mounts, byte(0))
}

r.AddData(&Bytemsg{
Type: IdmapSourcesAttr,
Value: mounts,
})
}

return bytes.NewReader(r.Serialize()), nil
}

Expand Down
21 changes: 15 additions & 6 deletions libcontainer/init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,15 @@ type network struct {
}

type mountFds struct {
// Fds to use as source when mounting
// Size should be the same as container mounts, as it will be paired.
// sourceFds are the fds to use as source when mounting.
// The slice size should be the same as container mounts, as it will be
// paired with them.
// The value -1 is used when no fd is needed for the mount.
// Can't have a valid fd in the same position that other slices in this struct.
// We need to use only one of these fds on any single mount.
sourceFds []int
// Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount().
idmapFds []int
}

// initConfig is used for transferring parameters from Exec() to Init()
Expand Down Expand Up @@ -142,6 +145,12 @@ func StartInitialization() (retErr error) {
return err
}

// Get idmap fds.
idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS")
if err != nil {
return err
}

// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
Expand All @@ -157,7 +166,7 @@ func StartInitialization() (retErr error) {
}()

// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds})
return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
}

func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error {
Expand All @@ -170,9 +179,9 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
}
switch t {
case initSetns:
// mountFds must be nil in this case. We don't mount while doing runc exec.
if mountFds.sourceFds != nil {
return errors.New("mount source fds must be nil; can't mount from exec")
// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
if mountFds.sourceFds != nil || mountFds.idmapFds != nil {
return errors.New("mount and idmap fds must be nil; can't mount from exec")
}

i := &linuxSetnsInit{
Expand Down
1 change: 1 addition & 0 deletions libcontainer/message_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const (
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
MountSourcesAttr uint16 = 27290
IdmapSourcesAttr uint16 = 27291
)

type Int32msg struct {
Expand Down
117 changes: 117 additions & 0 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
/* Get all of the CLONE_NEW* flags. */
#include "namespace.h"

/* Get definitions for idmap sources */
#include "idmap.h"

/* Synchronisation values. */
enum sync_t {
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
Expand All @@ -43,6 +46,8 @@ enum sync_t {
SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */
SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */
SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */
SYNC_MOUNT_IDMAP_PLS = 0x48, /* Tell parent to mount idmap sources. */
SYNC_MOUNT_IDMAP_ACK = 0x49, /* All idmap mounts have been done. */
};

#define STAGE_SETUP -1
Expand Down Expand Up @@ -95,6 +100,10 @@ struct nlconfig_t {
/* Mount sources opened outside the container userns. */
char *mountsources;
size_t mountsources_len;

/* Idmap sources opened outside the container userns which will be id mapped. */
char *idmapsources;
size_t idmapsources_len;
};

/*
Expand All @@ -112,6 +121,7 @@ struct nlconfig_t {
#define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289
#define MOUNT_SOURCES_ATTR 27290
#define IDMAP_SOURCES_ATTR 27291

/*
* Use the raw syscall for versions of glibc which don't include a function for
Expand Down Expand Up @@ -431,6 +441,10 @@ static void nl_parse(int fd, struct nlconfig_t *config)
config->mountsources = current;
config->mountsources_len = payload_len;
break;
case IDMAP_SOURCES_ATTR:
config->idmapsources = current;
config->idmapsources_len = payload_len;
break;
default:
bail("unknown netlink message type %d", nlattr->nla_type);
}
Expand Down Expand Up @@ -650,6 +664,83 @@ void try_unshare(int flags, const char *msg)
bail("failed to unshare %s", msg);
}

void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len)
{
char proc_user_path[PATH_MAX];

/* Open the userns fd only once.
* Currently we only support idmap mounts that use the same mapping than
* the userns. This is validated in libcontainer/configs/validate/validator.go,
* so if we reached here, we know the mapping for the idmap is the same
* as the userns. This is why we just open the userns_fd once from the
* PID of the child process that has the userns already applied.
*/
int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid);
if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) {
sane_kill(pid, SIGKILL);
bail("failed to create userns path string");
}

int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY);
if (userns_fd < 0) {
sane_kill(pid, SIGKILL);
bail("failed to get user namespace fd");
}

char *idmap_end = idmap_src + idmap_src_len;
while (idmap_src < idmap_end) {
if (idmap_src[0] == '\0') {
idmap_src++;
continue;
}

int fd_tree = sys_open_tree(-EBADF, idmap_src,
OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT);
if (fd_tree < 0) {
sane_kill(pid, SIGKILL);
if (errno == EINVAL)
bail("failed to use open_tree(2) with path: %s, the kernel doesn't supports ID-mapped mounts", idmap_src);
else
bail("failed to use open_tree(2) with path: %s", idmap_src);
}

struct mount_attr attr = {
.attr_set = MOUNT_ATTR_IDMAP,
.userns_fd = userns_fd,
};

ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr));
if (ret < 0) {
sane_kill(pid, SIGKILL);
if (errno == EINVAL)
bail("failed to change mount attributes, maybe the filesystem doesn't supports ID-mapped mounts");
else
bail("failed to change mount attributes");
}

write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path);
send_fd(sockfd, fd_tree);

if (close(fd_tree) < 0) {
sane_kill(pid, SIGKILL);
bail("error closing fd_tree");
}

idmap_src += strlen(idmap_src) + 1;
}

if (close(userns_fd) < 0) {
sane_kill(pid, SIGKILL);
bail("error closing userns fd");
}
}

void receive_idmapsources(int sockfd)
{
receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS");
}

void nsexec(void)
{
int pipenum;
Expand Down Expand Up @@ -891,6 +982,17 @@ void nsexec(void)
sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
}
break;
case SYNC_MOUNT_IDMAP_PLS:
write_log(DEBUG, "stage-1 requested to open idmap sources");
send_idmapsources(syncfd, stage1_pid, config.idmapsources,
config.idmapsources_len);
s = SYNC_MOUNT_IDMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)");
}

break;
case SYNC_CHILD_FINISH:
write_log(DEBUG, "stage-1 complete");
Expand Down Expand Up @@ -1062,6 +1164,21 @@ void nsexec(void)
bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
}

if (config.idmapsources) {
write_log(DEBUG, "request stage-0 to send idmap sources");
s = SYNC_MOUNT_IDMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)");

/* Receive and install all idmap fds. */
receive_idmapsources(syncfd);

if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)");
if (s != SYNC_MOUNT_IDMAP_ACK)
bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s);
}

/*
* TODO: What about non-namespace clone flags that we're dropping here?
*
Expand Down

0 comments on commit fda12ab

Please sign in to comment.