From a6023ce6329dc6d18e9f8ef0ad88b7eddeadac61 Mon Sep 17 00:00:00 2001
From: ctalledo <ctalledo@nestybox.com>
Date: Thu, 28 Apr 2022 23:52:31 +0000
Subject: [PATCH] rootfs: add mount destination validation

NOTE: based on a similar fix on the OCI runc (see commit 0ca91f44f1664da834bc61115a849b56d22f595f).
Several adjustments for sysbox-runc done on top of that commit.

Original OCI runc commit:

rootfs: add mount destination validation

Because the target of a mount is inside a container (which may be a
volume that is shared with another container), there exists a race
condition where the target of the mount may change to a path containing
a symlink after we have sanitised the path -- resulting in us
inadvertently mounting the path outside of the container.

This is not immediately useful because we are in a mount namespace with
MS_SLAVE mount propagation applied to "/", so we cannot mount on top of
host paths in the host namespace. However, if any subsequent mountpoints
in the configuration use a subdirectory of that host path as a source,
those subsequent mounts will use an attacker-controlled source path
(resolved within the host rootfs) -- allowing the bind-mounting of "/"
into the container.

While arguably configuration issues like this are not entirely within
runc's threat model, within the context of Kubernetes (and possibly
other container managers that provide semi-arbitrary container creation
privileges to untrusted users) this is a legitimate issue. Since we
cannot block mounting from the host into the container, we need to block
the first stage of this attack (mounting onto a path outside the
container).

The long-term plan to solve this would be to migrate to libpathrs, but
as a stop-gap we implement libpathrs-like path verification through
readlink(/proc/self/fd/$n) and then do mount operations through the
procfd once it's been verified to be inside the container. The target
could move after we've checked it, but if it is inside the container
then we can assume that it is safe for the same reason that libpathrs
operations would be safe.

A slight wrinkle is the "copyup" functionality we provide for tmpfs,
which is the only case where we want to do a mount on the host
filesystem. To facilitate this, I split out the copy-up functionality
entirely so that the logic isn't interspersed with the regular tmpfs
logic. In addition, all dependencies on m.Destination being overwritten
have been removed since that pattern was just begging to be a source of
more mount-target bugs (we do still have to modify m.Destination for
tmpfs-copyup but we only do it temporarily).

Fixes: CVE-2021-30465
Reported-by: Etienne Champetier <champetier.etienne@gmail.com>
Co-authored-by: Noah Meyerhans <nmeyerha@amazon.com>
Reviewed-by: Samuel Karp <skarp@amazon.com>
Reviewed-by: Kir Kolyshkin <kolyshkin@gmail.com> (@kolyshkin)
Reviewed-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>

===

Signed-off-by: Cesar Talledo <ctalledo@nestybox.com>
---
 libcontainer/container_linux.go   |   1 +
 libcontainer/rootfs_init_linux.go |  78 ++++++----
 libcontainer/rootfs_linux.go      | 249 +++++++++++++++++-------------
 libcontainer/utils/utils.go       |  71 +++++++++
 libcontainer/utils/utils_test.go  |  35 +++++
 libsysbox/shiftfs/shiftfs.go      |   2 +
 6 files changed, 297 insertions(+), 139 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 9c79260a9..da9eeb4fa 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -2474,6 +2474,7 @@ func (c *linuxContainer) handleOp(op opReqType, childPid int, reqs []opReq) erro
 	case bind, chown, mkdir:
 		namespaces = append(namespaces,
 			fmt.Sprintf("mnt:/proc/%d/ns/mnt", childPid),
+			fmt.Sprintf("pid:/proc/%d/ns/pid", childPid),
 		)
 	case switchDockerDns:
 		namespaces = append(namespaces,
diff --git a/libcontainer/rootfs_init_linux.go b/libcontainer/rootfs_init_linux.go
index f667493dc..d4daf1758 100644
--- a/libcontainer/rootfs_init_linux.go
+++ b/libcontainer/rootfs_init_linux.go
@@ -18,6 +18,7 @@ import (
 
 	sh "github.com/nestybox/sysbox-libs/idShiftUtils"
 	utils "github.com/nestybox/sysbox-libs/utils"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 )
 
 type linuxRootfsInit struct {
@@ -75,7 +76,7 @@ func iptablesRestoreHasWait() (bool, error) {
 	return verConstraint.Check(ver), nil
 }
 
-func doBindMount(m *configs.Mount) error {
+func doBindMount(rootfs string, m *configs.Mount) error {
 
 	// sysbox-runc: For some reason, when the rootfs is on shiftfs, we
 	// need to do an Lstat() of the source path prior to doing the
@@ -91,26 +92,38 @@ func doBindMount(m *configs.Mount) error {
 	if !m.BindSrcInfo.IsDir {
 		src = filepath.Dir(m.Source)
 	}
-
 	os.Lstat(src)
-	if err := unix.Mount(m.Source, m.Destination, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
 
-		// We've noticed that the lstat and/or mount syscall fails with EPERM when
-		// bind-mounting a source dir that is on a shiftfs mount on top of a tmpfs
-		// mount. For some reason the Linux "mount" command does not fail in this case,
-		// so let's try it.
+	// Bind-mount with procfd to mitigate symlink exchange attacks.
+
+	if err := libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
+		if err := unix.Mount(m.Source, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
 
-		cmd := exec.Command("/bin/mount", "--rbind", m.Source, m.Destination)
-		err := cmd.Run()
-		if err != nil {
-			return fmt.Errorf("bind-mount of %s to %s failed: %v", m.Source, m.Destination, err)
+			// We've noticed that the lstat and/or mount syscall fails with EPERM when
+			// bind-mounting a source dir that is on a shiftfs mount on top of a tmpfs
+			// mount. For some reason the Linux "mount" command does not fail in this case,
+			// so let's try it.
+			cmd := exec.Command("/bin/mount", "--rbind", m.Source, procfd)
+			err := cmd.Run()
+			if err != nil {
+				realpath, _ := os.Readlink(procfd)
+				return fmt.Errorf("bind-mount of %s to %s failed: %v", m.Source, realpath, err)
+			}
 		}
+		return nil
+	}); err != nil {
+		return fmt.Errorf("bind mount through procfd of %s -> %s: %w", m.Source, m.Destination, err)
 	}
 
-	for _, pflag := range m.PropagationFlags {
-		if err := unix.Mount("", m.Destination, "", uintptr(pflag), ""); err != nil {
-			return err
+	if err := libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
+		for _, pflag := range m.PropagationFlags {
+			if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
+				return err
+			}
 		}
+		return nil
+	}); err != nil {
+		return fmt.Errorf("change bind mount propagation through procfd: %w", err)
 	}
 
 	return nil
@@ -246,7 +259,6 @@ func (l *linuxRootfsInit) Init() error {
 
 	// If multiple requests are passed in the slice, they must all be
 	// of the same type.
-
 	switch l.reqs[0].Op {
 
 	case bind:
@@ -256,14 +268,21 @@ func (l *linuxRootfsInit) Init() error {
 			return newSystemErrorWithCausef(err, "chdir to rootfs %s", rootfs)
 		}
 
-		initPid := l.reqs[0].InitPid
-		usernsPath := fmt.Sprintf("/proc/%d/ns/user", initPid)
+		// We are in the pid and mount ns of the container's init process; remount
+		// /proc so that it picks up this fact.
+		os.Lstat("/proc")
+		if err := unix.Mount("proc", "/proc", "proc", 0, ""); err != nil {
+			return newSystemErrorWithCause(err, "re-mounting procfs")
+		}
+		defer unix.Unmount("/proc", unix.MNT_DETACH)
+
+		usernsPath := "/proc/1/ns/user"
 
 		for _, req := range l.reqs {
 			m := &req.Mount
 			mountLabel := req.Label
 
-			if err := doBindMount(m); err != nil {
+			if err := doBindMount(rootfs, m); err != nil {
 				return newSystemErrorWithCausef(err, "bind mounting %s to %s", m.Source, m.Destination)
 			}
 
@@ -290,18 +309,17 @@ func (l *linuxRootfsInit) Init() error {
 
 			// Set up the ID-mapping as needed
 			if m.IDMappedMount {
-
-				// Note: arguments to IDMapMount() must be absolute.
-				target, err := securejoin.SecureJoin(rootfs, m.Destination)
-				if err != nil {
-					return err
-				}
-
-				if err := sh.IDMapMount(usernsPath, target); err != nil {
-					fsName, _ := utils.GetFsName(target)
-					return newSystemErrorWithCausef(err,
-						"setting up ID-mapped mount on userns %s, path %s (likely means idmapped mounts are not supported on the filesystem at this path (%s))",
-						usernsPath, target, fsName)
+				if err := libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
+					if err := sh.IDMapMount(usernsPath, procfd); err != nil {
+						fsName, _ := utils.GetFsName(procfd)
+						realpath, _ := os.Readlink(procfd)
+						return newSystemErrorWithCausef(err,
+							"setting up ID-mapped mount on path %s (likely means idmapped mounts are not supported on the filesystem at this path (%s))",
+							realpath, fsName)
+					}
+					return nil
+				}); err != nil {
+					return newSystemErrorWithCausef(err, "ID-map mount on %s", m.Destination)
 				}
 			}
 		}
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 1bd653af6..288167912 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -33,6 +33,7 @@ import (
 
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/opencontainers/selinux/go-selinux/label"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -249,23 +250,32 @@ func mountCgroupV1(m *configs.Mount, enableCgroupns bool, config *configs.Config
 	}
 	for _, b := range binds {
 		if enableCgroupns {
+
 			// sysbox-runc: use relative path (as otherwise we may not have permission to mkdir)
-			subsystemPath := b.Destination
-			if err := mkdirall(subsystemPath, 0755, config, pipe); err != nil {
+			subsystemPath, err := securejoin.SecureJoin(".", b.Destination)
+			if err != nil {
 				return err
 			}
-			flags := defaultMountFlags
-			if m.Flags&unix.MS_RDONLY != 0 {
-				flags = flags | unix.MS_RDONLY
-			}
-			cgroupmount := &configs.Mount{
-				Source:      "cgroup",
-				Device:      "cgroup", // this is actually fstype
-				Destination: subsystemPath,
-				Flags:       flags,
-				Data:        filepath.Base(subsystemPath),
+
+			if err := mkdirall(subsystemPath, 0755, config, pipe); err != nil {
+				return err
 			}
-			if err := mountNewCgroup(cgroupmount); err != nil {
+
+			if err := utils.WithProcfd(".", subsystemPath, func(procfd string) error {
+				flags := defaultMountFlags
+				if m.Flags&unix.MS_RDONLY != 0 {
+					flags = flags | unix.MS_RDONLY
+				}
+				var (
+					source = "cgroup"
+					data   = filepath.Base(subsystemPath)
+				)
+				if data == "systemd" {
+					data = cgroups.CgroupNamePrefix + data
+					source = "systemd"
+				}
+				return unix.Mount(source, procfd, "cgroup", uintptr(flags), data)
+			}); err != nil {
 				return err
 			}
 		} else {
@@ -288,21 +298,71 @@ func mountCgroupV1(m *configs.Mount, enableCgroupns bool, config *configs.Config
 }
 
 func mountCgroupV2(m *configs.Mount, enableCgroupns bool, config *configs.Config, pipe io.ReadWriter) error {
+
+	// sysbox-runc: use relative path (as otherwise we may not have permission to mkdir)
 	cgroupPath, err := securejoin.SecureJoin(".", m.Destination)
 	if err != nil {
 		return err
 	}
+
 	if err := mkdirall(cgroupPath, 0755, config, pipe); err != nil {
 		return err
 	}
-	if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
-		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
-		if err == unix.EPERM || err == unix.EBUSY {
-			return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "")
+
+	return utils.WithProcfd(".", cgroupPath, func(procfd string) error {
+		if err := unix.Mount(m.Source, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
+			// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
+			if err == unix.EPERM || err == unix.EBUSY {
+				return unix.Mount("/sys/fs/cgroup", procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
+			}
+			return nil
 		}
+		return nil
+	})
+}
+
+func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
+	// Set up a scratch dir for the tmpfs on the host.
+	tmpdir, err := prepareTmp("/tmp")
+	if err != nil {
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
+	}
+	defer cleanupTmp(tmpdir)
+	tmpDir, err := ioutil.TempDir(tmpdir, "runctmpdir")
+	if err != nil {
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
+	}
+	defer os.RemoveAll(tmpDir)
+
+	// Configure the *host* tmpdir as if it's the container mount. We change
+	// m.Destination since we are going to mount *on the host*.
+	oldDest := m.Destination
+	m.Destination = tmpDir
+	err = mountPropagate(m, "/", mountLabel)
+	m.Destination = oldDest
+	if err != nil {
 		return err
 	}
-	return nil
+	defer func() {
+		if Err != nil {
+			if err := unix.Unmount(tmpDir, unix.MNT_DETACH); err != nil {
+				logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err)
+			}
+		}
+	}()
+	return utils.WithProcfd(".", m.Destination, func(procfd string) (Err error) {
+		// Copy the container data to the host tmpdir. We append "/" to force
+		// CopyDirectory to resolve the symlink rather than trying to copy the
+		// symlink itself.
+		if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil {
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, procfd, tmpDir, err)
+		}
+		// Now move the mount into the container.
+		if err := unix.Mount(tmpDir, procfd, "", unix.MS_MOVE, ""); err != nil {
+			return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %w", tmpDir, procfd, m.Destination, err)
+		}
+		return nil
+	})
 }
 
 // mkdirall calls into os.Mkdirall(), but precedes the call with an open of the current
@@ -349,14 +409,17 @@ func mkdirall(path string, mode os.FileMode, config *configs.Config, pipe io.Rea
 }
 
 func mountToRootfs(m *configs.Mount, config *configs.Config, enableCgroupns bool, pipe io.ReadWriter) error {
-	var (
-		mountLabel = config.MountLabel
-		dest       = m.Destination
-	)
 
-	// This function assumes cwd is the container's rootfs
-	dest = filepath.Join(".", dest)
-	m.Destination = dest
+	mountLabel := config.MountLabel
+
+	// sysbox-runc: use relative path for the rootfs as we may not have access to it via the abs path.
+	rootfs := "."
+
+	// Verify the mount destination is within the container's rootfs
+	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
+	if err != nil {
+		return err
+	}
 
 	switch m.Device {
 	case "proc", "sysfs":
@@ -376,63 +439,30 @@ func mountToRootfs(m *configs.Mount, config *configs.Config, enableCgroupns bool
 			return fmt.Errorf("failed to created dir for %s mount: %v", m.Device, err)
 		}
 		// Selinux kernels do not support labeling of /proc or /sys
-		return mountPropagate(m, "")
+		return mountPropagate(m, ".", "")
 	case "mqueue":
 		if err := mkdirall(dest, 0755, config, pipe); err != nil {
 			return err
 		}
-		if err := mountPropagate(m, ""); err != nil {
+		if err := mountPropagate(m, rootfs, ""); err != nil {
 			return err
 		}
 		return label.SetFileLabel(dest, mountLabel)
 	case "tmpfs":
-		copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
-		tmpDir := ""
-		// dest might be an absolute symlink, so it needs
-		// to be resolved under rootfs.
-		dest, err := securejoin.SecureJoin(".", m.Destination)
-		if err != nil {
-			return err
-		}
-		m.Destination = dest
 		stat, err := os.Stat(dest)
 		if err != nil {
 			if err := mkdirall(dest, 0755, config, pipe); err != nil {
 				return err
 			}
 		}
-		if copyUp {
-			tmpdir, err := prepareTmp("/tmp")
-			if err != nil {
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
-			}
-			defer cleanupTmp(tmpdir)
-			tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
-			if err != nil {
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
-			}
-			defer os.RemoveAll(tmpDir)
-			m.Destination = tmpDir
+		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
+			err = doTmpfsCopyUp(m, rootfs, mountLabel)
+		} else {
+			err = mountPropagate(m, rootfs, mountLabel)
 		}
-		if err := mountPropagate(m, mountLabel); err != nil {
+		if err != nil {
 			return err
 		}
-		if copyUp {
-			if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
-				errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
-				}
-				return errMsg
-			}
-			if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
-				errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
-				}
-				return errMsg
-			}
-		}
 		if stat != nil {
 			if err = os.Chmod(dest, stat.Mode()); err != nil {
 				return err
@@ -458,7 +488,7 @@ func mountToRootfs(m *configs.Mount, config *configs.Config, enableCgroupns bool
 		if err := mkdirall(dest, 0755, config, pipe); err != nil {
 			return err
 		}
-		return mountPropagate(m, mountLabel)
+		return mountPropagate(m, rootfs, mountLabel)
 	}
 }
 
@@ -667,7 +697,7 @@ func createDevices(config *configs.Config, pipe io.ReadWriter) error {
 	return nil
 }
 
-func bindMountDeviceNode(dest string, node *devices.Device) error {
+func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
 	f, err := os.Create(dest)
 	if err != nil && !os.IsExist(err) {
 		return err
@@ -675,8 +705,9 @@ func bindMountDeviceNode(dest string, node *devices.Device) error {
 	if f != nil {
 		f.Close()
 	}
-
-	return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
+	return utils.WithProcfd(rootfs, dest, func(procfd string) error {
+		return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "")
+	})
 }
 
 // Creates the device node in the rootfs of the container.
@@ -685,19 +716,26 @@ func createDeviceNode(node *devices.Device, bind bool, config *configs.Config, p
 		// The node only exists for cgroup reasons, ignore it here.
 		return nil
 	}
-	dest := filepath.Join(".", node.Path)
 
+	// sysbox-runc: use relative path for the rootfs as we may not have access to it via the abs path.
+	rootfs := "."
+
+	// Verify the device node path is within the container's rootfs
+	dest, err := securejoin.SecureJoin(rootfs, node.Path)
+	if err != nil {
+		return err
+	}
 	if err := mkdirall(filepath.Dir(dest), 0755, config, pipe); err != nil {
 		return err
 	}
 	if bind {
-		return bindMountDeviceNode(dest, node)
+		return bindMountDeviceNode(rootfs, dest, node)
 	}
 	if err := mknodDevice(dest, node); err != nil {
 		if os.IsExist(err) {
 			return nil
 		} else if os.IsPermission(err) {
-			return bindMountDeviceNode(dest, node)
+			return bindMountDeviceNode(rootfs, dest, node)
 		}
 		return err
 	}
@@ -1057,55 +1095,48 @@ func remount(m *configs.Mount) error {
 	}
 	flags |= uintptr(s.Flags)
 
-	if err := unix.Mount("", m.Destination, "", flags, ""); err != nil {
-		return fmt.Errorf("failed to remount %s with flags %#x", m.Destination, int(flags))
-	}
-
-	return nil
+	return utils.WithProcfd(".", m.Destination, func(procfd string) error {
+		return unix.Mount("", procfd, "", flags, "")
+	})
 }
 
 // Do the mount operation followed by additional mounts required to take care
-// of propagation flags.
-func mountPropagate(m *configs.Mount, mountLabel string) error {
+// of propagation flags. This will always be scoped inside the container rootfs.
+func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
 	var (
-		dest  = m.Destination
 		data  = label.FormatMountLabel(m.Data, mountLabel)
 		flags = m.Flags
 	)
-	if dest == "dev" {
-		flags &= ^unix.MS_RDONLY
-	}
 
-	// Mount it rw to allow chmod operation. A remount will be performed
-	// later to make it ro if set.
-	if m.Device == "tmpfs" {
+	// Delay mounting the filesystem read-only if we need to do further
+	// operations on it. We need to set up files in "/dev" and tmpfs mounts may
+	// need to be chmod-ed after mounting. The mount will be remounted ro later
+	// in finalizeRootfs() if necessary.
+	if libcontainerUtils.CleanPath(m.Destination) == "/dev" || m.Device == "tmpfs" {
 		flags &= ^unix.MS_RDONLY
 	}
 
-	if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
-		return err
-	}
-
-	for _, pflag := range m.PropagationFlags {
-		if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
-			return err
+	// Because the destination is inside a container path which might be
+	// mutating underneath us, we verify that we are actually going to mount
+	// inside the container with WithProcfd() -- mounting through a procfd
+	// mounts on the target.
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
+	}); err != nil {
+		return fmt.Errorf("mount through procfd: %w", err)
+	}
+	// We have to apply mount propagation flags in a separate WithProcfd() call
+	// because the previous call invalidates the passed procfd -- the mount
+	// target needs to be re-opened.
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
+		for _, pflag := range m.PropagationFlags {
+			if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
+				return err
+			}
 		}
-	}
-
-	return nil
-}
-
-func mountNewCgroup(m *configs.Mount) error {
-	var (
-		data   = m.Data
-		source = m.Source
-	)
-	if data == "systemd" {
-		data = cgroups.CgroupNamePrefix + data
-		source = "systemd"
-	}
-	if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil {
-		return err
+		return nil
+	}); err != nil {
+		return fmt.Errorf("change mount propagation through procfd: %w", err)
 	}
 	return nil
 }
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
index a92453e54..bcd3bbf51 100644
--- a/libcontainer/utils/utils.go
+++ b/libcontainer/utils/utils.go
@@ -2,12 +2,15 @@ package utils
 
 import (
 	"encoding/json"
+	"fmt"
 	"io"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"unsafe"
 
+	securejoin "github.com/cyphar/filepath-securejoin"
 	"golang.org/x/sys/unix"
 )
 
@@ -73,6 +76,74 @@ func CleanPath(path string) string {
 	return filepath.Clean(path)
 }
 
+// StripRoot returns the passed path, stripping the root path if it was
+// (lexicially) inside it. Note that both passed paths will always be treated
+// as absolute, and the returned path will also always be absolute. In
+// addition, the paths are cleaned before stripping the root.
+func StripRoot(root, path string) string {
+	// Make the paths clean and absolute.
+	root, path = CleanPath("/"+root), CleanPath("/"+path)
+	switch {
+	case path == root:
+		path = "/"
+	case root == "/":
+		// do nothing
+	case strings.HasPrefix(path, root+"/"):
+		path = strings.TrimPrefix(path, root+"/")
+	}
+	return CleanPath("/" + path)
+}
+
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
+// corresponding to the unsafePath resolved within the root. Before passing the
+// fd, this path is verified to have been inside the root -- so operating on it
+// through the passed fdpath should be safe. Do not access this path through
+// the original path strings, and do not attempt to use the pathname outside of
+// the passed closure (the file handle will be freed once the closure returns).
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+
+	// Remove the root then forcefully resolve inside the root.
+	relUnsafePath := StripRoot(root, unsafePath)
+	path, err := securejoin.SecureJoin(root, relUnsafePath)
+	if err != nil {
+		return fmt.Errorf("resolving path inside rootfs failed: %v", err)
+	}
+
+	// Open the target path.
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("open o_path procfd: %w", err)
+	}
+	defer fh.Close()
+
+	// Double-check the path is the one we expected.
+	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
+	realpath, err := os.Readlink(procfd)
+	if err != nil {
+		return err
+	}
+
+	// The realPath has the absolute path; if root is cwd, then we need the relative path
+	if root == "." {
+		rootAbs, err := os.Readlink("/proc/self/cwd")
+		if err != nil {
+			return err
+		}
+		if !strings.HasSuffix(rootAbs, "/") {
+			rootAbs = rootAbs + "/"
+		}
+		realpath = strings.TrimPrefix(realpath, rootAbs)
+	}
+
+	// Verify the path string and /proc/self/fd match
+	if realpath != path {
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+	}
+
+	// Run the closure.
+	return fn(procfd)
+}
+
 // SearchLabels searches a list of key-value pairs for the provided key and
 // returns the corresponding value. The pairs must be separated with '='.
 func SearchLabels(labels []string, query string) string {
diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go
index 7f38ed169..d33662238 100644
--- a/libcontainer/utils/utils_test.go
+++ b/libcontainer/utils/utils_test.go
@@ -143,3 +143,38 @@ func TestCleanPath(t *testing.T) {
 		t.Errorf("expected to receive '/foo' and received %s", path)
 	}
 }
+
+func TestStripRoot(t *testing.T) {
+	for _, test := range []struct {
+		root, path, out string
+	}{
+		// Works with multiple components.
+		{"/a/b", "/a/b/c", "/c"},
+		{"/hello/world", "/hello/world/the/quick-brown/fox", "/the/quick-brown/fox"},
+		// '/' must be a no-op.
+		{"/", "/a/b/c", "/a/b/c"},
+		// Must be the correct order.
+		{"/a/b", "/a/c/b", "/a/c/b"},
+		// Must be at start.
+		{"/abc/def", "/foo/abc/def/bar", "/foo/abc/def/bar"},
+		// Must be a lexical parent.
+		{"/foo/bar", "/foo/barSAMECOMPONENT", "/foo/barSAMECOMPONENT"},
+		// Must only strip the root once.
+		{"/foo/bar", "/foo/bar/foo/bar/baz", "/foo/bar/baz"},
+		// Deal with .. in a fairly sane way.
+		{"/foo/bar", "/foo/bar/../baz", "/foo/baz"},
+		{"/foo/bar", "../../../../../../foo/bar/baz", "/baz"},
+		{"/foo/bar", "/../../../../../../foo/bar/baz", "/baz"},
+		{"/foo/bar/../baz", "/foo/baz/bar", "/bar"},
+		{"/foo/bar/../baz", "/foo/baz/../bar/../baz/./foo", "/foo"},
+		// All paths are made absolute before stripping.
+		{"foo/bar", "/foo/bar/baz/bee", "/baz/bee"},
+		{"/foo/bar", "foo/bar/baz/beef", "/baz/beef"},
+		{"foo/bar", "foo/bar/baz/beets", "/baz/beets"},
+	} {
+		got := stripRoot(test.root, test.path)
+		if got != test.out {
+			t.Errorf("stripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out)
+		}
+	}
+}
diff --git a/libsysbox/shiftfs/shiftfs.go b/libsysbox/shiftfs/shiftfs.go
index 36a6849e8..0aeeaacf0 100644
--- a/libsysbox/shiftfs/shiftfs.go
+++ b/libsysbox/shiftfs/shiftfs.go
@@ -24,6 +24,8 @@ import (
 	"golang.org/x/sys/unix"
 )
 
+const SHIFTFS_MAGIC int64 = 0x6a656a62
+
 // Mark performs a shiftfs mark-mount for path on the given markPath
 // (e.g., Mark("/a/b", "/c/d") causes "b" to be mounted on "d" and
 // "d" to have a shiftfs mark).