From e935fa6e264f94789a92a0a5b7025b2dece9ff37 Mon Sep 17 00:00:00 2001
From: Will Andrews
Date: Sun, 21 Feb 2021 10:19:43 -0600
Subject: [PATCH] Add Linux namespace delegation support

This allows ZFS datasets to be delegated to a user/mount namespace.
Within that namespace, only the delegated datasets are visible.
Works very similarly to Zones/Jails on other ZFS OSes.

As a user:
```
$ unshare -Um
$ zfs list
no datasets available
$ readlink /proc/self/ns/user
user:[4026532291]
```

As root:
```
# zfs list
NAME ZONED MOUNTPOINT
containers off /containers
containers/host off /containers/host
containers/host/child off /containers/host/child
containers/host/child/gchild off /containers/host/child/gchild
containers/unpriv on /unpriv
containers/unpriv/child on /unpriv/child
containers/unpriv/child/gchild on /unpriv/child/gchild
# zfs zone 4026532291 containers/unpriv
```

Back to the user namespace:
```
$ zfs list
NAME USED AVAIL REFER MOUNTPOINT
containers 129M 47.8G 24K /containers
containers/unpriv 128M 47.8G 24K /unpriv
containers/unpriv/child 128M 47.8G 128M /unpriv/child
```

Signed-off-by: Will Andrews
Signed-off-by: Allan Jude
Sponsored-by: Buddy
---
 cmd/zfs/zfs_main.c | 67 ++++
 config/kernel-user-ns-inum.m4 | 23 ++
 config/kernel.m4 | 2 +
 include/libzfs.h | 9 +
 include/os/linux/spl/sys/zone.h | 31 +-
 include/sys/fs/zfs.h | 2 +
 lib/libspl/include/sys/types.h | 2 +-
 lib/libspl/include/zone.h | 12 +-
 lib/libspl/os/linux/zone.c | 32 +-
 lib/libuutil/libuutil.abi | 2 +-
 lib/libzfs/libzfs.abi | 9 +-
 lib/libzfs/os/linux/libzfs_util_os.c | 52 +++
 lib/libzfs_core/libzfs_core.abi | 2 +-
 man/man7/zfsprops.7 | 3 +-
 man/man8/zfs-zone.8 | 103 ++++++
 module/os/linux/spl/Makefile.in | 1 +
 module/os/linux/spl/spl-generic.c | 6 +
 module/os/linux/spl/spl-zone.c | 324 ++++++++++++++++++
 module/os/linux/zfs/policy.c | 2 +-
 module/os/linux/zfs/zfs_ioctl_os.c | 26 ++
 module/os/linux/zfs/zfs_vfsops.c | 17 +
 module/os/linux/zfs/zpl_super.c | 1 +
 scripts/zfs-tests.sh | 1 -
tests/runfiles/linux.run | 2 +- tests/zfs-tests/include/commands.cfg | 2 + .../functional/user_namespace/Makefile.am | 3 +- .../user_namespace/user_namespace_002.ksh | 109 ++++++ 27 files changed, 829 insertions(+), 16 deletions(-) create mode 100644 config/kernel-user-ns-inum.m4 create mode 100644 man/man8/zfs-zone.8 create mode 100644 module/os/linux/spl/spl-zone.c create mode 100755 tests/zfs-tests/tests/functional/user_namespace/user_namespace_002.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index d05cb29c69d6..2e14997ce31c 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -127,6 +127,11 @@ static int zfs_do_jail(int argc, char **argv); static int zfs_do_unjail(int argc, char **argv); #endif +#ifdef __linux__ +static int zfs_do_zone(int argc, char **argv); +static int zfs_do_unzone(int argc, char **argv); +#endif + /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. */ @@ -184,6 +189,8 @@ typedef enum { HELP_JAIL, HELP_UNJAIL, HELP_WAIT, + HELP_ZONE, + HELP_UNZONE, } zfs_help_t; typedef struct zfs_command { @@ -254,6 +261,11 @@ static zfs_command_t command_table[] = { { "jail", zfs_do_jail, HELP_JAIL }, { "unjail", zfs_do_unjail, HELP_UNJAIL }, #endif + +#ifdef __linux__ + { "zone", zfs_do_zone, HELP_ZONE }, + { "unzone", zfs_do_unzone, HELP_UNZONE }, +#endif }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) @@ -414,6 +426,10 @@ get_usage(zfs_help_t idx) return (gettext("\tunjail \n")); case HELP_WAIT: return (gettext("\twait [-t ] \n")); + case HELP_ZONE: + return (gettext("\tzone \n")); + case HELP_UNZONE: + return (gettext("\tunzone \n")); default: __builtin_unreachable(); } @@ -8728,6 +8744,57 @@ main(int argc, char **argv) return (ret); } +/* + * zfs zone nsnum filesystem + * + * Add or delete the given dataset to/from the namespace. 
+ */
+#ifdef __linux__
+static int
+zfs_do_zone_impl(int argc, char **argv, boolean_t attach)
+{
+	zfs_handle_t *zhp;
+	unsigned long nsnum;
+	char *end;
+	int ret;
+
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing argument(s)\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	nsnum = strtoul(argv[1], &end, 10);
+	if (end == argv[1] || *end != '\0' || nsnum > UINT_MAX) {
+		(void) fprintf(stderr, gettext("invalid namespace number\n"));
+		usage(B_FALSE);
+	}
+
+	/* argc is exactly 3: argv[1] is the nsnum, argv[2] the filesystem. */
+	zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM);
+	if (zhp == NULL)
+		return (1);
+	ret = (zfs_userns(zhp, (unsigned int)nsnum, attach) != 0);
+	zfs_close(zhp);
+	return (ret);
+}
+
+static int
+zfs_do_zone(int argc, char **argv)
+{
+	return (zfs_do_zone_impl(argc, argv, B_TRUE));
+}
+
+static int
+zfs_do_unzone(int argc, char **argv)
+{
+	return (zfs_do_zone_impl(argc, argv, B_FALSE));
+}
+#endif
+
 #ifdef __FreeBSD__
 #include
 #include
diff --git a/config/kernel-user-ns-inum.m4 b/config/kernel-user-ns-inum.m4
new file mode 100644
index 000000000000..2207a4aa6921
--- /dev/null
+++ b/config/kernel-user-ns-inum.m4
@@ -0,0 +1,23 @@
+dnl #
+dnl # 3.18 API change
+dnl # struct user_namespace inum moved from .proc_inum to .ns.inum.
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_USER_NS_COMMON_INUM], [ + ZFS_LINUX_TEST_SRC([user_ns_common_inum], [ + #include + ], [ + struct user_namespace uns; + uns.ns.inum = 0; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_USER_NS_COMMON_INUM], [ + AC_MSG_CHECKING([whether user_namespace->ns.inum exists]) + ZFS_LINUX_TEST_RESULT([user_ns_common_inum], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_USER_NS_COMMON_INUM, 1, + [user_namespace->ns.inum exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 0b94f3bd9cb6..a67809b589e4 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -134,6 +134,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG + ZFS_AC_KERNEL_SRC_USER_NS_COMMON_INUM AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) @@ -241,6 +242,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_SET_SPECIAL_STATE ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG + ZFS_AC_KERNEL_USER_NS_COMMON_INUM ]) dnl # diff --git a/include/libzfs.h b/include/libzfs.h index c0883a983678..11b167c66d87 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -963,6 +963,15 @@ _LIBZFS_H int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, #endif /* __FreeBSD__ */ +#ifdef __linux__ + +/* + * Add or delete the given filesystem to/from the given user namespace. 
+ */ +_LIBZFS_H int zfs_userns(zfs_handle_t *zhp, unsigned int nsnum, int attach); + +#endif + #ifdef __cplusplus } #endif diff --git a/include/os/linux/spl/sys/zone.h b/include/os/linux/spl/sys/zone.h index 00e30f690c38..17ee0863ba0f 100644 --- a/include/os/linux/spl/sys/zone.h +++ b/include/os/linux/spl/sys/zone.h @@ -25,11 +25,34 @@ #define _SPL_ZONE_H #include +#include -#define GLOBAL_ZONEID 0 +#include +#include -#define zone_dataset_visible(x, y) (1) -#define crgetzoneid(x) (GLOBAL_ZONEID) -#define INGLOBALZONE(z) (1) +/* + * Attach the given dataset to the given user namespace. + */ +extern int zone_dataset_attach(cred_t *, const char *, unsigned int); + +/* + * Detach the given dataset from the given user namespace. + */ +extern int zone_dataset_detach(cred_t *, const char *, unsigned int); + +/* + * Returns true if the named pool/dataset is visible in the current zone. + */ +extern int zone_dataset_visible(const char *dataset, int *write); + +int spl_zone_init(void); +void spl_zone_fini(void); + +extern unsigned int crgetzoneid(const cred_t *); +extern unsigned int global_zoneid(void); +extern boolean_t inglobalzone(proc_t *); + +#define INGLOBALZONE(x) inglobalzone(x) +#define GLOBAL_ZONEID global_zoneid() #endif /* SPL_ZONE_H */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 2af11fc7196d..9149144387fe 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1372,7 +1372,9 @@ typedef enum zfs_ioc { ZFS_IOC_EVENTS_SEEK, /* 0x83 (Linux) */ ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ + ZFS_IOC_USERNS_ATTACH = ZFS_IOC_JAIL, /* 0x85 (Linux) */ ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ + ZFS_IOC_USERNS_DETACH = ZFS_IOC_UNJAIL, /* 0x86 (Linux) */ ZFS_IOC_SET_BOOTENV, /* 0x87 */ ZFS_IOC_GET_BOOTENV, /* 0x88 */ ZFS_IOC_LAST diff --git a/lib/libspl/include/sys/types.h b/lib/libspl/include/sys/types.h index ea02ffac93ac..04bcbc5cc0d0 100644 --- a/lib/libspl/include/sys/types.h +++ b/lib/libspl/include/sys/types.h @@ 
-44,7 +44,7 @@ #include #endif /* HAVE_INTTYPES */ -typedef int zoneid_t; +typedef uint_t zoneid_t; typedef int projid_t; /* diff --git a/lib/libspl/include/zone.h b/lib/libspl/include/zone.h index b0ac2d9bc610..0af4e7a2fa49 100644 --- a/lib/libspl/include/zone.h +++ b/lib/libspl/include/zone.h @@ -33,7 +33,17 @@ extern "C" { #endif -#define GLOBAL_ZONEID 0 +#ifdef __FreeBSD__ +#define GLOBAL_ZONEID 0 +#else +/* + * Hardcoded in the kernel's root user namespace. A "better" way to get + * this would be by using ioctl_ns(2), but this would need to be performed + * recursively on NS_GET_PARENT and then NS_GET_USERNS. Also, that's only + * supported since Linux 4.9. + */ +#define GLOBAL_ZONEID 4026531837U +#endif extern zoneid_t getzoneid(void); diff --git a/lib/libspl/os/linux/zone.c b/lib/libspl/os/linux/zone.c index a71c4e0b275b..c121340487cf 100644 --- a/lib/libspl/os/linux/zone.c +++ b/lib/libspl/os/linux/zone.c @@ -23,10 +23,40 @@ * Use is subject to license terms. */ +#include +#include +#include +#include +#include +#include + #include zoneid_t getzoneid() { - return (GLOBAL_ZONEID); + char path[PATH_MAX]; + char buf[128] = { '\0' }; + char *cp; + + int c = snprintf(path, sizeof (path), "/proc/self/ns/user"); + /* This API doesn't have any error checking... 
*/ + if (c < 0) + return (0); + + ssize_t r = readlink(path, buf, sizeof (buf) - 1); + if (r < 0) + return (0); + + cp = strchr(buf, '['); + if (cp == NULL) + return (0); + cp++; + + unsigned long n = strtoul(cp, NULL, 10); + if (n == ULONG_MAX && errno == ERANGE) + return (0); + zoneid_t z = (zoneid_t)n; + + return (z); } diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi index d61416d5b99b..449c2b3b8092 100644 --- a/lib/libuutil/libuutil.abi +++ b/lib/libuutil/libuutil.abi @@ -1137,7 +1137,7 @@ - + diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 86d612f5e326..8cc2fdcae119 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -432,6 +432,7 @@ + @@ -1672,7 +1673,7 @@ - + @@ -5679,6 +5680,12 @@ + + + + + + diff --git a/lib/libzfs/os/linux/libzfs_util_os.c b/lib/libzfs/os/linux/libzfs_util_os.c index 2ac31f1077ca..77c84a40b57c 100644 --- a/lib/libzfs/os/linux/libzfs_util_os.c +++ b/lib/libzfs/os/linux/libzfs_util_os.c @@ -219,3 +219,55 @@ zfs_version_kernel(char *version, int len) return (0); } + +/* + * Add or delete the given filesystem to/from the given user namespace. 
+ */ +int +zfs_userns(zfs_handle_t *zhp, unsigned int nsnum, int attach) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_cmd_t zc = {"\0"}; + char errbuf[1024]; + unsigned long cmd; + int ret; + + if (attach) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot add '%s' to namespace"), + zhp->zfs_name); + } else { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot remove '%s' from namespace"), + zhp->zfs_name); + } + + switch (zhp->zfs_type) { + case ZFS_TYPE_VOLUME: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volumes can not be namespaced")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_SNAPSHOT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshots can not be namespaced")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_BOOKMARK: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "bookmarks can not be namespaced")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_POOL: + case ZFS_TYPE_FILESYSTEM: + fallthrough; + } + assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_objset_type = DMU_OST_ZFS; + zc.zc_zoneid = nsnum; + + cmd = attach ? ZFS_IOC_USERNS_ATTACH : ZFS_IOC_USERNS_DETACH; + if ((ret = zfs_ioctl(hdl, cmd, &zc)) != 0) + zfs_standard_error(hdl, errno, errbuf); + + return (ret); +} diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index b3ae682efcdf..c9ee4c31899e 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -956,7 +956,7 @@ - + diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index fcf086f0c271..2abad0fbe29d 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -1874,8 +1874,7 @@ feature and are not relevant on other platforms. The default value is .Sy off . .It Sy zoned Ns = Ns Sy on Ns | Ns Sy off -Controls whether the dataset is managed from a non-global zone. 
-Zones are a Solaris feature and are not relevant on other platforms. +Controls whether the dataset is managed from a non-global zone or namespace. The default value is .Sy off . .El diff --git a/man/man8/zfs-zone.8 b/man/man8/zfs-zone.8 new file mode 100644 index 000000000000..712115d86f77 --- /dev/null +++ b/man/man8/zfs-zone.8 @@ -0,0 +1,103 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2011, Pawel Jakub Dawidek +.\" Copyright (c) 2012, Glen Barber +.\" Copyright (c) 2012, Bryan Drewery +.\" Copyright (c) 2013, Steven Hartland +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright (c) 2014, Xin LI +.\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. +.\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. +.\" Copyright 2019 Richard Laager. 
All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" Copyright 2021 Klara, Inc. +.\" +.Dd July 29, 2021 +.Dt ZFS-ZONE 8 +.Os +. +.Sh NAME +.Nm zfs-zone , +.Nm zfs-unzone +.Nd attach and detach ZFS filesystems to user namespaces +.Sh SYNOPSIS +.Nm zfs Cm zone +.Ar usernsid +.Ar filesystem +.Nm zfs Cm unzone +.Ar usernsid +.Ar filesystem +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm zone +.Ar usernsid +.Ar filesystem +.Xc +Attach the specified +.Ar filesystem +to the user namespace identified by +.Ar usernsid . +From now on this file system tree can be managed from within a user namespace if the +.Sy zoned +property has been set. +.Pp +You cannot attach a zoned dataset's children to another user namespace. +You can also not attach the root file system +of the user namespace or any dataset which needs to be mounted before the zfs service +is run inside the user namespace, as it would be attached unmounted until it is +mounted from the service inside the user namespace. +.Pp +To allow management of the dataset from within a user namespace, the +.Sy zoned +property has to be set and the user namespaces needs access to the +.Pa /dev/zfs +device. +The +.Sy quota +property cannot be changed from within a user namespace. +.Pp +After a dataset is attached to a user namespace and the +.Sy zoned +property is set, a zoned file system cannot be mounted outside the user namespace, +since the user namespace administrator might have set the mount point to an unacceptable value. +.It Xo +.Nm zfs +.Cm unzone +.Ar usernsid +.Ar filesystem +.Xc +Detaches the specified +.Ar filesystem +from the user namespace identified by +.Ar usernsid . 
+.El +.Sh SEE ALSO +.Xr zfsprops 7 diff --git a/module/os/linux/spl/Makefile.in b/module/os/linux/spl/Makefile.in index b2325f91b4a7..6c93f4ac807d 100644 --- a/module/os/linux/spl/Makefile.in +++ b/module/os/linux/spl/Makefile.in @@ -15,3 +15,4 @@ $(MODULE)-objs += ../os/linux/spl/spl-tsd.o $(MODULE)-objs += ../os/linux/spl/spl-vmem.o $(MODULE)-objs += ../os/linux/spl/spl-xdr.o $(MODULE)-objs += ../os/linux/spl/spl-zlib.o +$(MODULE)-objs += ../os/linux/spl/spl-zone.o diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c index 91eeaccfdc47..691652c53189 100644 --- a/module/os/linux/spl/spl-generic.c +++ b/module/os/linux/spl/spl-generic.c @@ -803,8 +803,13 @@ spl_init(void) if ((rc = spl_zlib_init())) goto out7; + if ((rc = spl_zone_init())) + goto out8; + return (rc); +out8: + spl_zlib_fini(); out7: spl_kstat_fini(); out6: @@ -824,6 +829,7 @@ spl_init(void) static void __exit spl_fini(void) { + spl_zone_fini(); spl_zlib_fini(); spl_kstat_fini(); spl_proc_fini(); diff --git a/module/os/linux/spl/spl-zone.c b/module/os/linux/spl/spl-zone.c new file mode 100644 index 000000000000..c73d14589e3d --- /dev/null +++ b/module/os/linux/spl/spl-zone.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2021 Klara Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +static kmutex_t zone_datasets_lock; +static struct list_head zone_datasets; + +typedef struct zone_datasets { + struct list_head zds_list; /* zone_datasets linkage */ + unsigned int zds_nsnum; /* namespace identifier */ + struct list_head zds_datasets; /* datasets for the namespace */ +} zone_datasets_t; + +typedef struct zone_dataset { + struct list_head zd_list; /* zone_dataset linkage */ + size_t zd_dsnamelen; /* length of name */ + char zd_dsname[0]; /* name of the member dataset */ +} zone_dataset_t; + +static struct zone_datasets * +zone_datasets_lookup(unsigned int nsnum) +{ + zone_datasets_t *zds; + + list_for_each_entry(zds, &zone_datasets, zds_list) { + if (zds->zds_nsnum == nsnum) + return (zds); + } + return (NULL); +} + +static struct zone_dataset * +zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen) +{ + zone_dataset_t *zd; + + list_for_each_entry(zd, &zds->zds_datasets, zd_list) { + if (zd->zd_dsnamelen != dsnamelen) + continue; + if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0) + return (zd); + } + + return (NULL); +} + +static int +zone_dataset_cred_check(cred_t *cred) +{ + + if (!uid_eq(cred->uid, 
GLOBAL_ROOT_UID)) + return (EPERM); + + return (0); +} + +static int +zone_dataset_name_check(const char *dataset, size_t *dsnamelen) +{ + + if (dataset[0] == '\0' || dataset[0] == '/') + return (ENOENT); + + *dsnamelen = strlen(dataset); + /* Ignore trailing slash, if supplied. */ + if (dataset[*dsnamelen - 1] == '/') + (*dsnamelen)--; + + return (0); +} + +int +zone_dataset_attach(cred_t *cred, const char *dataset, unsigned int nsnum) +{ +#if defined(CONFIG_USER_NS) + zone_datasets_t *zds; + zone_dataset_t *zd; + int error; + size_t dsnamelen; + + if ((error = zone_dataset_cred_check(cred)) != 0) + return (error); + if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) + return (error); + + mutex_enter(&zone_datasets_lock); + zds = zone_datasets_lookup(nsnum); + if (zds == NULL) { + zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP); + INIT_LIST_HEAD(&zds->zds_list); + INIT_LIST_HEAD(&zds->zds_datasets); + zds->zds_nsnum = nsnum; + list_add_tail(&zds->zds_list, &zone_datasets); + } else { + zd = zone_dataset_lookup(zds, dataset, dsnamelen); + if (zd != NULL) { + error = EEXIST; + goto done; + } + } + + zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP); + zd->zd_dsnamelen = dsnamelen; + strncpy(zd->zd_dsname, dataset, dsnamelen); + zd->zd_dsname[dsnamelen] = '\0'; + INIT_LIST_HEAD(&zd->zd_list); + list_add_tail(&zd->zd_list, &zds->zds_datasets); + error = 0; + +done: + mutex_exit(&zone_datasets_lock); + return (error); +#else + return (ENXIO); +#endif +} +EXPORT_SYMBOL(zone_dataset_attach); + +int +zone_dataset_detach(cred_t *cred, const char *dataset, unsigned int nsnum) +{ +#if defined(CONFIG_USER_NS) + zone_datasets_t *zds; + zone_dataset_t *zd; + int error; + size_t dsnamelen; + + if ((error = zone_dataset_cred_check(cred)) != 0) + return (error); + if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) + return (error); + + mutex_enter(&zone_datasets_lock); + zds = zone_datasets_lookup(nsnum); + if (zds != NULL) + zd = 
zone_dataset_lookup(zds, dataset, dsnamelen);
+	if (zds == NULL || zd == NULL) {
+		error = ENOENT;
+		goto done;
+	}
+
+	list_del(&zd->zd_list);
+	kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
+
+	/* Prune the namespace entry if it has no more delegations. */
+	if (list_empty(&zds->zds_datasets)) {
+		list_del(&zds->zds_list);
+		kmem_free(zds, sizeof (*zds));
+	}
+	error = 0;
+
+done:
+	mutex_exit(&zone_datasets_lock);
+	return (error);
+#else
+	return (ENXIO);
+#endif
+}
+EXPORT_SYMBOL(zone_dataset_detach);
+
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+	zone_datasets_t *zds;
+	zone_dataset_t *zd;
+	size_t dsnamelen, zd_len;
+
+	/* Default to read-only, in case visible is returned. */
+	if (write != NULL)
+		*write = 0;
+	if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
+		return (0);
+	if (INGLOBALZONE(curproc)) {
+		if (write != NULL)
+			*write = 1;
+		return (1);
+	}
+
+	mutex_enter(&zone_datasets_lock);
+	int visible = 0;
+	zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
+	if (zds == NULL)
+		goto done;
+
+	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
+		zd_len = strlen(zd->zd_dsname);
+		/*
+		 * Parents of an entry are visible but read-only (a path back
+		 * to the pool root); an entry and its descendants are
+		 * writable.  A prefix must end at '/', '@', '#' or the final
+		 * NUL (strchr() matches that too).  A read-only match keeps
+		 * scanning, since a later entry may still grant write access.
+		 */
+		if (zd_len > dsnamelen) {
+			if (bcmp(zd->zd_dsname, dataset, dsnamelen) == 0 &&
+			    zd->zd_dsname[dsnamelen] == '/')
+				visible = 1;
+		} else if (bcmp(zd->zd_dsname, dataset, zd_len) == 0 &&
+		    strchr("/@#", dataset[zd_len]) != NULL) {
+			visible = 1;
+			if (write != NULL)
+				*write = 1;
+			break;
+		}
+	}
+
+done:
+	mutex_exit(&zone_datasets_lock);
+	return (visible);
+}
+EXPORT_SYMBOL(zone_dataset_visible);
+
+#if defined(CONFIG_USER_NS)
+static unsigned int
+user_ns_zoneid(struct user_namespace *user_ns)
+{
+	unsigned int r;
+
+#ifdef HAVE_USER_NS_COMMON_INUM
+	r = user_ns->ns.inum;
+#else
+	r = user_ns->proc_inum;
+#endif
+
+	return (r);
+}
+#endif
+
+unsigned int
+global_zoneid(void)
+{
+	unsigned int z = 0;
+
+#if defined(CONFIG_USER_NS)
+	z = user_ns_zoneid(&init_user_ns);
+#endif
+
+	return (z);
+}
+EXPORT_SYMBOL(global_zoneid);
+
+unsigned int
+crgetzoneid(const cred_t *cr)
+{
+	unsigned int r = 0;
+
+#if defined(CONFIG_USER_NS)
+	r = user_ns_zoneid(cr->user_ns);
+#endif
+
+	return (r);
+}
+EXPORT_SYMBOL(crgetzoneid);
+
+boolean_t
+inglobalzone(proc_t *proc)
+{
+#if defined(CONFIG_USER_NS)
+	return (proc->cred->user_ns == &init_user_ns);
+#else
+	return (B_TRUE);
+#endif
+}
+EXPORT_SYMBOL(inglobalzone);
+
+int
+spl_zone_init(void)
+{
+	mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
+	INIT_LIST_HEAD(&zone_datasets);
+	return (0);
+}
+
+void
+spl_zone_fini(void)
+{
+	zone_datasets_t *zds;
+	zone_dataset_t *zd;
+
+	/*
+	 * It would be better to assert an empty zone_datasets, but since
+	 * there's no automatic mechanism for cleaning them up if the user
+	 * namespace is destroyed, just do it here, since spl is about to go
+	 * out of context.
+ */ + while (!list_empty(&zone_datasets)) { + zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list); + while (!list_empty(&zds->zds_datasets)) { + zd = list_entry(zds->zds_datasets.next, + zone_dataset_t, zd_list); + list_del(&zd->zd_list); + kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); + } + list_del(&zds->zds_list); + kmem_free(zds, sizeof (*zds)); + } + mutex_destroy(&zone_datasets_lock); +} diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c index bbccb2e572d9..993051ec284f 100644 --- a/module/os/linux/zfs/policy.c +++ b/module/os/linux/zfs/policy.c @@ -61,7 +61,7 @@ priv_policy_ns(const cred_t *cr, int capability, int err, static int priv_policy(const cred_t *cr, int capability, int err) { - return (priv_policy_ns(cr, capability, err, NULL)); + return (priv_policy_ns(cr, capability, err, cr->user_ns)); } static int diff --git a/module/os/linux/zfs/zfs_ioctl_os.c b/module/os/linux/zfs/zfs_ioctl_os.c index fee3fe540b90..a193c4d31dd0 100644 --- a/module/os/linux/zfs/zfs_ioctl_os.c +++ b/module/os/linux/zfs/zfs_ioctl_os.c @@ -148,6 +148,28 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) } +static int +zfs_ioc_userns_attach(zfs_cmd_t *zc) +{ + + if (zc == NULL) + return (SET_ERROR(EINVAL)); + + return (zone_dataset_attach(CRED(), zc->zc_name, + (unsigned int)zc->zc_zoneid)); +} + +static int +zfs_ioc_userns_detach(zfs_cmd_t *zc) +{ + + if (zc == NULL) + return (SET_ERROR(EINVAL)); + + return (zone_dataset_detach(CRED(), zc->zc_name, + (unsigned int)zc->zc_zoneid)); +} + uint64_t zfs_max_nvlist_src_size_os(void) { @@ -166,6 +188,10 @@ zfs_ioctl_update_mount_cache(const char *dsname) void zfs_ioctl_init_os(void) { + zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERNS_ATTACH, + zfs_ioc_userns_attach, zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERNS_DETACH, + zfs_ioc_userns_detach, zfs_secpolicy_config, POOL_CHECK_NONE); } #ifdef CONFIG_COMPAT diff --git 
a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index ff0b0d9df8f0..a8159bbefbfa 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -1448,14 +1448,31 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
 	int error = 0;
 	zfsvfs_t *zfsvfs = NULL;
 	vfs_t *vfs = NULL;
+	int canwrite = 1;	/* global zone skips the check below */
 
 	ASSERT(zm);
 	ASSERT(osname);
 
+	/*
+	 * Refuse to mount a filesystem if we are in a namespace and the
+	 * dataset is not visible or writable in that namespace.
+	 */
+	if (!INGLOBALZONE(curproc) &&
+	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+		return (SET_ERROR(EPERM));
+	}
+
 	error = zfsvfs_parse_options(zm->mnt_data, &vfs);
 	if (error)
 		return (error);
 
+	/*
+	 * If a non-writable filesystem is being mounted without the
+	 * read-only flag, pretend it was set, as done for snapshots.
+	 */
+	if (!canwrite)
+		vfs->vfs_readonly = true;
+
 	error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
 	if (error) {
 		zfsvfs_vfs_free(vfs);
diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
index c2fd3fee1401..b18efde9b18a 100644
--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@@ -360,6 +360,7 @@ const struct super_operations zpl_super_operations = {
 struct file_system_type zpl_fs_type = {
 	.owner = THIS_MODULE,
 	.name = ZFS_DRIVER,
+	.fs_flags = FS_USERNS_MOUNT,
 	.mount = zpl_mount,
 	.kill_sb = zpl_kill_sb,
 };
diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh
index ac28788582f9..4c46208edf08 100755
--- a/scripts/zfs-tests.sh
+++ b/scripts/zfs-tests.sh
@@ -296,7 +296,6 @@ constrain_path() {
 		ln -fs "$STF_PATH/gzip" "$STF_PATH/compress"
 		ln -fs "$STF_PATH/gunzip" "$STF_PATH/uncompress"
 		ln -fs "$STF_PATH/exportfs" "$STF_PATH/share"
-		ln -fs "$STF_PATH/exportfs" "$STF_PATH/unshare"
 	elif [ "$UNAME" = "FreeBSD" ] ; then
 		ln -fs /usr/local/bin/ksh93 "$STF_PATH/ksh"
 	fi
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 01e1f79e5852..8a3ca0dd5b62 100644
--- 
a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -165,7 +165,7 @@ tests = ['upgrade_projectquota_001_pos'] tags = ['functional', 'upgrade'] [tests/functional/user_namespace:Linux] -tests = ['user_namespace_001'] +tests = ['user_namespace_001', 'user_namespace_002'] tags = ['functional', 'user_namespace'] [tests/functional/userquota:Linux] diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 1ec73f25bae7..ec71c37dd763 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -164,12 +164,14 @@ export SYSTEM_FILES_LINUX='attr modprobe mpstat nproc + nsenter parted perf setenforce setfattr sha256sum udevadm + unshare useradd userdel usermod' diff --git a/tests/zfs-tests/tests/functional/user_namespace/Makefile.am b/tests/zfs-tests/tests/functional/user_namespace/Makefile.am index 5f95dbf8d967..d964e416c30d 100644 --- a/tests/zfs-tests/tests/functional/user_namespace/Makefile.am +++ b/tests/zfs-tests/tests/functional/user_namespace/Makefile.am @@ -2,7 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/user_namespace dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - user_namespace_001.ksh + user_namespace_001.ksh \ + user_namespace_002.ksh dist_pkgdata_DATA = \ user_namespace_common.kshlib \ diff --git a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_002.ksh b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_002.ksh new file mode 100755 index 000000000000..de72e2b52677 --- /dev/null +++ b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_002.ksh @@ -0,0 +1,109 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/tests/functional/user_namespace/user_namespace_common.kshlib
+
+#
+# DESCRIPTION:
+#	Regression test for delegation of datasets to user namespaces.
+#
+# STRATEGY:
+#	1. Delegate a dataset to a user namespace.
+#	2. Check that 'zfs list' is only able to see inside the delegation.
+#	3. Check that 'zfs create' is able to create only inside the delegation.
+#	4. Check that the filesystems can be mounted inside the delegation,
+#	   and that file permissions are appropriate.
+#	5. Check that 'zfs destroy' is able to destroy only inside the delegation.
+#	6. Check that 'zfs unzone' has a desirable effect.
+#
+
+verify_runnable "both"
+
+user_ns_cleanup() {
+	if [ -n "$ns_added" ]; then
+		log_must zfs unzone $ns_added $TESTPOOL/userns
+	fi
+	if [ -n "$unshared_pid" ]; then
+		log_must kill -9 $unshared_pid
+		# Give it a sec to make the global cleanup more reliable.
+		sleep 1
+	fi
+}
+
+log_onexit user_ns_cleanup
+
+log_assert "Check zfs/zpool command delegation in user namespaces"
+
+# Create the baseline datasets.
+log_must zfs create -o zoned=on $TESTPOOL/userns
+log_must zfs create -o zoned=on $TESTPOOL/userns/testds
+# Partial match should be denied; hence we also set this to be 'zoned'.
+log_must zfs create -o zoned=on $TESTPOOL/user
+
+# 1. Create a user namespace with a cloned mount namespace, then delegate.
+unshare -Urm /usr/bin/sleep 1h &
+unshared_pid=$!
+if [ "$?" -ne "0" ]; then
+	log_unsupported "Failed to create user namespace"
+fi
+proc_ns=/proc/$unshared_pid/ns/user
+sleep 2 # Wait for unshare to acquire user namespace
+ns=$(readlink $proc_ns | sed -E -e 's,.*\[(.*)\],\1,g')
+log_note "unshare: child=${unshared_pid} ns=${ns}"
+if [ -z "$ns" -o "$ns" = "4026531837" ]; then
+	log_unsupported "Failed to detect user namespace"
+fi
+
+NSENTER="nsenter -t $unshared_pid --all"
+
+# 1b. Pre-test by checking that 'zone' does something new.
+list="$($NSENTER zfs list -r -H -o name | tr '\n' ' ')"
+log_must test -z "$list"
+log_must zfs zone $ns $TESTPOOL/userns
+ns_added="$ns"
+
+# 2. 'zfs list'
+list="$($NSENTER zfs list -r -H -o name $TESTPOOL | tr '\n' ' ')"
+log_must test "$list" = "$TESTPOOL $TESTPOOL/userns $TESTPOOL/userns/testds "
+
+# 3. 'zfs create'
+log_must $NSENTER zfs create $TESTPOOL/userns/created
+log_mustnot $NSENTER zfs create $TESTPOOL/user/created
+
+# 4. Check file permissions (create mounts the filesystem).  The 'permissions'
+# check is simply, does it get mapped to user namespace's root/root?
+log_must $NSENTER df -h /$TESTPOOL/userns/created
+log_must $NSENTER mkfile 8192 /$TESTPOOL/userns/created/testfile
+uidgid=$($NSENTER stat -c '%u %g' /$TESTPOOL/userns/created/testfile)
+log_must test "${uidgid}" = "0 0"
+
+# 5. 'zfs destroy'
+log_must $NSENTER zfs destroy $TESTPOOL/userns/created
+log_mustnot $NSENTER zfs destroy $TESTPOOL/user
+
+# 6. 'zfs unzone' should have an effect
+log_must zfs unzone $ns $TESTPOOL/userns
+ns_added=""
+list="$($NSENTER zfs list -r -H -o name | tr '\n' ' ')"
+log_must test -z "$list"
+
+log_pass "Check zfs/zpool command delegation in user namespaces"