1,612 changes: 1,612 additions & 0 deletions lib/libzfs/libzfs_crypto.c

Large diffs are not rendered by default.

136 changes: 125 additions & 11 deletions lib/libzfs/libzfs_dataset.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zap.h>
#include <sys/dsl_crypt.h>
#include <libzfs.h>

#include "zfs_namecheck.h"
Expand Down Expand Up @@ -965,7 +966,7 @@ zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop)
nvlist_t *
zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl,
const char *errbuf)
boolean_t key_params_ok, const char *errbuf)
{
nvpair_t *elem;
uint64_t intval;
Expand Down Expand Up @@ -1124,7 +1125,8 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
}

if (zfs_prop_readonly(prop) &&
(!zfs_prop_setonce(prop) || zhp != NULL)) {
!(zfs_prop_setonce(prop) && zhp == NULL) &&
!(zfs_prop_encryption_key_param(prop) && key_params_ok)) {
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "'%s' is readonly"),
propname);
Expand Down Expand Up @@ -1390,6 +1392,48 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,

break;

case ZFS_PROP_KEYLOCATION:
	/* Reject values that are not a valid keylocation at all. */
	if (!zfs_prop_valid_keylocation(strval, B_FALSE)) {
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "invalid keylocation"));
		(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
		goto error;
	}

	/*
	 * When an existing dataset handle is supplied, the new value
	 * must agree with that dataset's encryption setting: "none"
	 * is the only acceptable value when encryption is off, and
	 * "none" is never acceptable when encryption is on.
	 */
	if (zhp != NULL) {
		uint64_t crypt =
		    zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION);

		if (crypt == ZIO_CRYPT_OFF &&
		    strcmp(strval, "none") != 0) {
			/* unencrypted dataset, non-"none" location */
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "keylocation must be 'none' "
			    "for unencrypted datasets"));
			(void) zfs_error(hdl, EZFS_BADPROP,
			    errbuf);
			goto error;
		} else if (crypt != ZIO_CRYPT_OFF &&
		    strcmp(strval, "none") == 0) {
			/* encrypted dataset, location set to "none" */
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "keylocation must not be 'none' "
			    "for encrypted datasets"));
			(void) zfs_error(hdl, EZFS_BADPROP,
			    errbuf);
			goto error;
		}
	}
	break;

case ZFS_PROP_PBKDF2_ITERS:
if (intval < MIN_PBKDF2_ITERATIONS) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"minimum pbkdf2 iterations is %u"),
MIN_PBKDF2_ITERATIONS);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
break;

case ZFS_PROP_UTF8ONLY:
chosen_utf = (int)intval;
break;
Expand Down Expand Up @@ -1453,6 +1497,27 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
break;
}
}

/* check encryption properties */
if (zhp != NULL) {
int64_t crypt = zfs_prop_get_int(zhp,
ZFS_PROP_ENCRYPTION);

switch (prop) {
case ZFS_PROP_COPIES:
if (crypt != ZIO_CRYPT_OFF && intval > 2) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"encrypted datasets cannot have "
"3 copies"));
(void) zfs_error(hdl, EZFS_BADPROP,
errbuf);
goto error;
}
break;
default:
break;
}
}
}

/*
Expand Down Expand Up @@ -1609,6 +1674,16 @@ zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
}
break;

case EACCES:
if (prop == ZFS_PROP_KEYLOCATION) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"keylocation may only be set on encryption roots"));
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
} else {
(void) zfs_standard_error(hdl, err, errbuf);
}
break;

case EOVERFLOW:
/*
* This platform can't address a volume this big.
Expand Down Expand Up @@ -1700,7 +1775,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)

if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props,
zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl,
errbuf)) == NULL)
B_FALSE, errbuf)) == NULL)
goto error;

/*
Expand Down Expand Up @@ -3155,6 +3230,12 @@ parent_name(const char *path, char *buf, size_t buflen)
return (0);
}

/*
 * Public wrapper around parent_name(): passes the dataset name reported by
 * zfs_get_name() for zhp, together with the caller's buffer and its size,
 * to parent_name() and returns that function's result unchanged.
 */
int
zfs_parent_name(zfs_handle_t *zhp, char *buf, size_t buflen)
{
	const char *dsname = zfs_get_name(zhp);

	return (parent_name(dsname, buf, buflen));
}

/*
* If accept_ancestor is false, then check to make sure that the given path has
* a parent, and that it exists. If accept_ancestor is true, then find the
Expand Down Expand Up @@ -3373,10 +3454,13 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
int ret;
uint64_t size = 0;
uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
char errbuf[1024];
uint64_t zoned;
enum lzc_dataset_type ost;
zpool_handle_t *zpool_handle;
uint8_t *wkeydata = NULL;
uint_t wkeylen = 0;
char errbuf[1024];
char parent[ZFS_MAX_DATASET_NAME_LEN];

(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot create '%s'"), path);
Expand Down Expand Up @@ -3420,7 +3504,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
return (-1);

if (props && (props = zfs_valid_proplist(hdl, type, props,
zoned, NULL, zpool_handle, errbuf)) == 0) {
zoned, NULL, zpool_handle, B_TRUE, errbuf)) == 0) {
zpool_close(zpool_handle);
return (-1);
}
Expand Down Expand Up @@ -3472,15 +3556,21 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
}
}

(void) parent_name(path, parent, sizeof (parent));
if (zfs_crypto_create(hdl, parent, props, NULL, &wkeydata,
&wkeylen) != 0) {
nvlist_free(props);
return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf));
}

/* create the dataset */
ret = lzc_create(path, ost, props);
ret = lzc_create(path, ost, props, wkeydata, wkeylen);
nvlist_free(props);
if (wkeydata != NULL)
free(wkeydata);

/* check for failure */
if (ret != 0) {
char parent[ZFS_MAX_DATASET_NAME_LEN];
(void) parent_name(path, parent, sizeof (parent));

switch (errno) {
case ENOENT:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
Expand All @@ -3497,6 +3587,13 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
"pool must be upgraded to set this "
"property or value"));
return (zfs_error(hdl, EZFS_BADVERSION, errbuf));

case EACCES:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"encryption root's key is not loaded "
"or provided"));
return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf));

#ifdef _ILP32
case EOVERFLOW:
/*
Expand Down Expand Up @@ -3691,10 +3788,15 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
type = ZFS_TYPE_FILESYSTEM;
}
if ((props = zfs_valid_proplist(hdl, type, props, zoned,
zhp, zhp->zpool_hdl, errbuf)) == NULL)
zhp, zhp->zpool_hdl, B_TRUE, errbuf)) == NULL)
return (-1);
}

if (zfs_crypto_clone_check(hdl, zhp, parent, props) != 0) {
nvlist_free(props);
return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf));
}

ret = lzc_clone(target, zhp->zfs_name, props);
nvlist_free(props);

Expand Down Expand Up @@ -3847,7 +3949,7 @@ zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props)

if (props != NULL &&
(props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT,
props, B_FALSE, NULL, zpool_hdl, errbuf)) == NULL) {
props, B_FALSE, NULL, zpool_hdl, B_FALSE, errbuf)) == NULL) {
zpool_close(zpool_hdl);
return (-1);
}
Expand Down Expand Up @@ -4223,6 +4325,18 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive,
"a child dataset already has a snapshot "
"with the new name"));
(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
} else if (errno == EACCES) {
/*
 * Pick the explanation for the EACCES failure based on whether the
 * dataset being renamed has encryption turned off.
 */
if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) ==
    ZIO_CRYPT_OFF) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	    "cannot rename an unencrypted dataset to "
	    "be a descendant of an encrypted one"));
} else {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	    "cannot move encryption child outside of "
	    "its encryption root"));
}
(void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf);
} else {
(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
}
Expand Down
5 changes: 5 additions & 0 deletions lib/libzfs/libzfs_diff.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj,
"The sys_config privilege or diff delegated permission "
"is needed\nto discover path names"));
return (-1);
} else if (di->zerr == EACCES) {
(void) snprintf(di->errbuf, sizeof (di->errbuf),
dgettext(TEXT_DOMAIN,
"Key must be loaded to discover path names"));
return (-1);
} else {
(void) snprintf(di->errbuf, sizeof (di->errbuf),
dgettext(TEXT_DOMAIN,
Expand Down
49 changes: 49 additions & 0 deletions lib/libzfs/libzfs_mount.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/vfs.h>
#include <sys/dsl_crypt.h>

#include <libzfs.h>

Expand Down Expand Up @@ -465,6 +466,7 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
char mntopts[MNT_LINE_MAX];
char overlay[ZFS_MAXPROPLEN];
libzfs_handle_t *hdl = zhp->zfs_hdl;
uint64_t keystatus;
int remount = 0, rc;

if (options == NULL) {
Expand Down Expand Up @@ -501,6 +503,39 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
mountpoint));
}

/*
* If the filesystem is encrypted the key must be loaded in order to
* mount. If the key isn't loaded, the MS_CRYPT flag decides whether
* or not we attempt to load the keys. Note: we must call
* zfs_refresh_properties() here since some callers of this function
* (most notably zpool_enable_datasets()) may implicitly load our key
* by loading the parent's key first.
*/
if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) {
zfs_refresh_properties(zhp);
keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS);

/*
* If the key is unavailable and MS_CRYPT is set give the
* user a chance to enter the key. Otherwise just fail
* immediately.
*/
if (keystatus == ZFS_KEYSTATUS_UNAVAILABLE) {
if (flags & MS_CRYPT) {
rc = zfs_crypto_load_key(zhp, B_FALSE, NULL);
if (rc)
return (rc);
} else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"encryption key not loaded"));
return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
dgettext(TEXT_DOMAIN, "cannot mount '%s'"),
mountpoint));
}
}

}

/*
* Append zfsutil option so the mount helper allow the mount
*/
Expand Down Expand Up @@ -1136,6 +1171,12 @@ mount_cb(zfs_handle_t *zhp, void *data)
return (0);
}

if (zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) ==
ZFS_KEYSTATUS_UNAVAILABLE) {
zfs_close(zhp);
return (0);
}

/*
* If this filesystem is inconsistent and has a receive resume
* token, we can not mount it.
Expand Down Expand Up @@ -1225,6 +1266,14 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)

ret = 0;
for (i = 0; i < cb.cb_used; i++) {
/*
* don't attempt to mount encrypted datasets with
* unloaded keys
*/
if (zfs_prop_get_int(cb.cb_handles[i], ZFS_PROP_KEYSTATUS) ==
ZFS_KEYSTATUS_UNAVAILABLE)
continue;

if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
ret = -1;
else
Expand Down
28 changes: 27 additions & 1 deletion lib/libzfs/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,9 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
zfs_cmd_t zc = {"\0"};
nvlist_t *zc_fsprops = NULL;
nvlist_t *zc_props = NULL;
nvlist_t *hidden_args = NULL;
uint8_t *wkeydata = NULL;
uint_t wkeylen = 0;
char msg[1024];
int ret = -1;

Expand Down Expand Up @@ -1190,17 +1193,34 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
strcmp(zonestr, "on") == 0);

if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM,
fsprops, zoned, NULL, NULL, msg)) == NULL) {
fsprops, zoned, NULL, NULL, B_TRUE, msg)) == NULL) {
goto create_failed;
}
if (!zc_props &&
(nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
goto create_failed;
}
if (zfs_crypto_create(hdl, NULL, zc_fsprops, props,
&wkeydata, &wkeylen) != 0) {
zfs_error(hdl, EZFS_CRYPTOFAILED, msg);
goto create_failed;
}
if (nvlist_add_nvlist(zc_props,
ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) {
goto create_failed;
}
if (wkeydata != NULL) {
if (nvlist_alloc(&hidden_args, NV_UNIQUE_NAME, 0) != 0)
goto create_failed;

if (nvlist_add_uint8_array(hidden_args, "wkeydata",
wkeydata, wkeylen) != 0)
goto create_failed;

if (nvlist_add_nvlist(zc_props, ZPOOL_HIDDEN_ARGS,
hidden_args) != 0)
goto create_failed;
}
}

if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
Expand All @@ -1213,6 +1233,9 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
zcmd_free_nvlists(&zc);
nvlist_free(zc_props);
nvlist_free(zc_fsprops);
nvlist_free(hidden_args);
if (wkeydata != NULL)
free(wkeydata);

switch (errno) {
case EBUSY:
Expand Down Expand Up @@ -1282,6 +1305,9 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
zcmd_free_nvlists(&zc);
nvlist_free(zc_props);
nvlist_free(zc_fsprops);
nvlist_free(hidden_args);
if (wkeydata != NULL)
free(wkeydata);
return (ret);
}

Expand Down
499 changes: 455 additions & 44 deletions lib/libzfs/libzfs_sendrecv.c

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions lib/libzfs/libzfs_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_ACTIVE_POOL:
return (dgettext(TEXT_DOMAIN, "pool is imported on a "
"different host"));
case EZFS_CRYPTOFAILED:
return (dgettext(TEXT_DOMAIN, "encryption failure"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
Expand Down
138 changes: 110 additions & 28 deletions lib/libzfs_core/libzfs_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,34 +175,49 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name,
}

out:
fnvlist_pack_free(packed, size);
if (packed != NULL)
fnvlist_pack_free(packed, size);
free((void *)(uintptr_t)zc.zc_nvlist_dst);
return (error);
}

int
lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props)
lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props,
uint8_t *wkeydata, uint_t wkeylen)
{
int error;
nvlist_t *hidden_args = NULL;
nvlist_t *args = fnvlist_alloc();

fnvlist_add_int32(args, "type", (dmu_objset_type_t)type);
if (props != NULL)
fnvlist_add_nvlist(args, "props", props);

if (wkeydata != NULL) {
hidden_args = fnvlist_alloc();
fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata,
wkeylen);
fnvlist_add_nvlist(args, ZPOOL_HIDDEN_ARGS, hidden_args);
}

error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL);
nvlist_free(hidden_args);
nvlist_free(args);
return (error);
}

int
lzc_clone(const char *fsname, const char *origin,
nvlist_t *props)
lzc_clone(const char *fsname, const char *origin, nvlist_t *props)
{
int error;
nvlist_t *hidden_args = NULL;
nvlist_t *args = fnvlist_alloc();

fnvlist_add_string(args, "origin", origin);
if (props != NULL)
fnvlist_add_nvlist(args, "props", props);
error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL);
nvlist_free(hidden_args);
nvlist_free(args);
return (error);
}
Expand Down Expand Up @@ -532,6 +547,8 @@ lzc_send_resume(const char *snapname, const char *from, int fd,
fnvlist_add_boolean(args, "embedok");
if (flags & LZC_SEND_FLAG_COMPRESS)
fnvlist_add_boolean(args, "compressok");
if (flags & LZC_SEND_FLAG_RAW)
fnvlist_add_boolean(args, "rawok");
if (resumeobj != 0 || resumeoff != 0) {
fnvlist_add_uint64(args, "resume_object", resumeobj);
fnvlist_add_uint64(args, "resume_offset", resumeoff);
Expand Down Expand Up @@ -601,17 +618,17 @@ recv_read(int fd, void *buf, int ilen)
}

/*
* Linux adds ZFS_IOC_RECV_NEW for resumable streams and preserves the legacy
* ZFS_IOC_RECV user/kernel interface. The new interface supports all stream
* options but is currently only used for resumable streams. This way updated
* user space utilities will interoperate with older kernel modules.
* Linux adds ZFS_IOC_RECV_NEW for resumable and raw streams and preserves the
* legacy ZFS_IOC_RECV user/kernel interface. The new interface supports all
* stream options but is currently only used for resumable and raw streams.
* This way updated user space utilities will interoperate with older kernel
* modules.
*
* Non-Linux OpenZFS platforms have opted to modify the legacy interface.
*/
static int
recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops,
const char *origin, boolean_t force, boolean_t resumable, int input_fd,
const dmu_replay_record_t *begin_record, int cleanup_fd,
const char *origin, boolean_t force, boolean_t resumable, boolean_t raw,
int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd,
uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle,
nvlist_t **errors)
{
Expand Down Expand Up @@ -651,7 +668,7 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops,
drr = *begin_record;
}

if (resumable) {
if (resumable || raw) {
nvlist_t *outnvl = NULL;
nvlist_t *innvl = fnvlist_alloc();

Expand Down Expand Up @@ -792,10 +809,10 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops,
*/
int
lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, int fd)
boolean_t force, boolean_t raw, int fd)
{
return (recv_impl(snapname, props, NULL, origin, force, B_FALSE, fd,
NULL, -1, NULL, NULL, NULL, NULL));
return (recv_impl(snapname, props, NULL, origin, force, B_FALSE, raw,
fd, NULL, -1, NULL, NULL, NULL, NULL));
}

/*
Expand All @@ -806,10 +823,10 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
*/
int
lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, int fd)
boolean_t force, boolean_t raw, int fd)
{
return (recv_impl(snapname, props, NULL, origin, force, B_TRUE, fd,
NULL, -1, NULL, NULL, NULL, NULL));
return (recv_impl(snapname, props, NULL, origin, force, B_TRUE, raw,
fd, NULL, -1, NULL, NULL, NULL, NULL));
}

/*
Expand All @@ -825,13 +842,14 @@ lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
*/
int
lzc_receive_with_header(const char *snapname, nvlist_t *props,
const char *origin, boolean_t force, boolean_t resumable, int fd,
const dmu_replay_record_t *begin_record)
const char *origin, boolean_t force, boolean_t resumable, boolean_t raw,
int fd, const dmu_replay_record_t *begin_record)
{
if (begin_record == NULL)
return (EINVAL);
return (recv_impl(snapname, props, NULL, origin, force, resumable, fd,
begin_record, -1, NULL, NULL, NULL, NULL));

return (recv_impl(snapname, props, NULL, origin, force, resumable, raw,
fd, begin_record, -1, NULL, NULL, NULL, NULL));
}

/*
Expand All @@ -855,13 +873,13 @@ lzc_receive_with_header(const char *snapname, nvlist_t *props,
* property. Callers are responsible for freeing this nvlist.
*/
int lzc_receive_one(const char *snapname, nvlist_t *props,
const char *origin, boolean_t force, boolean_t resumable, int input_fd,
const dmu_replay_record_t *begin_record, int cleanup_fd,
const char *origin, boolean_t force, boolean_t resumable, boolean_t raw,
int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd,
uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle,
nvlist_t **errors)
{
return (recv_impl(snapname, props, NULL, origin, force, resumable,
input_fd, begin_record, cleanup_fd, read_bytes, errflags,
raw, input_fd, begin_record, cleanup_fd, read_bytes, errflags,
action_handle, errors));
}

Expand All @@ -875,12 +893,13 @@ int lzc_receive_one(const char *snapname, nvlist_t *props,
*/
int lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props,
nvlist_t *cmdprops, const char *origin, boolean_t force,
boolean_t resumable, int input_fd, const dmu_replay_record_t *begin_record,
int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags,
uint64_t *action_handle, nvlist_t **errors)
boolean_t resumable, boolean_t raw, int input_fd,
const dmu_replay_record_t *begin_record, int cleanup_fd,
uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle,
nvlist_t **errors)
{
return (recv_impl(snapname, props, cmdprops, origin, force, resumable,
input_fd, begin_record, cleanup_fd, read_bytes, errflags,
raw, input_fd, begin_record, cleanup_fd, read_bytes, errflags,
action_handle, errors));
}

Expand Down Expand Up @@ -1027,3 +1046,66 @@ lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist)

return (error);
}

/*
 * Issue ZFS_IOC_LOAD_KEY for the dataset named by fsname.
 *
 * The wrapping key (wkeydata, wkeylen) is not placed directly among the
 * ioctl arguments; it is stored as "wkeydata" in a nested nvlist attached
 * under ZPOOL_HIDDEN_ARGS so that the key is not logged. When noop is set,
 * a boolean "noop" entry is added to the arguments as well.
 *
 * Returns EINVAL without issuing the ioctl when wkeydata is NULL; otherwise
 * returns the result of lzc_ioctl().
 */
int
lzc_load_key(const char *fsname, boolean_t noop, uint8_t *wkeydata,
    uint_t wkeylen)
{
	nvlist_t *hidden;
	nvlist_t *args;
	int err;

	/* A key is mandatory for this command. */
	if (wkeydata == NULL)
		return (EINVAL);

	hidden = fnvlist_alloc();
	fnvlist_add_uint8_array(hidden, "wkeydata", wkeydata, wkeylen);

	args = fnvlist_alloc();
	fnvlist_add_nvlist(args, ZPOOL_HIDDEN_ARGS, hidden);
	if (noop)
		fnvlist_add_boolean(args, "noop");

	err = lzc_ioctl(ZFS_IOC_LOAD_KEY, fsname, args, NULL);

	nvlist_free(hidden);
	nvlist_free(args);

	return (err);
}

/*
 * Issue ZFS_IOC_UNLOAD_KEY for the dataset named by fsname. No input or
 * output nvlists are passed; the result of lzc_ioctl() is returned as is.
 */
int
lzc_unload_key(const char *fsname)
{
	int err;

	err = lzc_ioctl(ZFS_IOC_UNLOAD_KEY, fsname, NULL, NULL);
	return (err);
}

/*
 * Issue ZFS_IOC_CHANGE_KEY for the dataset named by fsname.
 *
 * crypt_cmd is always sent as "crypt_cmd". If wkeydata is non-NULL, the
 * key bytes are sent as "wkeydata" inside a nested nvlist attached under
 * ZPOOL_HIDDEN_ARGS. If props is non-NULL, it is attached as "props".
 *
 * Returns the result of lzc_ioctl().
 */
int
lzc_change_key(const char *fsname, uint64_t crypt_cmd, nvlist_t *props,
    uint8_t *wkeydata, uint_t wkeylen)
{
	nvlist_t *hidden = NULL;
	nvlist_t *args;
	int err;

	args = fnvlist_alloc();
	fnvlist_add_uint64(args, "crypt_cmd", crypt_cmd);

	/* The wrapping key is optional; attach it only when supplied. */
	if (wkeydata != NULL) {
		hidden = fnvlist_alloc();
		fnvlist_add_uint8_array(hidden, "wkeydata", wkeydata,
		    wkeylen);
		fnvlist_add_nvlist(args, ZPOOL_HIDDEN_ARGS, hidden);
	}

	if (props != NULL)
		fnvlist_add_nvlist(args, "props", props);

	err = lzc_ioctl(ZFS_IOC_CHANGE_KEY, fsname, args, NULL);

	/* hidden is still NULL here when no key was supplied. */
	nvlist_free(hidden);
	nvlist_free(args);
	return (err);
}
2 changes: 2 additions & 0 deletions lib/libzpool/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ KERNEL_C = \
dsl_deadlist.c \
dsl_deleg.c \
dsl_dir.c \
dsl_crypt.c \
dsl_pool.c \
dsl_prop.c \
dsl_scan.c \
Expand Down Expand Up @@ -128,6 +129,7 @@ KERNEL_C = \
zio.c \
zio_checksum.c \
zio_compress.c \
zio_crypt.c \
zio_inject.c \
zle.c \
zrlock.c
Expand Down
21 changes: 21 additions & 0 deletions man/man5/zpool-features.5
Original file line number Diff line number Diff line change
Expand Up @@ -619,5 +619,26 @@ files.

.RE

.sp
.ne 2
.na
\fB\fBencryption\fR\fR
.ad
.RS 4n
.TS
l l .
GUID com.datto:encryption
READ\-ONLY COMPATIBLE no
DEPENDENCIES extensible_dataset
.TE

This feature enables the creation and management of natively encrypted datasets.

This feature becomes \fBactive\fR when an encrypted dataset is created and will
be returned to the \fBenabled\fR state when all datasets that use this feature
are destroyed.

.RE

.SH "SEE ALSO"
\fBzpool\fR(8)
375 changes: 370 additions & 5 deletions man/man8/zfs.8

Large diffs are not rendered by default.

39 changes: 33 additions & 6 deletions man/man8/zpool.8
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,15 @@
.Nm
.Cm import
.Fl a
.Op Fl DfmN
.Op Fl DflmN
.Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc
.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir
.Op Fl o Ar mntopts
.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
.Op Fl R Ar root
.Nm
.Cm import
.Op Fl Dfm
.Op Fl Dflm
.Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc
.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir
.Op Fl o Ar mntopts
Expand Down Expand Up @@ -160,7 +160,7 @@
.Ar pool
.Nm
.Cm split
.Op Fl gLnP
.Op Fl gLlnP
.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
.Op Fl R Ar root
.Ar pool newpool
Expand Down Expand Up @@ -1186,7 +1186,7 @@ Lists destroyed pools only.
.Nm
.Cm import
.Fl a
.Op Fl DfmN
.Op Fl DflmN
.Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc
.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir
.Op Fl o Ar mntopts
Expand Down Expand Up @@ -1237,6 +1237,15 @@ transactions.
Not all damaged pools can be recovered by using this option.
If successful, the data from the discarded transactions is irretrievably lost.
This option is ignored if the pool is importable or already imported.
.It Fl l
Indicates that this command will request encryption keys for all encrypted
datasets it attempts to mount as it is bringing the pool online. Note that if
any datasets have a
.Sy keylocation
of
.Sy prompt
this command will block waiting for the keys to be entered. Without this flag
encrypted datasets will be left unavailable until the keys are loaded.
.It Fl m
Allows a pool to import when there is a missing log device.
Recent transactions can be lost because the log device will be discarded.
Expand Down Expand Up @@ -1298,7 +1307,7 @@ health of your pool and should only be used as a last resort.
.It Xo
.Nm
.Cm import
.Op Fl Dfm
.Op Fl Dflm
.Op Fl F Oo Fl n Oc Oo Fl t Oc Oo Fl T Oc Oo Fl X Oc
.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir
.Op Fl o Ar mntopts
Expand Down Expand Up @@ -1357,6 +1366,15 @@ transactions.
Not all damaged pools can be recovered by using this option.
If successful, the data from the discarded transactions is irretrievably lost.
This option is ignored if the pool is importable or already imported.
.It Fl l
Indicates that this command will request encryption keys for all encrypted
datasets it attempts to mount as it is bringing the pool online. Note that if
any datasets have a
.Sy keylocation
of
.Sy prompt
this command will block waiting for the keys to be entered. Without this flag
encrypted datasets will be left unavailable until the keys are loaded.
.It Fl m
Allows a pool to import when there is a missing log device.
Recent transactions can be lost because the log device will be discarded.
Expand Down Expand Up @@ -1849,7 +1867,7 @@ values.
.It Xo
.Nm
.Cm split
.Op Fl gLnP
.Op Fl gLlnP
.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
.Op Fl R Ar root
.Ar pool newpool
Expand Down Expand Up @@ -1887,6 +1905,15 @@ Display real paths for vdevs resolving all symbolic links. This can
be used to look up the current block device name regardless of the
.Pa /dev/disk/
path used to open it.
.It Fl l
Indicates that this command will request encryption keys for all encrypted
datasets it attempts to mount as it is bringing the new pool online. Note that
if any datasets have a
.Sy keylocation
of
.Sy prompt
this command will block waiting for the keys to be entered. Without this flag
encrypted datasets will be left unavailable until the keys are loaded.
.It Fl n
Do dry run, do not actually perform the split.
Print out the expected configuration of
Expand Down
12 changes: 6 additions & 6 deletions module/icp/algs/sha2/sha2.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
static void Encode(uint8_t *, uint32_t *, size_t);
static void Encode64(uint8_t *, uint64_t *, size_t);

#if defined(__amd64) && defined(_KERNEL)
#if defined(__amd64)
#define SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1)
#define SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1)

Expand All @@ -62,7 +62,7 @@ void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
#else
static void SHA256Transform(SHA2_CTX *, const uint8_t *);
static void SHA512Transform(SHA2_CTX *, const uint8_t *);
#endif /* __amd64 && _KERNEL */
#endif /* __amd64 */

static uint8_t PADDING[128] = { 0x80, /* all zeros */ };

Expand Down Expand Up @@ -142,7 +142,7 @@ static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
#endif /* _BIG_ENDIAN */


#if !defined(__amd64) || !defined(_KERNEL)
#if !defined(__amd64)
/* SHA256 Transform */

static void
Expand Down Expand Up @@ -600,7 +600,7 @@ SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk)
ctx->state.s64[7] += h;

}
#endif /* !__amd64 || !_KERNEL */
#endif /* !__amd64 */


/*
Expand Down Expand Up @@ -838,7 +838,7 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
i = buf_len;
}

#if !defined(__amd64) || !defined(_KERNEL)
#if !defined(__amd64)
if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
for (; i + buf_limit - 1 < input_len; i += buf_limit) {
SHA256Transform(ctx, &input[i]);
Expand Down Expand Up @@ -866,7 +866,7 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
i += block_count << 7;
}
}
#endif /* !__amd64 || !_KERNEL */
#endif /* !__amd64 */

/*
* general optimization:
Expand Down
4 changes: 2 additions & 2 deletions module/icp/core/kcf_prov_lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd,
offset -= uiop->uio_iov[vec_idx++].iov_len)
;

if (vec_idx == uiop->uio_iovcnt) {
if (vec_idx == uiop->uio_iovcnt && length > 0) {
/*
* The caller specified an offset that is larger than
* the total size of the buffers it provided.
Expand Down Expand Up @@ -192,7 +192,7 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output,
offset >= uiop->uio_iov[vec_idx].iov_len;
offset -= uiop->uio_iov[vec_idx++].iov_len)
;
if (vec_idx == uiop->uio_iovcnt) {
if (vec_idx == uiop->uio_iovcnt && length > 0) {
/*
* The caller specified an offset that is larger than the
* total size of the buffers it provided.
Expand Down
2 changes: 1 addition & 1 deletion module/icp/illumos-crypto.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
* Copyright (c) 2016, Datto, Inc. All rights reserved.
* Copyright (c) 2017, Datto, Inc. All rights reserved.
*/

#ifdef _KERNEL
Expand Down
11 changes: 11 additions & 0 deletions module/zcommon/zfeature_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,17 @@ zpool_feature_init(void)
ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
userobj_accounting_deps);
}

{
static const spa_feature_t encryption_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_ENCRYPTION,
"com.datto:encryption", "encryption",
"Support for dataset level encryption",
ZFEATURE_FLAG_PER_DATASET, encryption_deps);
}
}

#if defined(_KERNEL) && defined(HAVE_SPL)
Expand Down
2 changes: 2 additions & 0 deletions module/zcommon/zfs_deleg.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
{ZFS_DELEG_PERM_GROUPOBJUSED},
{ZFS_DELEG_PERM_HOLD},
{ZFS_DELEG_PERM_RELEASE},
{ZFS_DELEG_PERM_LOAD_KEY},
{ZFS_DELEG_PERM_CHANGE_KEY},
{NULL}
};

Expand Down
99 changes: 96 additions & 3 deletions module/zcommon/zfs_prop.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include <sys/dsl_crypt.h>

#include "zfs_prop.h"
#include "zfs_deleg.h"
Expand Down Expand Up @@ -119,6 +120,26 @@ zfs_prop_init(void)
{ NULL }
};

static zprop_index_t crypto_table[] = {
{ "on", ZIO_CRYPT_ON },
{ "off", ZIO_CRYPT_OFF },
{ "aes-128-ccm", ZIO_CRYPT_AES_128_CCM },
{ "aes-192-ccm", ZIO_CRYPT_AES_192_CCM },
{ "aes-256-ccm", ZIO_CRYPT_AES_256_CCM },
{ "aes-128-gcm", ZIO_CRYPT_AES_128_GCM },
{ "aes-192-gcm", ZIO_CRYPT_AES_192_GCM },
{ "aes-256-gcm", ZIO_CRYPT_AES_256_GCM },
{ NULL }
};

static zprop_index_t keyformat_table[] = {
{ "none", ZFS_KEYFORMAT_NONE },
{ "raw", ZFS_KEYFORMAT_RAW },
{ "hex", ZFS_KEYFORMAT_HEX },
{ "passphrase", ZFS_KEYFORMAT_PASSPHRASE },
{ NULL }
};

static zprop_index_t snapdir_table[] = {
{ "hidden", ZFS_SNAPDIR_HIDDEN },
{ "visible", ZFS_SNAPDIR_VISIBLE },
Expand Down Expand Up @@ -193,6 +214,13 @@ zfs_prop_init(void)
{ NULL }
};

static zprop_index_t keystatus_table[] = {
{ "none", ZFS_KEYSTATUS_NONE},
{ "unavailable", ZFS_KEYSTATUS_UNAVAILABLE},
{ "available", ZFS_KEYSTATUS_AVAILABLE},
{ NULL }
};

static zprop_index_t logbias_table[] = {
{ "latency", ZFS_LOGBIAS_LATENCY },
{ "throughput", ZFS_LOGBIAS_THROUGHPUT },
Expand Down Expand Up @@ -351,12 +379,16 @@ zfs_prop_init(void)
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
"CANMOUNT", canmount_table);

/* readonly index (boolean) properties */
/* readonly index properties */
zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
boolean_table);
zprop_register_index(ZFS_PROP_KEYSTATUS, "keystatus",
ZFS_KEYSTATUS_NONE, PROP_READONLY, ZFS_TYPE_DATASET,
"none | unavailable | available",
"KEYSTATUS", keystatus_table);

/* set once index properties */
zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
Expand All @@ -367,6 +399,15 @@ zfs_prop_init(void)
ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_SNAPSHOT,
"sensitive | insensitive | mixed", "CASE", case_table);
zprop_register_index(ZFS_PROP_KEYFORMAT, "keyformat",
ZFS_KEYFORMAT_NONE, PROP_ONETIME_DEFAULT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"none | raw | hex | passphrase", "KEYFORMAT", keyformat_table);
zprop_register_index(ZFS_PROP_ENCRYPTION, "encryption",
ZIO_CRYPT_DEFAULT, PROP_ONETIME, ZFS_TYPE_DATASET,
"on | off | aes-128-ccm | aes-192-ccm | aes-256-ccm | "
"aes-128-gcm | aes-192-gcm | aes-256-gcm", "ENCRYPTION",
crypto_table);

/* set once index (boolean) properties */
zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
Expand Down Expand Up @@ -409,6 +450,12 @@ zfs_prop_init(void)
"receive_resume_token",
NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<string token>", "RESUMETOK");
zprop_register_string(ZFS_PROP_ENCRYPTION_ROOT, "encryptionroot", NULL,
PROP_READONLY, ZFS_TYPE_DATASET, "<filesystem | volume>",
"ENCROOT");
zprop_register_string(ZFS_PROP_KEYLOCATION, "keylocation",
"none", PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"prompt | <file URI>", "KEYLOCATION");

/* readonly number properties */
zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
Expand Down Expand Up @@ -456,6 +503,9 @@ zfs_prop_init(void)
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "GUID");
zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY,
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "CREATETXG");
zprop_register_number(ZFS_PROP_PBKDF2_ITERS, "pbkdf2iters",
0, PROP_ONETIME_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<iters>", "PBKDF2ITERS");

/* default number properties */
zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
Expand Down Expand Up @@ -503,6 +553,11 @@ zfs_prop_init(void)
PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");
zprop_register_hidden(ZFS_PROP_PBKDF2_SALT, "pbkdf2salt",
PROP_TYPE_NUMBER, PROP_ONETIME_DEFAULT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PBKDF2SALT");
zprop_register_hidden(ZFS_PROP_KEY_GUID, "keyguid", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_DATASET, "KEYGUID");

/*
* Property to be removed once libbe is integrated
Expand Down Expand Up @@ -650,7 +705,8 @@ boolean_t
zfs_prop_readonly(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
zfs_prop_table[prop].pd_attr == PROP_ONETIME);
zfs_prop_table[prop].pd_attr == PROP_ONETIME ||
zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT);
}

/*
Expand All @@ -659,7 +715,8 @@ zfs_prop_readonly(zfs_prop_t prop)
boolean_t
zfs_prop_setonce(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_attr == PROP_ONETIME);
return (zfs_prop_table[prop].pd_attr == PROP_ONETIME ||
zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT);
}

const char *
Expand Down Expand Up @@ -694,6 +751,40 @@ zfs_prop_inheritable(zfs_prop_t prop)
zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}

/*
 * Reports whether 'prop' is one of the encryption key parameters
 * (ZFS_PROP_PBKDF2_SALT, ZFS_PROP_PBKDF2_ITERS or ZFS_PROP_KEYFORMAT),
 * i.e. a property that requires a loaded encryption key to modify.
 *
 * keylocation is deliberately not in this set: it does not count as an
 * encryption property and can be changed at will without needing the
 * master keys.
 */
boolean_t
zfs_prop_encryption_key_param(zfs_prop_t prop)
{
	switch (prop) {
	case ZFS_PROP_PBKDF2_SALT:
	case ZFS_PROP_PBKDF2_ITERS:
	case ZFS_PROP_KEYFORMAT:
		return (B_TRUE);
	default:
		return (B_FALSE);
	}
}

/*
 * Validates a keylocation property value; shared by kernelspace and
 * userspace.
 *
 * Accepted values:
 *   "none"      - only when 'encrypted' is not set
 *   "prompt"    - always
 *   "file:///…" - a file URI with at least one character after the prefix
 *
 * Anything else is rejected.  'str' must be a valid NUL-terminated string.
 */
boolean_t
zfs_prop_valid_keylocation(const char *str, boolean_t encrypted)
{
	static const char file_prefix[] = "file:///";
	const size_t prefix_len = sizeof (file_prefix) - 1;

	if (strcmp("none", str) == 0)
		return (!encrypted);

	if (strcmp("prompt", str) == 0)
		return (B_TRUE);

	/* the bare prefix with nothing after it is not a usable location */
	if (strlen(str) > prefix_len &&
	    strncmp(file_prefix, str, prefix_len) == 0)
		return (B_TRUE);

	return (B_FALSE);
}


#ifndef _KERNEL

/*
Expand Down Expand Up @@ -774,6 +865,8 @@ EXPORT_SYMBOL(zfs_prop_default_string);
EXPORT_SYMBOL(zfs_prop_default_numeric);
EXPORT_SYMBOL(zfs_prop_readonly);
EXPORT_SYMBOL(zfs_prop_inheritable);
EXPORT_SYMBOL(zfs_prop_encryption_key_param);
EXPORT_SYMBOL(zfs_prop_valid_keylocation);
EXPORT_SYMBOL(zfs_prop_setonce);
EXPORT_SYMBOL(zfs_prop_to_name);
EXPORT_SYMBOL(zfs_name_to_prop);
Expand Down
2 changes: 2 additions & 0 deletions module/zfs/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ $(MODULE)-objs += dsl_deadlist.o
$(MODULE)-objs += dsl_deleg.o
$(MODULE)-objs += dsl_bookmark.o
$(MODULE)-objs += dsl_dir.o
$(MODULE)-objs += dsl_crypt.o
$(MODULE)-objs += dsl_pool.o
$(MODULE)-objs += dsl_prop.o
$(MODULE)-objs += dsl_scan.o
Expand Down Expand Up @@ -103,6 +104,7 @@ $(MODULE)-objs += zil.o
$(MODULE)-objs += zio.o
$(MODULE)-objs += zio_checksum.o
$(MODULE)-objs += zio_compress.o
$(MODULE)-objs += zio_crypt.o
$(MODULE)-objs += zio_inject.o
$(MODULE)-objs += zle.o
$(MODULE)-objs += zpl_ctldir.o
Expand Down
1,611 changes: 1,352 additions & 259 deletions module/zfs/arc.c

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion module/zfs/bptree.c
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,8 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
err = 0;
for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
bptree_entry_phys_t bte;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST |
TRAVERSE_NO_DECRYPT;

err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH);
Expand Down
209 changes: 173 additions & 36 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -964,7 +964,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
}

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;

Expand All @@ -984,7 +984,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else if (zio == NULL || zio->io_error == 0) {
} else if (err == 0) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
Expand All @@ -1003,7 +1003,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
dnode_t *dn;
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err;
int err, zio_flags = 0;

DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
Expand All @@ -1021,6 +1021,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
arc_buf_t *dn_buf = (dn->dn_dbuf != NULL) ?
dn->dn_dbuf->db_buf : NULL;

/* if the underlying dnode block is encrypted, decrypt it */
if (dn_buf != NULL && dn->dn_objset->os_encrypted &&
DMU_OT_IS_ENCRYPTED(dn->dn_bonustype) &&
(flags & DB_RF_NO_DECRYPT) == 0 &&
arc_is_encrypted(dn_buf)) {
err = arc_untransform(dn_buf, dn->dn_objset->os_spa,
dmu_objset_id(dn->dn_objset), B_TRUE);
if (err != 0) {
DB_DNODE_EXIT(db);
mutex_exit(&db->db_mtx);
return (err);
}
}

ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
Expand Down Expand Up @@ -1088,11 +1104,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
db->db.db_object, db->db_level, db->db_blkid);

/*
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
spa_log_error(db->db_objset->os_spa, &zb);
zfs_panic_recover("unencrypted block in encrypted "
"object set %llu", dmu_objset_id(db->db_objset));
return (SET_ERROR(EIO));
}

dbuf_add_ref(db, NULL);

zio_flags = (flags & DB_RF_CANFAIL) ?
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;

if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
zio_flags |= ZIO_FLAG_RAW;

err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb);

return (err);
Expand Down Expand Up @@ -1141,18 +1173,31 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
dnode_t *dn = DB_DNODE(db);
int size = arc_buf_size(db->db_buf);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa = db->db_objset->os_spa;
enum zio_compress compress_type =
arc_get_compression(db->db_buf);

if (compress_type == ZIO_COMPRESS_OFF) {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
} else {
if (arc_is_encrypted(db->db_buf)) {
boolean_t byteorder;
uint8_t salt[ZIO_DATA_SALT_LEN];
uint8_t iv[ZIO_DATA_IV_LEN];
uint8_t mac[ZIO_DATA_MAC_LEN];

arc_get_raw_params(db->db_buf, &byteorder, salt,
iv, mac);
dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
compress_type);
} else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
size, arc_buf_lsize(db->db_buf), compress_type);
} else {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
Expand Down Expand Up @@ -1188,16 +1233,21 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)

mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
spa_t *spa = dn->dn_objset->os_spa;

/*
* If the arc buf is compressed, we need to decompress it to
* read the data. This could happen during the "zfs receive" of
* a stream which is compressed and deduplicated.
* If the arc buf is compressed or encrypted, we need to
* untransform it to read the data. This could happen during
* the "zfs receive" of a stream which is deduplicated and
* either raw or compressed. We do not need to do this if the
* caller wants raw encrypted data.
*/
if (db->db_buf != NULL &&
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
dbuf_fix_old_data(db,
spa_syncing_txg(dmu_objset_spa(db->db_objset)));
err = arc_decompress(db->db_buf);
if (db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 &&
(arc_is_encrypted(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
dbuf_fix_old_data(db, spa_syncing_txg(spa));
err = arc_untransform(db->db_buf, spa,
dmu_objset_id(db->db_objset), B_FALSE);
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
Expand Down Expand Up @@ -1316,6 +1366,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)

dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
dr->dt.dl.dr_raw = B_FALSE;

/*
* Release the already-written buffer, so we leave it in
Expand Down Expand Up @@ -1908,11 +1959,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
}

void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
static void
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
dbuf_dirty_record_t *dr;

ASSERT(tx->tx_txg != 0);
Expand Down Expand Up @@ -1944,12 +1994,19 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)

DB_DNODE_ENTER(db);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
flags |= DB_RF_HAVESTRUCT;
DB_DNODE_EXIT(db);
(void) dbuf_read(db, NULL, rf);
(void) dbuf_read(db, NULL, flags);
(void) dbuf_dirty(db, tx);
}

/*
 * Default entry point for dirtying a dbuf in 'tx'.  Delegates to
 * dmu_buf_will_dirty_impl() with the read flags
 * DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH (no DB_RF_NO_DECRYPT).
 */
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	dmu_buf_will_dirty_impl(db_fake, rf, tx);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -1977,6 +2034,29 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
(void) dbuf_dirty(db, tx);
}

/*
 * Variant of dmu_buf_will_dirty() for callers that expect raw encrypted
 * data in the db: the dbuf is dirtied with DB_RF_NO_DECRYPT added to the
 * read flags, and the dirty record for this tx's txg is then marked raw.
 */
void
dmu_buf_will_change_crypt_params(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dbuf_dirty_record_t *rec;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT;

	dmu_buf_will_dirty_impl(db_fake, rf, tx);

	/* walk past any dirty records whose txg is newer than this tx's */
	for (rec = db->db_last_dirty; rec != NULL; rec = rec->dr_next) {
		if (rec->dr_txg <= tx->tx_txg)
			break;
	}

	/* a record for exactly this txg is expected to exist */
	ASSERT3P(rec, !=, NULL);
	ASSERT3U(rec->dr_txg, ==, tx->tx_txg);
	rec->dt.dl.dr_raw = B_TRUE;
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
Expand Down Expand Up @@ -2117,10 +2197,11 @@ dbuf_destroy(dmu_buf_impl_t *db)
if (db->db_blkid == DMU_BONUS_BLKID) {
int slots = DB_DNODE(db)->dn_num_slots;
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
ASSERT(db->db.db_data != NULL);
kmem_free(db->db.db_data, bonuslen);
arc_space_return(bonuslen, ARC_SPACE_BONUS);
db->db_state = DB_UNCACHED;
if (db->db.db_data != NULL) {
kmem_free(db->db.db_data, bonuslen);
arc_space_return(bonuslen, ARC_SPACE_BONUS);
db->db_state = DB_UNCACHED;
}
}

dbuf_clear_data(db);
Expand Down Expand Up @@ -2416,7 +2497,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
* prefetch if the next block down is our target.
*/
static void
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
uint64_t nextblkid;
Expand All @@ -2438,7 +2519,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
*/
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
if (zio->io_flags & ZIO_FLAG_RAW) {
if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
} else {
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
Expand All @@ -2463,7 +2544,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
if (BP_IS_HOLE(bp) || err != 0) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
Expand Down Expand Up @@ -2491,7 +2572,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
* Issue prefetch reads for the given block on the given level. If the indirect
* blocks above that block are not in memory, we will read them in
* asynchronously. As a result, this call never blocks waiting for a read to
* complete.
* complete. Note that the prefetch might fail if the dataset is encrypted and
* the encryption key is unmapped before the IO completes.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
Expand Down Expand Up @@ -3120,6 +3202,41 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
}
}

/*
 * Make the encryption state of the dbuf's arc buffer agree with its dirty
 * record.  dbuf_sync_leaf() relies on this so that a dnode block is
 * decrypted before new data is written to it.
 *
 *   - raw dirty record:     the buffer must already be encrypted (asserted).
 *   - non-raw dirty record: an encrypted buffer is untransformed here.
 *
 * The caller must hold db->db_mtx.
 */
static void
dbuf_check_crypt(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dr->dt.dl.dr_raw) {
		/*
		 * Writing raw encrypted data requires the db's arc buffer
		 * to have been converted to raw by the caller.
		 */
		ASSERT(arc_is_encrypted(db->db_buf));
		return;
	}

	if (arc_is_encrypted(db->db_buf)) {
		/*
		 * Syncing context currently has no mechanism for handling
		 * decryption errors.  A failure here is only possible if an
		 * attacker maliciously changed a dnode block and updated
		 * the associated checksums going up the block tree, so we
		 * panic rather than continue.
		 */
		int err = arc_untransform(db->db_buf, db->db_objset->os_spa,
		    dmu_objset_id(db->db_objset), B_TRUE);

		if (err != 0)
			panic("Invalid dnode block MAC");
	}
}

/*
* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
* is critical that we not allow the compiler to inline this function in to
Expand Down Expand Up @@ -3241,9 +3358,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)

ASSERT(*datap != NULL);
ASSERT0(db->db_level);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=,
ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
bcopy(*datap, DN_BONUS(dn->dn_phys),
DN_MAX_BONUS_LEN(dn->dn_phys));
DB_DNODE_EXIT(db);

if (*datap != db->db.db_data) {
Expand Down Expand Up @@ -3290,6 +3408,13 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}

/*
* If this is a dnode block, ensure it is appropriately encrypted
* or decrypted, depending on what we are writing to it this txg.
*/
if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
dbuf_check_crypt(dr);

if (db->db_state != DB_NOFILL &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
Expand All @@ -3307,16 +3432,27 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* DNONE_DNODE blocks).
*/
int psize = arc_buf_size(*datap);
int lsize = arc_buf_lsize(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
enum zio_compress compress_type = arc_get_compression(*datap);

if (compress_type == ZIO_COMPRESS_OFF) {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
} else {
if (arc_is_encrypted(*datap)) {
boolean_t byteorder;
uint8_t salt[ZIO_DATA_SALT_LEN];
uint8_t iv[ZIO_DATA_IV_LEN];
uint8_t mac[ZIO_DATA_MAC_LEN];

arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
*datap = arc_alloc_raw_buf(os->os_spa, db,
dmu_objset_id(os), byteorder, salt, iv, mac,
dn->dn_type, psize, lsize, compress_type);
} else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
int lsize = arc_buf_lsize(*datap);
*datap = arc_alloc_compressed_buf(os->os_spa, db,
psize, lsize, compress_type);
} else {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
bcopy(db->db.db_data, (*datap)->b_data, psize);
}
Expand Down Expand Up @@ -3453,7 +3589,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);

if (!BP_IS_EMBEDDED(bp))
bp->blk_fill = fill;
BP_SET_FILL(bp, fill);

mutex_exit(&db->db_mtx);

Expand Down Expand Up @@ -3778,7 +3914,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
} else {
arc_done_func_t *children_ready_cb = NULL;
arc_write_done_func_t *children_ready_cb = NULL;
ASSERT(arc_released(data));

/*
Expand Down Expand Up @@ -3810,6 +3946,7 @@ EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_will_change_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
Expand Down
23 changes: 19 additions & 4 deletions module/zfs/ddt.c
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,10 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
}

/*
* The bp created via this function may be used for repairs and scrub, but it
* will be missing the salt / IV required to do a full decrypting read.
*/
void
ddt_bp_create(enum zio_checksum checksum,
const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
Expand All @@ -279,11 +283,12 @@ ddt_bp_create(enum zio_checksum checksum,
ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);

bp->blk_cksum = ddk->ddk_cksum;
bp->blk_fill = 1;

BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
BP_SET_FILL(bp, 1);
BP_SET_CHECKSUM(bp, checksum);
BP_SET_TYPE(bp, DMU_OT_DEDUP);
BP_SET_LEVEL(bp, 0);
Expand All @@ -297,9 +302,12 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
ddk->ddk_cksum = bp->blk_cksum;
ddk->ddk_prop = 0;

ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));

DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
}

void
Expand Down Expand Up @@ -389,7 +397,7 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
if (ddp->ddp_phys_birth == 0)
continue;

for (d = 0; d < SPA_DVAS_PER_BP; d++)
for (d = 0; d < DDE_GET_NDVAS(dde); d++)
dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);

dds->dds_blocks += 1;
Expand Down Expand Up @@ -562,6 +570,7 @@ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
uint64_t ditto = spa->spa_dedup_ditto;
int total_copies = 0;
int desired_copies = 0;
int copies_needed = 0;
int p;

for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
Expand All @@ -588,7 +597,13 @@ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
if (total_refcnt >= ditto * ditto)
desired_copies++;

return (MAX(desired_copies, total_copies) - total_copies);
copies_needed = MAX(desired_copies, total_copies) - total_copies;

/* encrypted blocks store their IV in DVA[2] */
if (DDK_GET_CRYPT(&dde->dde_key))
copies_needed = MIN(copies_needed, SPA_DVAS_PER_BP - 1);

return (copies_needed);
}

int
Expand All @@ -599,7 +614,7 @@ ddt_ditto_copies_present(ddt_entry_t *dde)
int copies = 0 - DVA_GET_GANG(dva);
int d;

for (d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
for (d = 0; d < DDE_GET_NDVAS(dde); d++, dva++)
if (DVA_IS_VALID(dva))
copies++;

Expand Down
268 changes: 206 additions & 62 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,60 +73,60 @@ unsigned long zfs_per_txg_dirty_frees_percent = 30;
int zfs_dmu_offset_next_sync = 0;

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
{ DMU_BSWAP_UINT64, TRUE, "object array" },
{ DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
{ DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
{ DMU_BSWAP_UINT64, TRUE, "bpobj" },
{ DMU_BSWAP_UINT64, TRUE, "bpobj header" },
{ DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
{ DMU_BSWAP_UINT64, TRUE, "SPA space map" },
{ DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
{ DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
{ DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
{ DMU_BSWAP_UINT64, TRUE, "DSL directory" },
{ DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
{ DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
{ DMU_BSWAP_ZAP, TRUE, "DSL props" },
{ DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
{ DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
{ DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
{ DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
{ DMU_BSWAP_UINT8, FALSE, "zvol object" },
{ DMU_BSWAP_ZAP, TRUE, "zvol prop" },
{ DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
{ DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
{ DMU_BSWAP_ZAP, TRUE, "other ZAP" },
{ DMU_BSWAP_ZAP, TRUE, "persistent error log" },
{ DMU_BSWAP_UINT8, TRUE, "SPA history" },
{ DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
{ DMU_BSWAP_ZAP, TRUE, "Pool properties" },
{ DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
{ DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
{ DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
{ DMU_BSWAP_UINT8, TRUE, "FUID table" },
{ DMU_BSWAP_UINT64, TRUE, "FUID table size" },
{ DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
{ DMU_BSWAP_ZAP, TRUE, "scan work queue" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
{ DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
{ DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
{ DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
{ DMU_BSWAP_UINT8, TRUE, "System attributes" },
{ DMU_BSWAP_ZAP, TRUE, "SA master node" },
{ DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
{ DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
{ DMU_BSWAP_ZAP, TRUE, "scan translations" },
{ DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
{ DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
{ DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
{ DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
{ DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
{ DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "object directory" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "object array" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
{ DMU_BSWAP_UINT64, TRUE, TRUE, "ZIL intent log" },
{ DMU_BSWAP_DNODE, TRUE, TRUE, "DMU dnode" },
{ DMU_BSWAP_OBJSET, TRUE, FALSE, "DMU objset" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "DSL directory" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL directory child map"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset snap map" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL props" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "DSL dataset" },
{ DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
{ DMU_BSWAP_OLDACL, TRUE, TRUE, "ZFS V0 ACL" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "ZFS plain file" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS directory" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS delete queue" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "zvol object" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "other uint8[]" },
{ DMU_BSWAP_UINT64, FALSE, TRUE, "other uint64[]" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "Pool properties" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL permissions" },
{ DMU_BSWAP_ACL, TRUE, TRUE, "ZFS ACL" },
{ DMU_BSWAP_UINT8, TRUE, TRUE, "ZFS SYSACL" },
{ DMU_BSWAP_UINT8, TRUE, TRUE, "FUID table" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset next clones"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group used" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group quota" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "snapshot refcount tags"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
{ DMU_BSWAP_UINT8, TRUE, TRUE, "System attributes" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "SA master node" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr registration" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr layouts" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "deduplicated block" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL deadlist map" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "DSL deadlist map hdr" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dir clones" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
};

const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
Expand Down Expand Up @@ -198,6 +198,8 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,

if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;

err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
if (err == 0) {
Expand All @@ -221,6 +223,8 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,

if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;

err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
if (err == 0) {
Expand Down Expand Up @@ -321,11 +325,18 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
* returns ENOENT, EIO, or 0.
*/
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
dmu_buf_t **dbp)
{
dnode_t *dn;
dmu_buf_impl_t *db;
int error;
uint32_t db_flags = DB_RF_MUST_SUCCEED;

if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;

error = dnode_hold(os, object, FTAG, &dn);
if (error)
Expand Down Expand Up @@ -355,12 +366,24 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)

dnode_rele(dn, FTAG);

VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
error = dbuf_read(db, NULL, db_flags);
if (error) {
dnode_evict_bonus(dn);
dbuf_rele(db, tag);
*dbp = NULL;
return (error);
}

*dbp = &db->db;
return (0);
}

/*
 * Convenience wrapper around dmu_bonus_hold_impl() that passes
 * DMU_READ_NO_PREFETCH as the only read flag (DMU_READ_NO_DECRYPT is not
 * requested).  The impl's return value is passed through unchanged.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t obj, void *tag, dmu_buf_t **dbp)
{
	uint32_t flags = DMU_READ_NO_PREFETCH;

	return (dmu_bonus_hold_impl(os, obj, tag, flags, dbp));
}

/*
* returns ENOENT, EIO, or 0.
*
Expand Down Expand Up @@ -601,8 +624,8 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
* indirect blocks prefetched will be those that point to the blocks containing
* the data starting at offset, and continuing to offset + len.
*
* Note that if the indirect blocks above the blocks being prefetched are not in
* cache, they will be asychronously read in.
* Note that if the indirect blocks above the blocks being prefetched are not
* in cache, they will be asynchronously read in.
*/
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
Expand Down Expand Up @@ -1462,6 +1485,83 @@ dmu_return_arcbuf(arc_buf_t *buf)
arc_buf_destroy(buf, FTAG);
}

/*
 * Hand the arc buffer 'buf' to dbuf_assign_arcbuf() for the dbuf behind
 * the public 'handle', in the context of 'tx'.
 */
void
dmu_assign_arcbuf_impl(dmu_buf_t *handle, arc_buf_t *buf, dmu_tx_t *tx)
{
	dbuf_assign_arcbuf((dmu_buf_impl_t *)handle, buf, tx);
}

/*
 * Convert the arc buffer backing 'handle' into a raw buffer carrying the
 * given byteorder, salt, IV and MAC.
 *
 * The dbuf is first dirtied in 'tx' through
 * dmu_buf_will_change_crypt_params(); its arc buffer is then released and
 * passed to arc_convert_to_raw() together with the objset id and the
 * dnode's object type.  The dbuf must already have an arc buffer and must
 * belong to an objset with a non-zero id (both asserted).
 */
void
dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, const uint8_t *salt,
    const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)handle;
	uint64_t objset_id = dmu_objset_id(dbi->db_objset);
	dmu_object_type_t ot;

	ASSERT3P(dbi->db_buf, !=, NULL);
	ASSERT3U(objset_id, !=, 0);

	dmu_buf_will_change_crypt_params(handle, tx);

	/* read the object type under the DB_DNODE_ENTER/EXIT bracket */
	DB_DNODE_ENTER(dbi);
	ot = DB_DNODE(dbi)->dn_type;
	DB_DNODE_EXIT(dbi);

	/*
	 * This technically violates the assumption the dmu code makes
	 * that dnode blocks are only released in syncing context.
	 */
	(void) arc_release(dbi->db_buf, dbi);
	arc_convert_to_raw(dbi->db_buf, objset_id, byteorder, ot, salt, iv,
	    mac);
}

/*
 * Copy the contents of the source dbuf 'handle' into the block of 'object'
 * at 'offset' in objset 'os', as part of transaction 'tx'.
 *
 * The destination dbuf is held with DMU_READ_NO_DECRYPT. A new arc buffer
 * is loaned that mirrors the source buffer: if the source is encrypted, a
 * raw buffer carrying the same byteorder, salt, IV, MAC, logical size and
 * compression is loaned; otherwise a plain buffer is loaned (the source is
 * asserted to be uncompressed in that case). The data is then copied and the
 * new buffer is handed to the destination via dbuf_assign_arcbuf().
 *
 * The source dbuf must have an arc buffer attached (asserted). Failing to
 * hold the destination dbuf is fatal (VERIFY0).
 */
void
dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset,
    dmu_buf_t *handle, dmu_tx_t *tx)
{
	dmu_buf_t *dst_handle;
	dmu_buf_impl_t *dstdb;
	dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle;
	dmu_object_type_t type;
	arc_buf_t *abuf;
	uint64_t datalen;
	boolean_t byteorder;
	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];

	ASSERT3P(srcdb->db_buf, !=, NULL);

	/* hold the db that we want to write to */
	VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle,
	    DMU_READ_NO_DECRYPT));
	dstdb = (dmu_buf_impl_t *)dst_handle;
	datalen = arc_buf_size(srcdb->db_buf);

	/*
	 * Fetch the destination object's type once, bracketed by
	 * DB_DNODE_ENTER/EXIT in the same way dmu_convert_to_raw() reads it,
	 * rather than dereferencing DB_DNODE() unbracketed further down.
	 */
	DB_DNODE_ENTER(dstdb);
	type = DB_DNODE(dstdb)->dn_type;
	DB_DNODE_EXIT(dstdb);

	/* allocate an arc buffer that matches the type of srcdb->db_buf */
	if (arc_is_encrypted(srcdb->db_buf)) {
		arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac);
		abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os),
		    byteorder, salt, iv, mac, type,
		    datalen, arc_buf_lsize(srcdb->db_buf),
		    arc_get_compression(srcdb->db_buf));
	} else {
		/* we won't get a compressed db back from dmu_buf_hold() */
		ASSERT3U(arc_get_compression(srcdb->db_buf),
		    ==, ZIO_COMPRESS_OFF);
		abuf = arc_loan_buf(os->os_spa,
		    DMU_OT_IS_METADATA(type), datalen);
	}

	ASSERT3U(datalen, ==, arc_buf_size(abuf));

	/* copy the data to the new buffer and assign it to the dstdb */
	bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen);
	dbuf_assign_arcbuf(dstdb, abuf, tx);
	dmu_buf_rele(dst_handle, FTAG);
}

/*
* When possible directly assign passed loaned arc buffer to a dbuf.
* If this is not possible copy the contents of passed arc buf via
Expand Down Expand Up @@ -1537,7 +1637,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
BP_SET_LSIZE(bp, db->db_size);
} else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
BP_SET_FILL(bp, 1);
}
}
}
Expand Down Expand Up @@ -1842,6 +1942,20 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
return (0);
}

/*
 * Set the number of indirection levels of 'object' in objset 'os' to
 * 'nlevels' as part of 'tx'.
 *
 * Returns the dnode_hold() error if the object cannot be held; otherwise
 * returns the result of dnode_set_nlevels(). The dnode hold is always
 * dropped before returning.
 */
int
dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err = dnode_hold(os, object, FTAG, &dn);

	if (err == 0) {
		err = dnode_set_nlevels(dn, nlevels, tx);
		dnode_rele(dn, FTAG);
	}

	return (err);
}

int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
dmu_tx_t *tx)
Expand Down Expand Up @@ -1916,6 +2030,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
boolean_t dedup = B_FALSE;
boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
boolean_t encrypt = B_FALSE;
int copies = os->os_copies;

/*
Expand Down Expand Up @@ -2003,16 +2118,44 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}

zp->zp_checksum = checksum;
zp->zp_compress = compress;
ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
/*
* All objects in an encrypted objset are protected from modification
* via a MAC. Encrypted objects store their IV and salt in the last DVA
* in the bp, so we cannot use all copies. Encrypted objects are also
* not subject to nopwrite since writing the same data will still
* result in a new ciphertext. Only encrypted blocks can be dedup'd
* to avoid ambiguity in the dedup code since the DDT does not store
* object types.
*/
if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
encrypt = B_TRUE;

if (DMU_OT_IS_ENCRYPTED(type)) {
copies = MIN(copies, SPA_DVAS_PER_BP - 1);
nopwrite = B_FALSE;
} else {
dedup = B_FALSE;
}

if (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)
compress = ZIO_COMPRESS_EMPTY;
}

zp->zp_compress = compress;
zp->zp_checksum = checksum;
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
zp->zp_encrypt = encrypt;
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);

ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
}

/*
Expand Down Expand Up @@ -2267,6 +2410,7 @@ EXPORT_SYMBOL(dmu_object_info_from_dnode);
EXPORT_SYMBOL(dmu_object_info_from_db);
EXPORT_SYMBOL(dmu_object_size_from_db);
EXPORT_SYMBOL(dmu_object_dnsize_from_db);
EXPORT_SYMBOL(dmu_object_set_nlevels);
EXPORT_SYMBOL(dmu_object_set_blocksize);
EXPORT_SYMBOL(dmu_object_set_checksum);
EXPORT_SYMBOL(dmu_object_set_compress);
Expand Down
296 changes: 239 additions & 57 deletions module/zfs/dmu_objset.c

Large diffs are not rendered by default.

853 changes: 681 additions & 172 deletions module/zfs/dmu_send.c

Large diffs are not rendered by default.

43 changes: 35 additions & 8 deletions module/zfs/dmu_traverse.c
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh)
zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);

(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
claim_txg);
claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));

zil_free(zilog);
}
Expand Down Expand Up @@ -181,6 +181,7 @@ traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
int zio_flags = ZIO_FLAG_CANFAIL;

if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
return;
Expand All @@ -196,8 +197,11 @@ traverse_prefetch_metadata(traverse_data_t *td,
if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
return;

if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;

(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
}

static boolean_t
Expand Down Expand Up @@ -294,6 +298,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
zbookmark_phys_t *czb;

ASSERT(!BP_IS_PROTECTED(bp));

err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
Expand Down Expand Up @@ -324,14 +330,23 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,

} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_FLAG_WAIT;
uint32_t zio_flags = ZIO_FLAG_CANFAIL;
int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
dnode_phys_t *child_dnp;

/*
* dnode blocks might have their bonus buffers encrypted, so
* we must be careful to honor TRAVERSE_NO_DECRYPT
*/
if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;

err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err != 0)
goto post;

child_dnp = buf->b_data;

for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
Expand All @@ -347,11 +362,15 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;

if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;

err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err != 0)
goto post;

Expand Down Expand Up @@ -500,6 +519,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
prefetch_data_t *pfd = arg;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;

ASSERT(pfd->pd_bytes_fetched >= 0);
Expand All @@ -518,8 +538,11 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
cv_broadcast(&pfd->pd_cv);
mutex_exit(&pfd->pd_mtx);

if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;

(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
zio_flags, &aflags, zb);

return (0);
}
Expand Down Expand Up @@ -599,13 +622,17 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,

/* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
uint32_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
arc_buf_t *buf;

err = arc_read(NULL, td->td_spa, rootbp,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb);
if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
BP_IS_PROTECTED(rootbp))
zio_flags |= ZIO_FLAG_RAW;

err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
&buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
if (err != 0)
return (err);

Expand Down
111 changes: 73 additions & 38 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1246,7 +1246,12 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
rw_exit(&mdn->dn_struct_rwlock);
if (db == NULL)
return (SET_ERROR(EIO));
err = dbuf_read(db, NULL, DB_RF_CANFAIL);

/*
* We do not need to decrypt to read the dnode so it doesn't matter
* if we get the encrypted or decrypted version.
*/
err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
if (err) {
dbuf_rele(db, FTAG);
return (err);
Expand Down Expand Up @@ -1550,11 +1555,73 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
return (SET_ERROR(ENOTSUP));
}

/*
 * Raise the number of indirection levels of 'dn' to 'new_nlevels' in the
 * txg of 'tx'.
 *
 * The caller must hold dn_struct_rwlock as writer (asserted). 'new_nlevels'
 * is asserted to be greater than the value already recorded in
 * dn_next_nlevels for this txg.
 *
 * After updating dn_nlevels and dn_next_nlevels, the dbuf at level
 * 'old_nlevels', block 0 is held and dirtied. Existing dirty records for
 * this txg are then moved underneath that new dirty record, except for
 * records at level new_nlevels-1 and the bonus and spill records.
 */
static void
dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
{
uint64_t txgoff = tx->tx_txg & TXG_MASK;
int old_nlevels = dn->dn_nlevels;
dmu_buf_impl_t *db;
list_t *list;
dbuf_dirty_record_t *new, *dr, *dr_next;

ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

dn->dn_nlevels = new_nlevels;

ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
dn->dn_next_nlevels[txgoff] = new_nlevels;

/* dirty the left indirects */
db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
ASSERT(db != NULL);
new = dbuf_dirty(db, tx);
dbuf_rele(db, FTAG);

/* transfer the dirty records to the new indirect */
mutex_enter(&dn->dn_mtx);
mutex_enter(&new->dt.di.dr_mtx);
list = &dn->dn_dirty_records[txgoff];
/* dr_next is fetched up front because dr may be unlinked below */
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
if (dr->dr_dbuf->db_level != new_nlevels-1 &&
dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
/* re-parent: off the dnode's list, onto the new record's children */
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
dr->dr_parent = new;
}
}
mutex_exit(&new->dt.di.dr_mtx);
mutex_exit(&dn->dn_mtx);
}

/*
 * Grow 'dn' to 'nlevels' levels of indirection as part of 'tx'.
 *
 * dn_struct_rwlock is taken as writer for the duration of the call.
 * Returns 0 if the dnode already has exactly 'nlevels' levels or if the
 * level count was raised, and EINVAL if 'nlevels' is lower than the current
 * level count.
 */
int
dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
{
	int err = 0;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	if (nlevels < dn->dn_nlevels) {
		/* shrinking the level count is not supported here */
		err = SET_ERROR(EINVAL);
	} else if (nlevels > dn->dn_nlevels) {
		dnode_set_nlevels_impl(dn, nlevels, tx);
	}
	/* nlevels == dn->dn_nlevels: nothing to do */

	rw_exit(&dn->dn_struct_rwlock);

	return (err);
}

/* read-holding callers must not rely on the lock being continuously held */
void
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
{
uint64_t txgoff = tx->tx_txg & TXG_MASK;
int epbs, new_nlevels;
uint64_t sz;

Expand Down Expand Up @@ -1594,41 +1661,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)

ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);

if (new_nlevels > dn->dn_nlevels) {
int old_nlevels = dn->dn_nlevels;
dmu_buf_impl_t *db;
list_t *list;
dbuf_dirty_record_t *new, *dr, *dr_next;

dn->dn_nlevels = new_nlevels;

ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
dn->dn_next_nlevels[txgoff] = new_nlevels;

/* dirty the left indirects */
db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
ASSERT(db != NULL);
new = dbuf_dirty(db, tx);
dbuf_rele(db, FTAG);

/* transfer the dirty records to the new indirect */
mutex_enter(&dn->dn_mtx);
mutex_enter(&new->dt.di.dr_mtx);
list = &dn->dn_dirty_records[txgoff];
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
if (dr->dr_dbuf->db_level != new_nlevels-1 &&
dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
dr->dr_parent = new;
}
}
mutex_exit(&new->dt.di.dr_mtx);
mutex_exit(&dn->dn_mtx);
}
if (new_nlevels > dn->dn_nlevels)
dnode_set_nlevels_impl(dn, new_nlevels, tx);

out:
if (have_read)
Expand Down Expand Up @@ -1987,7 +2021,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
*/
return (SET_ERROR(ESRCH));
}
error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
error = dbuf_read(db, NULL,
DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT);
if (error) {
dbuf_rele(db, FTAG);
return (error);
Expand Down
13 changes: 10 additions & 3 deletions module/zfs/dnode_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_send.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/range_tree.h>
Expand Down Expand Up @@ -557,6 +558,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
dnode_phys_t *dnp = dn->dn_phys;
int txgoff = tx->tx_txg & TXG_MASK;
list_t *list = &dn->dn_dirty_records[txgoff];
Expand All @@ -572,8 +574,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)

ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));

if (dmu_objset_userused_enabled(dn->dn_objset) &&
!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
/*
* Do user accounting if it is enabled and this is not
* an encrypted receive.
*/
if (dmu_objset_userused_enabled(os) &&
!DMU_OBJECT_IS_SPECIAL(dn->dn_object) &&
(!os->os_encrypted || !dmu_objset_is_receiving(os))) {
mutex_enter(&dn->dn_mtx);
dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
dn->dn_oldflags = dn->dn_phys->dn_flags;
Expand All @@ -584,7 +591,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
} else {
/* Once we account for it, we should always account for it. */
/* Once we account for it, we should always account for it */
ASSERT(!(dn->dn_phys->dn_flags &
DNODE_FLAG_USERUSED_ACCOUNTED));
ASSERT(!(dn->dn_phys->dn_flags &
Expand Down
2,611 changes: 2,611 additions & 0 deletions module/zfs/dsl_crypt.c

Large diffs are not rendered by default.

123 changes: 93 additions & 30 deletions module/zfs/dsl_dataset.c
Original file line number Diff line number Diff line change
Expand Up @@ -386,8 +386,8 @@ dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
}

int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dsl_dataset_t **dsp)
dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,
ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
{
objset_t *mos = dp->dp_meta_objset;
dmu_buf_t *dbuf;
Expand Down Expand Up @@ -548,11 +548,27 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
*dsp = ds;

if ((flags & DS_HOLD_FLAG_DECRYPT) && ds->ds_dir->dd_crypto_obj != 0) {
err = spa_keystore_create_mapping(dp->dp_spa, ds, ds);
if (err != 0) {
dsl_dataset_rele(ds, tag);
return (SET_ERROR(EACCES));
}
}

return (0);
}

int
dsl_dataset_hold(dsl_pool_t *dp, const char *name,
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dsl_dataset_t **dsp)
{
return (dsl_dataset_hold_obj_flags(dp, dsobj, 0, tag, dsp));
}

int
dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
void *tag, dsl_dataset_t **dsp)
{
dsl_dir_t *dd;
Expand All @@ -568,7 +584,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
ASSERT(dsl_pool_config_held(dp));
obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
if (obj != 0)
err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds);
else
err = SET_ERROR(ENOENT);

Expand All @@ -577,16 +593,18 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
dsl_dataset_t *snap_ds;

if (*snapname++ != '@') {
dsl_dataset_rele(ds, tag);
dsl_dataset_rele_flags(ds, flags, tag);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(ENOENT));
}

dprintf("looking for snapshot '%s'\n", snapname);
err = dsl_dataset_snap_lookup(ds, snapname, &obj);
if (err == 0)
err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
dsl_dataset_rele(ds, tag);
if (err == 0) {
err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag,
&snap_ds);
}
dsl_dataset_rele_flags(ds, flags, tag);

if (err == 0) {
mutex_enter(&snap_ds->ds_lock);
Expand All @@ -604,29 +622,36 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
}

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag,
dsl_dataset_t **dsp)
{
return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp));
}

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
void *tag, dsl_dataset_t **dsp)
{
int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp);
if (err != 0)
return (err);
if (!dsl_dataset_tryown(*dsp, tag)) {
dsl_dataset_rele(*dsp, tag);
dsl_dataset_rele_flags(*dsp, flags, tag);
*dsp = NULL;
return (SET_ERROR(EBUSY));
}
return (0);
}

int
dsl_dataset_own(dsl_pool_t *dp, const char *name,
dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
void *tag, dsl_dataset_t **dsp)
{
int err = dsl_dataset_hold(dp, name, tag, dsp);
int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp);
if (err != 0)
return (err);
if (!dsl_dataset_tryown(*dsp, tag)) {
dsl_dataset_rele(*dsp, tag);
dsl_dataset_rele_flags(*dsp, flags, tag);
return (SET_ERROR(EBUSY));
}
return (0);
Expand Down Expand Up @@ -707,13 +732,25 @@ dsl_dataset_namelen(dsl_dataset_t *ds)
}

void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
{
if (ds->ds_dir != NULL && ds->ds_dir->dd_crypto_obj != 0 &&
(flags & DS_HOLD_FLAG_DECRYPT)) {
(void) spa_keystore_remove_mapping(ds->ds_dir->dd_pool->dp_spa,
ds->ds_object, ds);
}

dmu_buf_rele(ds->ds_dbuf, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
dsl_dataset_rele_flags(ds, 0, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
{
ASSERT3P(ds->ds_owner, ==, tag);
ASSERT(ds->ds_dbuf != NULL);
Expand All @@ -722,7 +759,7 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
ds->ds_owner = NULL;
mutex_exit(&ds->ds_lock);
dsl_dataset_long_rele(ds, tag);
dsl_dataset_rele(ds, tag);
dsl_dataset_rele_flags(ds, flags, tag);
}

boolean_t
Expand Down Expand Up @@ -751,7 +788,7 @@ dsl_dataset_has_owner(dsl_dataset_t *ds)
return (rv);
}

static void
void
dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
Expand Down Expand Up @@ -781,7 +818,7 @@ dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)

uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx)
dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx)
{
dsl_pool_t *dp = dd->dd_pool;
dmu_buf_t *dbuf;
Expand Down Expand Up @@ -881,6 +918,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
}
}

/* handle encryption */
dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx);

if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

Expand All @@ -903,6 +943,8 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
zio_t *zio;

bzero(&os->os_zil_header, sizeof (os->os_zil_header));
if (os->os_encrypted)
os->os_next_write_raw = B_TRUE;

zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
dsl_dataset_sync(ds, zio, tx);
Expand All @@ -916,7 +958,8 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)

uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
dsl_dataset_t *origin, uint64_t flags, cred_t *cr,
dsl_crypto_params_t *dcp, dmu_tx_t *tx)
{
dsl_pool_t *dp = pdd->dd_pool;
uint64_t dsobj, ddobj;
Expand All @@ -928,7 +971,7 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));

dsobj = dsl_dataset_create_sync_dd(dd, origin,
dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp,
flags & ~DS_CREATE_FLAG_NODIRTY, tx);

dsl_deleg_set_create_perms(dd, tx, cr);
Expand Down Expand Up @@ -1821,6 +1864,10 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
DS_FIELD_RESUME_COMPRESSOK) == 0) {
fnvlist_add_boolean(token_nv, "compressok");
}
if (zap_contains(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_RAWOK) == 0) {
fnvlist_add_boolean(token_nv, "rawok");
}
packed = fnvlist_pack(token_nv, &packed_size);
fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP);
Expand Down Expand Up @@ -1851,6 +1898,7 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
int err;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
uint64_t refd, avail, uobjs, aobjs, ratio;

Expand Down Expand Up @@ -1901,12 +1949,12 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
ds->ds_userrefs);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
dsl_dataset_crypt_stats(ds, nv);

if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
uint64_t written, comp, uncomp;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dsl_dataset_t *prev;
int err;

err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
Expand Down Expand Up @@ -2340,7 +2388,7 @@ dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
fnvlist_add_string(ddra->ddra_result, "target", namebuf);

cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx);

VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));

Expand Down Expand Up @@ -2427,6 +2475,23 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(EXDEV));
}

snap = list_head(&ddpa->shared_snaps);
if (snap == NULL) {
err = SET_ERROR(ENOENT);
goto out;
}
origin_ds = snap->ds;

/*
* Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
* When doing a promote we must make sure the encryption root for
* both the target and the target's origin does not change to avoid
* needing to rewrap encryption keys
*/
err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir);
if (err != 0)
goto out;

/*
* Compute and check the amount of space to transfer. Since this is
* so expensive, don't do the preliminary check.
Expand All @@ -2436,13 +2501,6 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
return (0);
}

snap = list_head(&ddpa->shared_snaps);
if (snap == NULL) {
err = SET_ERROR(ENOENT);
goto out;
}
origin_ds = snap->ds;

/* compute origin's new unique space */
snap = list_tail(&ddpa->clone_snaps);
ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
Expand Down Expand Up @@ -2611,6 +2669,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
NULL, FTAG, &odd));

dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx);

/* change origin's next snap */
dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
Expand Down Expand Up @@ -3692,11 +3752,14 @@ MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
#endif

EXPORT_SYMBOL(dsl_dataset_hold);
EXPORT_SYMBOL(dsl_dataset_hold_flags);
EXPORT_SYMBOL(dsl_dataset_hold_obj);
EXPORT_SYMBOL(dsl_dataset_hold_obj_flags);
EXPORT_SYMBOL(dsl_dataset_own);
EXPORT_SYMBOL(dsl_dataset_own_obj);
EXPORT_SYMBOL(dsl_dataset_name);
EXPORT_SYMBOL(dsl_dataset_rele);
EXPORT_SYMBOL(dsl_dataset_rele_flags);
EXPORT_SYMBOL(dsl_dataset_disown);
EXPORT_SYMBOL(dsl_dataset_tryown);
EXPORT_SYMBOL(dsl_dataset_create_sync);
Expand Down
14 changes: 10 additions & 4 deletions module/zfs/dsl_destroy.c
Original file line number Diff line number Diff line change
Expand Up @@ -598,8 +598,8 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
ka.ds = ds;
ka.tx = tx;
VERIFY0(traverse_dataset(ds,
dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
kill_blkptr, &ka));
dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST |
TRAVERSE_NO_DECRYPT, kill_blkptr, &ka));
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
dsl_dataset_phys(ds)->ds_unique_bytes == 0);
}
Expand Down Expand Up @@ -706,6 +706,11 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
for (t = 0; t < DD_USED_NUM; t++)
ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);

if (dd->dd_crypto_obj != 0) {
dsl_crypto_key_destroy_sync(dd->dd_crypto_obj, tx);
(void) spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
}

VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
Expand Down Expand Up @@ -951,7 +956,8 @@ dsl_destroy_head(const char *name)
* remove the objects from open context so that the txg sync
* is not too long.
*/
error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_FALSE,
FTAG, &os);
if (error == 0) {
uint64_t obj;
uint64_t prev_snap_txg =
Expand All @@ -963,7 +969,7 @@ dsl_destroy_head(const char *name)
(void) dmu_free_long_object(os, obj);
/* sync out all frees */
txg_wait_synced(dmu_objset_pool(os), 0);
dmu_objset_disown(os, FTAG);
dmu_objset_disown(os, B_FALSE, FTAG);
}
}

Expand Down
43 changes: 30 additions & 13 deletions module/zfs/dsl_dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
{
dmu_buf_t *dbuf;
dsl_dir_t *dd;
dmu_object_info_t doi;
int err;

ASSERT(dsl_pool_config_held(dp));
Expand All @@ -167,21 +168,27 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
if (err != 0)
return (err);
dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
{
dmu_object_info_t doi;
dmu_object_info_from_db(dbuf, &doi);
ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
}
#endif

dmu_object_info_from_db(dbuf, &doi);
ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));

if (dd == NULL) {
dsl_dir_t *winner;

dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
dd->dd_object = ddobj;
dd->dd_dbuf = dbuf;
dd->dd_pool = dp;

if (dsl_dir_is_zapified(dd) &&
zap_contains(dp->dp_meta_objset, ddobj,
DD_FIELD_CRYPTO_KEY_OBJ) == 0) {
VERIFY0(zap_lookup(dp->dp_meta_objset,
ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
sizeof (uint64_t), 1, &dd->dd_crypto_obj));
}

mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
dsl_prop_init(dd);

Expand Down Expand Up @@ -918,6 +925,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;

dmu_buf_rele(dbuf, FTAG);

return (ddobj);
Expand All @@ -935,6 +943,8 @@ dsl_dir_is_clone(dsl_dir_t *dd)
void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
uint64_t intval;

mutex_enter(&dd->dd_lock);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
dsl_dir_phys(dd)->dd_used_bytes);
Expand Down Expand Up @@ -962,18 +972,17 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
mutex_exit(&dd->dd_lock);

if (dsl_dir_is_zapified(dd)) {
uint64_t count;
objset_t *os = dd->dd_pool->dp_meta_objset;

if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
sizeof (count), 1, &count) == 0) {
sizeof (intval), 1, &intval) == 0) {
dsl_prop_nvlist_add_uint64(nv,
ZFS_PROP_FILESYSTEM_COUNT, count);
ZFS_PROP_FILESYSTEM_COUNT, intval);
}
if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
sizeof (count), 1, &count) == 0) {
sizeof (intval), 1, &intval) == 0) {
dsl_prop_nvlist_add_uint64(nv,
ZFS_PROP_SNAPSHOT_COUNT, count);
ZFS_PROP_SNAPSHOT_COUNT, intval);
}
}

Expand Down Expand Up @@ -1814,6 +1823,14 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
}
}

/* check for encryption errors */
error = dsl_dir_rename_crypt_check(dd, newparent);
if (error != 0) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(EACCES));
}

/* no rename into our descendant */
if (closest_common_ancestor(dd, newparent) == dd) {
dsl_dir_rele(newparent, FTAG);
Expand Down
19 changes: 16 additions & 3 deletions module/zfs/dsl_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,8 @@ dsl_pool_close(dsl_pool_t *dp)
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
uint64_t txg)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
Expand All @@ -373,6 +374,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
/* create and open the MOS (meta-objset) */
dp->dp_meta_objset = dmu_objset_create_impl(spa,
NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
spa->spa_meta_objset = dp->dp_meta_objset;

/* create the pool directory */
err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
Expand Down Expand Up @@ -410,8 +412,19 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
dsl_pool_create_origin(dp, tx);

/*
* Some features may be needed when creating the root dataset, so we
* create the feature objects here.
*/
if (spa_version(spa) >= SPA_VERSION_FEATURES)
spa_feature_create_zap_objects(spa, tx);

if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF &&
dcp->cp_crypt != ZIO_CRYPT_INHERIT)
spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx);

/* create the root dataset */
obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);

/* create the root objset */
VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
Expand Down Expand Up @@ -865,7 +878,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)

/* create the origin dir, ds, & snap-ds */
dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
NULL, 0, kcred, tx);
NULL, 0, kcred, NULL, tx);
VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
Expand Down
3 changes: 2 additions & 1 deletion module/zfs/dsl_prop.c
Original file line number Diff line number Diff line change
Expand Up @@ -963,7 +963,7 @@ typedef enum dsl_prop_getflags {
DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */
DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */
DSL_PROP_GET_LOCAL = 0x4, /* local properties */
DSL_PROP_GET_RECEIVED = 0x8 /* received properties */
DSL_PROP_GET_RECEIVED = 0x8, /* received properties */
} dsl_prop_getflags_t;

static int
Expand Down Expand Up @@ -1130,6 +1130,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
if (err)
break;
}

out:
if (err) {
nvlist_free(*nvp);
Expand Down
17 changes: 14 additions & 3 deletions module/zfs/dsl_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
zilog = zil_alloc(dp->dp_meta_objset, zh);

(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
claim_txg);
claim_txg, B_FALSE);

zil_free(zilog);
}
Expand All @@ -695,6 +695,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
{
zbookmark_phys_t czb;
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;

if (zfs_no_scrub_prefetch)
return;
Expand All @@ -703,11 +704,16 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
return;

if (BP_IS_PROTECTED(bp)) {
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
zio_flags |= ZIO_FLAG_RAW;
}

SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);

(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &czb);
}

static boolean_t
Expand Down Expand Up @@ -793,6 +799,11 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_buf_t *buf;

if (BP_IS_PROTECTED(bp)) {
ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
zio_flags |= ZIO_FLAG_RAW;
}

err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
Expand Down
Loading