From d3f96c69144aa7e76f1dd2f91bcca826ec9b239b Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 3 Oct 2017 16:45:31 -0700 Subject: [PATCH] Implement Redacted Send/Receive --- cmd/zdb/zdb.c | 280 +- cmd/zfs/zfs_main.c | 183 +- cmd/zstreamdump/zstreamdump.c | 16 + configure.ac | 3 + include/libzfs.h | 9 +- include/libzfs_core.h | 13 +- include/spl/sys/debug.h | 73 +- include/sys/Makefile.am | 3 + include/sys/bqueue.h | 6 +- include/sys/dbuf.h | 4 + include/sys/dmu.h | 3 + include/sys/dmu_impl.h | 41 +- include/sys/dmu_recv.h | 87 + include/sys/dmu_redact.h | 58 + include/sys/dmu_send.h | 65 +- include/sys/dmu_traverse.h | 16 +- include/sys/dsl_bookmark.h | 85 +- include/sys/dsl_dataset.h | 48 +- include/sys/dsl_destroy.h | 3 +- include/sys/fs/zfs.h | 23 +- include/sys/nvpair.h | 40 +- include/sys/objlist.h | 51 + include/sys/spa.h | 26 +- include/sys/zfs_context.h | 3 +- include/sys/zfs_ioctl.h | 14 +- include/sys/zfs_vfsops.h | 3 + include/zfeature_common.h | 17 +- lib/libzfs/libzfs_dataset.c | 55 +- lib/libzfs/libzfs_iter.c | 11 +- lib/libzfs/libzfs_mount.c | 20 +- lib/libzfs/libzfs_sendrecv.c | 897 ++- lib/libzfs/libzfs_util.c | 2 + lib/libzfs_core/libzfs_core.c | 128 +- lib/libzpool/Makefile.am | 3 + lib/libzpool/kernel.c | 1 + man/man5/zfs-module-parameters.5 | 78 +- man/man5/zpool-features.5 | 41 +- man/man8/zfs.8 | 140 +- module/nvpair/fnvpair.c | 81 +- module/zcommon/zfeature_common.c | 104 +- module/zcommon/zfs_namecheck.c | 1 + module/zcommon/zfs_prop.c | 22 +- module/zfs/Makefile.in | 3 + module/zfs/arc.c | 2 + module/zfs/bptree.c | 5 +- module/zfs/bqueue.c | 73 +- module/zfs/dbuf.c | 103 +- module/zfs/dmu.c | 16 +- module/zfs/dmu_diff.c | 5 +- module/zfs/dmu_object.c | 3 +- module/zfs/dmu_objset.c | 21 +- module/zfs/dmu_recv.c | 3011 +++++++++ module/zfs/dmu_redact.c | 1108 ++++ module/zfs/dmu_send.c | 5642 ++++++----------- module/zfs/dmu_traverse.c | 28 +- module/zfs/dnode_sync.c | 6 +- module/zfs/dsl_bookmark.c | 1223 +++- module/zfs/dsl_crypt.c 
| 8 +- module/zfs/dsl_dataset.c | 634 +- module/zfs/dsl_deadlist.c | 68 +- module/zfs/dsl_destroy.c | 184 +- module/zfs/dsl_pool.c | 3 +- module/zfs/dsl_scan.c | 14 +- module/zfs/objlist.c | 84 + module/zfs/refcount.c | 6 +- module/zfs/spa.c | 3 +- module/zfs/zfs_ioctl.c | 382 +- module/zfs/zfs_vfsops.c | 102 +- module/zfs/zio.c | 3 + tests/runfiles/linux.run | 9 + tests/zfs-tests/cmd/Makefile.am | 4 +- tests/zfs-tests/cmd/get_diff/Makefile.am | 6 + tests/zfs-tests/cmd/get_diff/get_diff.c | 109 + tests/zfs-tests/cmd/stride_dd/.gitignore | 1 + tests/zfs-tests/cmd/stride_dd/Makefile.am | 7 + tests/zfs-tests/cmd/stride_dd/stride_dd.c | 214 + tests/zfs-tests/include/commands.cfg | 5 +- tests/zfs-tests/include/libtest.shlib | 3 +- tests/zfs-tests/tests/functional/Makefile.am | 1 + .../zfs_clone/zfs_clone_rm_nested.ksh | 77 + .../cli_root/zfs_send/zfs_send-b.ksh | 1 - .../cli_root/zfs_send/zfs_send_006_pos.ksh | 4 +- .../cli_root/zpool_get/zpool_get.cfg | 3 + .../functional/redacted_send/Makefile.am | 25 + .../functional/redacted_send/cleanup.ksh | 33 + .../functional/redacted_send/redacted.cfg | 86 + .../functional/redacted_send/redacted.kshlib | 269 + .../redacted_send/redacted_compressed.ksh | 71 + .../redacted_send/redacted_contents.ksh | 162 + .../redacted_send/redacted_deleted.ksh | 102 + .../redacted_disabled_feature.ksh | 71 + .../redacted_send/redacted_embedded.ksh | 103 + .../redacted_send/redacted_holes.ksh | 120 + .../redacted_send/redacted_incrementals.ksh | 152 + .../redacted_send/redacted_largeblocks.ksh | 63 + .../redacted_send/redacted_many_clones.ksh | 68 + .../redacted_send/redacted_mixed_recsize.ksh | 77 + .../redacted_send/redacted_mounts.ksh | 109 + .../redacted_send/redacted_negative.ksh | 80 + .../redacted_send/redacted_origin.ksh | 87 + .../redacted_send/redacted_props.ksh | 77 + .../redacted_send/redacted_resume.ksh | 87 + .../redacted_send/redacted_size.ksh | 64 + .../redacted_send/redacted_volume.ksh | 105 + 
.../tests/functional/redacted_send/setup.ksh | 36 + .../tests/functional/rsend/rsend.kshlib | 4 +- .../tests/functional/rsend/rsend_016_neg.ksh | 33 + 107 files changed, 13568 insertions(+), 4457 deletions(-) create mode 100644 include/sys/dmu_recv.h create mode 100644 include/sys/dmu_redact.h create mode 100644 include/sys/objlist.h create mode 100644 module/zfs/dmu_recv.c create mode 100644 module/zfs/dmu_redact.c create mode 100644 module/zfs/objlist.c create mode 100644 tests/zfs-tests/cmd/get_diff/Makefile.am create mode 100644 tests/zfs-tests/cmd/get_diff/get_diff.c create mode 100644 tests/zfs-tests/cmd/stride_dd/.gitignore create mode 100644 tests/zfs-tests/cmd/stride_dd/Makefile.am create mode 100644 tests/zfs-tests/cmd/stride_dd/stride_dd.c create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh create mode 100644 tests/zfs-tests/tests/functional/redacted_send/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/redacted_send/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/redacted_send/redacted.cfg create mode 100644 tests/zfs-tests/tests/functional/redacted_send/redacted.kshlib create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_compressed.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_contents.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_disabled_feature.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_embedded.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_holes.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_incrementals.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_largeblocks.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh create mode 100755 
tests/zfs-tests/tests/functional/redacted_send/redacted_mixed_recsize.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_negative.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_origin.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_props.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_resume.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_size.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_volume.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 21113da2f03c..eeac20bf036d 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -50,11 +50,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -137,6 +139,7 @@ usage(void) "\t\t[ [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] \n" "\t\t[ ...]\n" + "\t%s [-v] \n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " @@ -148,7 +151,7 @@ usage(void) "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, - cmdname, cmdname); + cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); @@ -411,6 +414,43 @@ dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { + uint64_t *arr; + + if (dump_opt['d'] < 6) + return; + if (data == NULL) { + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(os, object, &doi)); + size = doi.doi_max_offset; + arr = kmem_alloc(size, KM_SLEEP); + + int err = 
dmu_read(os, object, 0, size, arr, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(arr, size); + return; + } + } else { + arr = data; + } + + if (size == 0) { + (void) printf("\t\t[]\n"); + return; + } + + (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); + for (size_t i = 1; i * sizeof (uint64_t) < size; i++) { + if (i % 4 != 0) + (void) printf(", %0llx", (u_longlong_t)arr[i]); + else + (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); + } + (void) printf("]\n"); + + if (data == NULL) + kmem_free(arr, size); } /*ARGSUSED*/ @@ -1488,6 +1528,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; + ASSERT(!BP_IS_REDACTED(bp)); err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); @@ -1772,6 +1813,126 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) } } +static int +dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, + boolean_t print_list) +{ + int err = 0; + zfs_bookmark_phys_t prop; + objset_t *mos = dp->dp_spa->spa_meta_objset; + err = dsl_bookmark_lookup(dp, name, NULL, &prop); + + if (err != 0) { + return (err); + } + + (void) printf("\t#%s: ", strchr(name, '#') + 1); + (void) printf("{guid: %llx creation_txg: %llu creation_time: " + "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, + (u_longlong_t)prop.zbm_creation_txg, + (u_longlong_t)prop.zbm_creation_time, + (u_longlong_t)prop.zbm_redaction_obj); + + IMPLY(print_list, print_redact); + if (!print_redact || prop.zbm_redaction_obj == 0) + return (0); + + redaction_list_t *rl; + VERIFY0(dsl_redaction_list_hold_obj(dp, + prop.zbm_redaction_obj, FTAG, &rl)); + + redaction_list_phys_t *rlp = rl->rl_phys; + (void) printf("\tRedacted:\n\t\tProgress: "); + if (rlp->rlp_last_object != UINT64_MAX || + rlp->rlp_last_blkid != UINT64_MAX) { + (void) printf("%llu %llu (incomplete)\n", + 
(u_longlong_t)rlp->rlp_last_object, + (u_longlong_t)rlp->rlp_last_blkid); + } else { + (void) printf("complete\n"); + } + (void) printf("\t\tSnapshots: ["); + for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { + if (i > 0) + (void) printf(", "); + (void) printf("%0llu", + (u_longlong_t)rlp->rlp_snaps[i]); + } + (void) printf("]\n\t\tLength: %llu\n", + (u_longlong_t)rlp->rlp_num_entries); + + if (!print_list) { + dsl_redaction_list_rele(rl, FTAG); + return (0); + } + + if (rlp->rlp_num_entries == 0) { + dsl_redaction_list_rele(rl, FTAG); + (void) printf("\t\tRedaction List: []\n\n"); + return (0); + } + + redact_block_phys_t *rbp_buf; + uint64_t size; + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); + size = doi.doi_max_offset; + rbp_buf = kmem_alloc(size, KM_SLEEP); + + err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, + rbp_buf, 0); + if (err != 0) { + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + return (err); + } + + (void) printf("\t\tRedaction List: [{object: %llx, offset: " + "%llx, blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[0].rbp_object, + (u_longlong_t)rbp_buf[0].rbp_blkid, + (uint_t)(redact_block_get_size(&rbp_buf[0])), + (u_longlong_t)redact_block_get_count(&rbp_buf[0])); + + for (size_t i = 1; i < rlp->rlp_num_entries; i++) { + (void) printf(",\n\t\t{object: %llx, offset: %llx, " + "blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[i].rbp_object, + (u_longlong_t)rbp_buf[i].rbp_blkid, + (uint_t)(redact_block_get_size(&rbp_buf[i])), + (u_longlong_t)redact_block_get_count(&rbp_buf[i])); + } + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + (void) printf("]\n\n"); + return (0); +} + +static void +dump_bookmarks(objset_t *os, const char *osname, int verbosity) +{ + zap_cursor_t zc; + zap_attribute_t attr; + dsl_dataset_t *ds = dmu_objset_ds(os); + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + objset_t *mos = os->os_spa->spa_meta_objset; + if (verbosity < 4) + return; + 
VERIFY0(dsl_pool_hold(osname, FTAG, &dp)); + + for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + char buf[ZFS_MAX_DATASET_NAME_LEN]; + VERIFY0(snprintf(buf, sizeof (buf), "%s#%s", osname, + attr.za_name)); + (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); + } + zap_cursor_fini(&zc); + dsl_pool_rele(dp, FTAG); +} + static void dump_deadlist(dsl_deadlist_t *dl) { @@ -1834,19 +1995,26 @@ static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int -open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) +open_objset(const char *path, void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY3P(sa_os, ==, NULL); - err = dmu_objset_own(path, type, B_TRUE, B_FALSE, tag, osp); + /* + * We can't own an objset if it's redacted. Therefore, we do this + * dance: hold the objset, then acquire a long hold on its dataset, then + * release the pool. 
+ */ + err = dmu_objset_hold(path, tag, osp); if (err != 0) { - (void) fprintf(stderr, "failed to own dataset '%s': %s\n", path, - strerror(err)); + (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", + path, strerror(err)); return (err); } + dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); + dsl_pool_rele(dmu_objset_pool(*osp), tag); if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, @@ -1860,7 +2028,8 @@ open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); - dmu_objset_disown(*osp, B_FALSE, tag); + dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); + dsl_dataset_rele(dmu_objset_ds(*osp), tag); *osp = NULL; } } @@ -1875,7 +2044,8 @@ close_objset(objset_t *os, void *tag) VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); - dmu_objset_disown(os, B_FALSE, tag); + dsl_dataset_long_rele(dmu_objset_ds(os), tag); + dsl_dataset_rele(dmu_objset_ds(os), tag); sa_attr_table = NULL; sa_os = NULL; } @@ -2153,8 +2323,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { }; static void -dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, - uint64_t *dnode_slots_used) +dump_object(objset_t *os, uint64_t object, int verbosity, + boolean_t *print_header, uint64_t *dnode_slots_used) { dmu_buf_t *db = NULL; dmu_object_info_t doi; @@ -2273,7 +2443,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, (void) printf("\t\t(object encrypted)\n"); } - *print_header = 1; + *print_header = B_TRUE; } if (verbosity >= 5) @@ -2334,7 +2504,7 @@ dump_dir(objset_t *os) char osname[ZFS_MAX_DATASET_NAME_LEN]; const char *type = "UNKNOWN"; int verbosity = dump_opt['d']; - int print_header = 1; + boolean_t print_header; unsigned i; int error; uint64_t total_slots_used = 0; @@ -2348,6 +2518,8 @@ dump_dir(objset_t *os) 
dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + print_header = B_TRUE; + if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; @@ -2381,9 +2553,10 @@ dump_dir(objset_t *os) numbuf, (u_longlong_t)usedobjs, blkbuf); if (zopt_objects != 0) { - for (i = 0; i < zopt_objects; i++) + for (i = 0; i < zopt_objects; i++) { dump_object(os, zopt_object[i], verbosity, &print_header, NULL); + } (void) printf("\n"); return; } @@ -2401,6 +2574,9 @@ dump_dir(objset_t *os) } } + if (dmu_objset_ds(os) != NULL) + dump_bookmarks(os, osname, verbosity); + if (verbosity < 2) return; @@ -2865,7 +3041,7 @@ static int dump_path_impl(objset_t *os, uint64_t obj, char *name) { int err; - int header = 1; + boolean_t header = B_TRUE; uint64_t child_obj; char *s; dmu_buf_t *db; @@ -2936,7 +3112,7 @@ dump_path(char *ds, char *path) objset_t *os; uint64_t root_obj; - err = open_objset(ds, DMU_OST_ZFS, FTAG, &os); + err = open_objset(ds, FTAG, &os); if (err != 0) return (err); @@ -2944,7 +3120,7 @@ dump_path(char *ds, char *path) if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); - dmu_objset_disown(os, B_FALSE, FTAG); + close_objset(os, FTAG); return (EINVAL); } @@ -3125,6 +3301,7 @@ dump_label(const char *dev) } static uint64_t dataset_feature_count[SPA_FEATURES]; +static uint64_t global_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; /*ARGSUSED*/ @@ -3135,12 +3312,12 @@ dump_one_dir(const char *dsname, void *arg) objset_t *os; spa_feature_t f; - error = open_objset(dsname, DMU_OST_ANY, FTAG, &os); + error = open_objset(dsname, FTAG, &os); if (error != 0) return (0); for (f = 0; f < SPA_FEATURES; f++) { - if (!dmu_objset_ds(os)->ds_feature_inuse[f]) + if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) continue; ASSERT(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); @@ -3151,6 +3328,15 @@ dump_one_dir(const char *dsname, void *arg) remap_deadlist_count++; } + for 
(dsl_bookmark_node_t *dbn = + avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; + dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { + if (dbn->dbn_phys.zbm_redaction_obj != 0) + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; + } + dump_dir(os); close_objset(os, FTAG); fuid_table_destroy(); @@ -3387,7 +3573,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type; boolean_t is_metadata; - if (bp == NULL) + if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { @@ -3402,7 +3588,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, blkbuf); } - if (BP_IS_HOLE(bp)) + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); type = BP_GET_TYPE(bp); @@ -4436,7 +4622,8 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { @@ -5100,27 +5287,43 @@ dump_zpool(spa_t *spa) } dump_dtl(spa->spa_root_vdev, 0); } + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) + global_feature_count[f] = UINT64_MAX; + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; + (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); for (f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; + uint64_t *arr; if (!(spa_feature_table[f].fi_flags & - ZFEATURE_FLAG_PER_DATASET) || - !spa_feature_is_enabled(spa, f)) { - ASSERT0(dataset_feature_count[f]); - continue; + ZFEATURE_FLAG_PER_DATASET)) { + if (global_feature_count[f] == UINT64_MAX) + continue; + if (!spa_feature_is_enabled(spa, f)) { + 
ASSERT0(global_feature_count[f]); + continue; + } + arr = global_feature_count; + } else { + if (!spa_feature_is_enabled(spa, f)) { + ASSERT0(dataset_feature_count[f]); + continue; + } + arr = dataset_feature_count; } if (feature_get_refcount(spa, &spa_feature_table[f], &refcount) == ENOTSUP) continue; - if (dataset_feature_count[f] != refcount) { + if (arr[f] != refcount) { (void) printf("%s feature refcount mismatch: " - "%lld datasets != %lld refcount\n", + "%lld consumers != %lld refcount\n", spa_feature_table[f].fi_uname, - (longlong_t)dataset_feature_count[f], - (longlong_t)refcount); + (longlong_t)arr[f], (longlong_t)refcount); rc = 2; } else { (void) printf("Verified %s feature refcount " @@ -5891,9 +6094,23 @@ main(int argc, char **argv) FTAG, policy, NULL); } } + } else if (strpbrk(target, "#") != NULL) { + dsl_pool_t *dp; + error = dsl_pool_hold(target, FTAG, &dp); + if (error != 0) { + fatal("can't dump '%s': %s", target, + strerror(error)); + } + error = dump_bookmark(dp, target, B_TRUE, verbose > 1); + dsl_pool_rele(dp, FTAG); + if (error != 0) { + fatal("can't dump '%s': %s", target, + strerror(error)); + } + return (error); } else { zdb_set_skip_mmp(target); - error = open_objset(target, DMU_OST_ANY, FTAG, &os); + error = open_objset(target, FTAG, &os); if (error == 0) spa = dmu_objset_spa(os); } @@ -5952,10 +6169,11 @@ main(int argc, char **argv) free(checkpoint_target); } - if (os != NULL) + if (os != NULL) { close_objset(os, FTAG); - else + } else { spa_close(spa, FTAG); + } fuid_table_destroy(); diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index dcfb6e80a48c..4f25e8244c1b 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -116,6 +117,7 @@ static int zfs_do_load_key(int argc, char **argv); static int zfs_do_unload_key(int argc, char **argv); static int zfs_do_change_key(int argc, char **argv); static int zfs_do_project(int argc, char **argv); +static int 
zfs_do_redact(int argc, char **argv); /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. @@ -170,6 +172,7 @@ typedef enum { HELP_LOAD_KEY, HELP_UNLOAD_KEY, HELP_CHANGE_KEY, + HELP_REDACT, } zfs_help_t; typedef struct zfs_command { @@ -232,6 +235,7 @@ static zfs_command_t command_table[] = { { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, + { "redact", zfs_do_redact, HELP_REDACT }, }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) @@ -273,7 +277,7 @@ get_usage(zfs_help_t idx) "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" - "\tmount [-lvO] [-o opts] <-a | filesystem>\n")); + "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: @@ -296,6 +300,9 @@ get_usage(zfs_help_t idx) "\n" "\tsend [-nvPLecw] [-i snapshot|bookmark] " "\n" + "[-i bookmark] \n" + "\tsend [-DnPpvLecr] [-i bookmark|snapshot] " + "--redact \n" "\tsend [-nvPe] -t \n")); case HELP_SET: return (gettext("\tset ... " @@ -378,6 +385,9 @@ get_usage(zfs_help_t idx) "\t [-o keylocation=] [-o pbkfd2iters=]\n" "\t \n" "\tchange-key -i [-l] \n")); + case HELP_REDACT: + return (gettext("\tredact " + " ...")); } abort(); @@ -535,6 +545,8 @@ usage(boolean_t requested) (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "written@"); (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "written#"); + (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); @@ -1493,6 +1505,13 @@ zfs_do_destroy(int argc, char **argv) return (-1); } + /* + * Unfortunately, zfs_bookmark() doesn't honor the + * casesensitivity setting. 
However, we can't simply + * remove this check, because lzc_destroy_bookmarks() + * ignores non-existent bookmarks, so this is necessary + * to get a proper error message. + */ if (!zfs_bookmark_exists(argv[0])) { (void) fprintf(stderr, gettext("bookmark '%s' " "does not exist.\n"), argv[0]); @@ -3557,6 +3576,73 @@ zfs_do_promote(int argc, char **argv) return (ret); } +static int +zfs_do_redact(int argc, char **argv) +{ + char *snap = NULL; + char *bookname = NULL; + char **rsnaps = NULL; + int numrsnaps = 0; + argv++; + argc--; + if (argc < 3) { + (void) fprintf(stderr, gettext("too few arguments")); + usage(B_FALSE); + } + + snap = argv[0]; + bookname = argv[1]; + rsnaps = argv + 2; + numrsnaps = argc - 2; + + nvlist_t *rsnapnv = fnvlist_alloc(); + + for (int i = 0; i < numrsnaps; i++) { + fnvlist_add_boolean(rsnapnv, rsnaps[i]); + } + + int err = lzc_redact(snap, bookname, rsnapnv); + fnvlist_free(rsnapnv); + + switch (err) { + case 0: + break; + case ENOENT: + (void) fprintf(stderr, + gettext("provided snapshot %s does not exist"), snap); + break; + case EEXIST: + (void) fprintf(stderr, gettext("specified redaction bookmark " + "(%s) provided already exists"), bookname); + break; + case ENAMETOOLONG: + (void) fprintf(stderr, gettext("provided bookmark name cannot " + "be used, final name would be too long")); + break; + case E2BIG: + (void) fprintf(stderr, gettext("too many redaction snapshots " + "specified")); + break; + case EINVAL: + (void) fprintf(stderr, gettext("redaction snapshot must be " + "descendent of snapshot being redacted")); + break; + case EALREADY: + (void) fprintf(stderr, gettext("attempted to redact redacted " + "dataset or with respect to redacted dataset")); + break; + case ENOTSUP: + (void) fprintf(stderr, gettext("redaction bookmarks feature " + "not enabled")); + break; + default: + (void) fprintf(stderr, gettext("internal error: %s"), + strerror(errno)); + } + + return (err); +} + /* * zfs rollback [-rRf] * @@ -3941,6 +4027,9 @@ 
zfs_do_snapshot(int argc, char **argv) return (-1); } + +#define REDACT_OPT 1024 + /* * Send a backup stream to stdout. */ @@ -3955,10 +4044,11 @@ zfs_do_send(int argc, char **argv) sendflags_t flags = { 0 }; int c, err; nvlist_t *dbgnv = NULL; - boolean_t extraverbose = B_FALSE; + char *redactbook = NULL; struct option long_options[] = { {"replicate", no_argument, NULL, 'R'}, + {"redact-bookmark", required_argument, NULL, REDACT_OPT}, {"props", no_argument, NULL, 'p'}, {"parsable", no_argument, NULL, 'P'}, {"dedup", no_argument, NULL, 'D'}, @@ -3991,6 +4081,9 @@ zfs_do_send(int argc, char **argv) case 'R': flags.replicate = B_TRUE; break; + case REDACT_OPT: + redactbook = optarg; + break; case 'p': flags.props = B_TRUE; break; @@ -3999,12 +4092,9 @@ zfs_do_send(int argc, char **argv) break; case 'P': flags.parsable = B_TRUE; - flags.verbose = B_TRUE; break; case 'v': - if (flags.verbose) - extraverbose = B_TRUE; - flags.verbose = B_TRUE; + flags.verbosity++; flags.progress = B_TRUE; break; case 'D': @@ -4072,19 +4162,21 @@ zfs_do_send(int argc, char **argv) } } + if (flags.parsable && flags.verbosity == 0) + flags.verbosity = 1; + argc -= optind; argv += optind; if (resume_token != NULL) { if (fromname != NULL || flags.replicate || flags.props || - flags.backup || flags.dedup) { + flags.backup || flags.dedup || redactbook != NULL) { (void) fprintf(stderr, gettext("invalid flags combined with -t\n")); usage(B_FALSE); } - if (argc != 0) { - (void) fprintf(stderr, gettext("no additional " - "arguments are permitted with -t\n")); + if (argc > 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { @@ -4112,43 +4204,70 @@ zfs_do_send(int argc, char **argv) } /* - * Special case sending a filesystem, or from a bookmark. + * For everything except -R and -I, use the new, cleaner code path. 
*/ - if (strchr(argv[0], '@') == NULL || - (fromname && strchr(fromname, '#') != NULL)) { + if (!(flags.replicate || flags.doall)) { char frombuf[ZFS_MAX_DATASET_NAME_LEN]; - if (flags.replicate || flags.doall || flags.props || - flags.backup || flags.dedup || - (strchr(argv[0], '@') == NULL && - (flags.dryrun || flags.verbose || flags.progress))) { - (void) fprintf(stderr, gettext("Error: " - "Unsupported flag with filesystem or bookmark.\n")); - return (1); + if (redactbook != NULL) { + if (strchr(argv[0], '@') == NULL) { + (void) fprintf(stderr, gettext("Error: Cannot " + "do a redacted send to a filesystem.\n")); + return (1); + } } zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); + if (fromname != NULL && (strchr(fromname, '#') == NULL && + strchr(fromname, '@') == NULL)) { + /* + * Neither bookmark or snapshot was specified. Print a + * warning, and assume snapshot. + */ + (void) fprintf(stderr, "Warning: incremental source " + "didn't specify type, assuming snapshot. Use '@' " + "or '#' prefix to avoid ambiguity.\n"); + (void) snprintf(frombuf, sizeof (frombuf), "@%s", + fromname); + fromname = frombuf; + } if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { /* * Incremental source name begins with # or @. * Default to same fs as target. 
*/ + char tmpbuf[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf)); (void) strlcpy(frombuf, argv[0], sizeof (frombuf)); cp = strchr(frombuf, '@'); if (cp != NULL) *cp = '\0'; - (void) strlcat(frombuf, fromname, sizeof (frombuf)); + (void) strlcat(frombuf, tmpbuf, sizeof (frombuf)); fromname = frombuf; } - err = zfs_send_one(zhp, fromname, STDOUT_FILENO, flags); + err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags, + redactbook); zfs_close(zhp); return (err != 0); } + if (fromname != NULL && strchr(fromname, '#')) { + (void) fprintf(stderr, + gettext("Error: multiple snapshots cannot be " + "sent from a bookmark.\n")); + return (1); + } + + if (redactbook != NULL) { + (void) fprintf(stderr, gettext("Error: multiple snapshots " + "cannot be sent redacted.\n")); + return (1); + } + cp = strchr(argv[0], '@'); *cp = '\0'; toname = cp + 1; @@ -4192,9 +4311,9 @@ zfs_do_send(int argc, char **argv) flags.doall = B_TRUE; err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0, - extraverbose ? &dbgnv : NULL); + flags.verbosity >= 3 ? &dbgnv : NULL); - if (extraverbose && dbgnv != NULL) { + if (flags.verbosity >= 3 && dbgnv != NULL) { /* * dump_nvlist prints to stdout, but that's been * redirected to a file. Make it print to stderr @@ -6278,6 +6397,17 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, return (1); } + if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "Dataset is not complete, was created by receiving " + "a redacted zfs send stream.\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } + /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the @@ -6413,7 +6543,7 @@ share_mount(int op, int argc, char **argv) int flags = 0; /* check options */ - while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":alvo:O" : "al")) + while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al")) != -1) { switch (c) { case 'a': @@ -6441,6 +6571,9 @@ share_mount(int op, int argc, char **argv) case 'O': flags |= MS_OVERLAY; break; + case 'f': + flags |= MS_FORCE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c index a2edefa92f37..91ba297f45bb 100644 --- a/cmd/zstreamdump/zstreamdump.c +++ b/cmd/zstreamdump/zstreamdump.c @@ -236,6 +236,7 @@ main(int argc, char *argv[]) struct drr_spill *drrs = &thedrr.drr_u.drr_spill; struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded; struct drr_object_range *drror = &thedrr.drr_u.drr_object_range; + struct drr_redact *drrr = &thedrr.drr_u.drr_redact; struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum; char c; boolean_t verbose = B_FALSE; @@ -707,6 +708,21 @@ main(int argc, char *argv[]) mac); } break; + case DRR_REDACT: + if (do_byteswap) { + drrr->drr_object = BSWAP_64(drrr->drr_object); + drrr->drr_offset = BSWAP_64(drrr->drr_offset); + drrr->drr_length = BSWAP_64(drrr->drr_length); + drrr->drr_toguid = BSWAP_64(drrr->drr_toguid); + } + if (verbose) { + (void) printf("REDACT object = %llu offset = " + "%llu length = %llu\n", + (u_longlong_t)drrr->drr_object, + (u_longlong_t)drrr->drr_offset, + (u_longlong_t)drrr->drr_length); + } + break; case DRR_NUMTYPES: /* should never be reached */ exit(1); diff --git a/configure.ac b/configure.ac index 18d91b359911..51b69f8bcce7 100644 --- a/configure.ac +++ b/configure.ac @@ -168,6 +168,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/cmd/file_check/Makefile tests/zfs-tests/cmd/file_trunc/Makefile tests/zfs-tests/cmd/file_write/Makefile + tests/zfs-tests/cmd/get_diff/Makefile tests/zfs-tests/cmd/largest_file/Makefile tests/zfs-tests/cmd/libzfs_input_check/Makefile tests/zfs-tests/cmd/mkbusy/Makefile @@ -185,6 +186,7 @@ AC_CONFIG_FILES([ 
tests/zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile tests/zfs-tests/cmd/threadsappend/Makefile tests/zfs-tests/cmd/xattrtest/Makefile + tests/zfs-tests/cmd/stride_dd/Makefile tests/zfs-tests/include/Makefile tests/zfs-tests/tests/Makefile tests/zfs-tests/tests/functional/Makefile @@ -305,6 +307,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/pyzfs/Makefile tests/zfs-tests/tests/functional/quota/Makefile tests/zfs-tests/tests/functional/raidz/Makefile + tests/zfs-tests/tests/functional/redacted_send/Makefile tests/zfs-tests/tests/functional/redundancy/Makefile tests/zfs-tests/tests/functional/refquota/Makefile tests/zfs-tests/tests/functional/refreserv/Makefile diff --git a/include/libzfs.h b/include/libzfs.h index 08142786bd0e..51742c46de1c 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -158,6 +158,7 @@ typedef enum zfs_error { EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ EZFS_IOC_NOTSUPPORTED, /* operation not supported by zfs module */ + EZFS_TOOMANY, /* argument list too long */ EZFS_UNKNOWN } zfs_error_t; @@ -650,8 +651,8 @@ extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); extern int zfs_rename(zfs_handle_t *, const char *, boolean_t, boolean_t); typedef struct sendflags { - /* print informational messages (ie, -v was specified) */ - boolean_t verbose; + /* Amount of extra information to print. 
*/ + int verbosity; /* recursive send (ie, -R) */ boolean_t replicate; @@ -697,7 +698,9 @@ typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); extern int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); -extern int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t flags); +extern int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t *, + const char *); +extern int zfs_send_progress(zfs_handle_t *, int, uint64_t *, uint64_t *); extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, const char *); extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, diff --git a/include/libzfs_core.h b/include/libzfs_core.h index c22cbf18e2cc..9afd167e6623 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
@@ -58,10 +58,12 @@ int lzc_promote(const char *, char *, int); int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **); int lzc_bookmark(nvlist_t *, nvlist_t **); int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **); +int lzc_get_bookmark_props(const char *, nvlist_t **); int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); int lzc_load_key(const char *, boolean_t, uint8_t *, uint_t); int lzc_unload_key(const char *); int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t); +int lzc_redact(const char *, const char *, nvlist_t *); int lzc_snaprange_space(const char *, const char *, uint64_t *); @@ -83,6 +85,10 @@ int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); struct dmu_replay_record; +int lzc_send_redacted(const char *, const char *, int, enum lzc_send_flags, + const char *); +int lzc_send_resume_redacted(const char *, const char *, int, + enum lzc_send_flags, uint64_t, uint64_t, const char *); int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, boolean_t, int); int lzc_receive_resumable(const char *, nvlist_t *, const char *, boolean_t, @@ -96,6 +102,11 @@ int lzc_receive_with_cmdprops(const char *, nvlist_t *, nvlist_t *, uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); +int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); +int lzc_send_space_resume_redacted(const char *, const char *, + enum lzc_send_flags, uint64_t, uint64_t, uint64_t, const char *, + int, uint64_t *); +uint64_t lzc_send_progress(int); boolean_t lzc_exists(const char *); diff --git a/include/spl/sys/debug.h b/include/spl/sys/debug.h index 692d6c61ecbe..f3fb51c3657a 100644 --- a/include/spl/sys/debug.h +++ b/include/spl/sys/debug.h @@ -63,22 +63,59 @@ void spl_dumpstack(void); spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "%s", "VERIFY(" #cond ") failed\n")) -#define 
VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE, FMT, CAST) do { \ - TYPE _verify3_left = (TYPE)(LEFT); \ - TYPE _verify3_right = (TYPE)(RIGHT); \ +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + boolean_t _verify3_left = (boolean_t)(LEFT); \ + boolean_t _verify3_right = (boolean_t)(RIGHT); \ if (!(_verify3_left OP _verify3_right)) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (" FMT " " #OP " " FMT ")\n", \ - CAST (_verify3_left), CAST (_verify3_right)); \ + "VERIFY3(" #LEFT #OP #RIGHT ") " \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ } while (0) -#define VERIFY3B(x,y,z) VERIFY3_IMPL(x, y, z, boolean_t, "%d", (boolean_t)) -#define VERIFY3S(x,y,z) VERIFY3_IMPL(x, y, z, int64_t, "%lld", (long long)) -#define VERIFY3U(x,y,z) VERIFY3_IMPL(x, y, z, uint64_t, "%llu", \ - (unsigned long long)) -#define VERIFY3P(x,y,z) VERIFY3_IMPL(x, y, z, uintptr_t, "%p", (void *)) -#define VERIFY0(x) VERIFY3_IMPL(0, ==, x, int64_t, "%lld", (long long)) +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + int64_t _verify3_left = (int64_t)(LEFT); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT #OP #RIGHT ") " \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + uint64_t _verify3_left = (uint64_t)(LEFT); \ + uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT #OP #RIGHT ") " \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (!(_verify3_left OP 
_verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT #OP #RIGHT ") " \ + "failed (%p " #OP " %p)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ + } while (0) + +#define VERIFY0(RIGHT) do { \ + int64_t _verify3_left = (int64_t)(0); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left == _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ + } while (0) #define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__) #define CTASSERT(x) { _CTASSERT(x, __LINE__); } @@ -107,13 +144,13 @@ void spl_dumpstack(void); */ #else -#define ASSERT(cond) VERIFY(cond) +#define ASSERT3B VERIFY3B +#define ASSERT3S VERIFY3S +#define ASSERT3U VERIFY3U +#define ASSERT3P VERIFY3P +#define ASSERT0 VERIFY0 +#define ASSERT VERIFY #define ASSERTV(x) x -#define ASSERT3B(x,y,z) VERIFY3B(x, y, z) -#define ASSERT3S(x,y,z) VERIFY3S(x, y, z) -#define ASSERT3U(x,y,z) VERIFY3U(x, y, z) -#define ASSERT3P(x,y,z) VERIFY3P(x, y, z) -#define ASSERT0(x) VERIFY0(x) #define IMPLY(A, B) \ ((void)(((!(A)) || (B)) || \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 644ddaab3f33..f852fb0e6c36 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -20,6 +20,8 @@ COMMON_H = \ $(top_srcdir)/include/sys/dmu.h \ $(top_srcdir)/include/sys/dmu_impl.h \ $(top_srcdir)/include/sys/dmu_objset.h \ + $(top_srcdir)/include/sys/dmu_recv.h \ + $(top_srcdir)/include/sys/dmu_redact.h \ $(top_srcdir)/include/sys/dmu_send.h \ $(top_srcdir)/include/sys/dmu_traverse.h \ $(top_srcdir)/include/sys/dmu_tx.h \ @@ -49,6 +51,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/note.h \ $(top_srcdir)/include/sys/nvpair.h \ $(top_srcdir)/include/sys/nvpair_impl.h \ + $(top_srcdir)/include/sys/objlist.h \ $(top_srcdir)/include/sys/pathname.h \ $(top_srcdir)/include/sys/policy.h \ 
$(top_srcdir)/include/sys/range_tree.h \ diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h index 63722df1bbf3..797aecd791a3 100644 --- a/include/sys/bqueue.h +++ b/include/sys/bqueue.h @@ -13,7 +13,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2018 by Delphix. All rights reserved. */ #ifndef _BQUEUE_H @@ -32,6 +32,7 @@ typedef struct bqueue { kcondvar_t bq_pop_cv; uint64_t bq_size; uint64_t bq_maxsize; + uint64_t bq_fill_fraction; size_t bq_node_offset; } bqueue_t; @@ -41,9 +42,10 @@ typedef struct bqueue_node { } bqueue_node_t; -int bqueue_init(bqueue_t *, uint64_t, size_t); +int bqueue_init(bqueue_t *, uint64_t, uint64_t, size_t); void bqueue_destroy(bqueue_t *); void bqueue_enqueue(bqueue_t *, void *, uint64_t); +void bqueue_enqueue_flush(bqueue_t *, void *, uint64_t); void *bqueue_dequeue(bqueue_t *); boolean_t bqueue_empty(bqueue_t *); diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index ab0950c83c20..89369f8441d1 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -329,6 +329,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); +void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx); void dbuf_destroy(dmu_buf_impl_t *db); void dbuf_unoverride(dbuf_dirty_record_t *dr); @@ -345,6 +346,9 @@ void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); void dbuf_stats_init(dbuf_hash_table_t *hash); void dbuf_stats_destroy(void); +int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, + blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift); + #define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) #define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) #define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index bc7046fdced8..1e240bf482cf 100644 --- a/include/sys/dmu.h 
+++ b/include/sys/dmu.h @@ -497,6 +497,8 @@ int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); +void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. @@ -943,6 +945,7 @@ typedef struct dmu_objset_stats { dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; + uint8_t dds_redacted; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; } dmu_objset_stats_t; diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 03a63077f101..8d0b960840c8 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -24,7 +24,7 @@ */ /* * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -243,38 +243,13 @@ typedef struct dmu_xuio { iovec_t *iovp; } dmu_xuio_t; -/* - * The list of data whose inclusion in a send stream can be pending from - * one call to backup_cb to another. Multiple calls to dump_free() and - * dump_freeobjects() can be aggregated into a single DRR_FREE or - * DRR_FREEOBJECTS replay record. 
- */ -typedef enum { - PENDING_NONE, - PENDING_FREE, - PENDING_FREEOBJECTS -} dmu_pendop_t; - -typedef struct dmu_sendarg { - list_node_t dsa_link; - dmu_replay_record_t *dsa_drr; - vnode_t *dsa_vp; - int dsa_outfd; - proc_t *dsa_proc; - offset_t *dsa_off; - objset_t *dsa_os; - zio_cksum_t dsa_zc; - uint64_t dsa_toguid; - int dsa_err; - dmu_pendop_t dsa_pending_op; - uint64_t dsa_featureflags; - uint64_t dsa_last_data_object; - uint64_t dsa_last_data_offset; - uint64_t dsa_resume_object; - uint64_t dsa_resume_offset; - boolean_t dsa_sent_begin; - boolean_t dsa_sent_end; -} dmu_sendarg_t; +typedef struct dmu_sendstatus { + list_node_t dss_link; + int dss_outfd; + proc_t *dss_proc; + offset_t *dss_off; + uint64_t dss_blocks; /* blocks visited during the sending process */ +} dmu_sendstatus_t; void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h new file mode 100644 index 000000000000..5f8b893e55f9 --- /dev/null +++ b/include/sys/dmu_recv.h @@ -0,0 +1,87 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ + +#ifndef _DMU_RECV_H +#define _DMU_RECV_H + +#include +#include +#include +#include +#include + +extern const char *recv_clone_name; + +typedef struct dmu_recv_cookie { + struct dsl_dataset *drc_ds; + struct dmu_replay_record *drc_drr_begin; + struct drr_begin *drc_drrb; + const char *drc_tofs; + const char *drc_tosnap; + boolean_t drc_newfs; + boolean_t drc_byteswap; + boolean_t drc_raw; + uint64_t drc_featureflags; + boolean_t drc_force; + boolean_t drc_resumable; + boolean_t drc_clone; + struct avl_tree *drc_guid_to_ds_map; + nvlist_t *drc_keynvl; + uint64_t drc_newsnapobj; + void *drc_owner; + cred_t *drc_cred; + nvlist_t *drc_begin_nvl; + + objset_t *drc_os; + vnode_t *drc_vp; /* The vnode to read the stream from */ + uint64_t drc_voff; /* The current offset in the stream */ + uint64_t drc_bytes_read; + /* + * A record that has had its payload read in, but hasn't yet been handed + * off to the worker thread. + */ + struct receive_record_arg *drc_rrd; + /* A record that has had its header read in, but not its payload. */ + struct receive_record_arg *drc_next_rrd; + zio_cksum_t drc_cksum; + zio_cksum_t drc_prev_cksum; + int drc_err; + /* Sorted list of objects not to issue prefetches for. 
*/ + objlist_t *drc_ignore_objlist; +} dmu_recv_cookie_t; + +int dmu_recv_begin(char *tofs, char *tosnap, + struct dmu_replay_record *drr_begin, boolean_t force, boolean_t resumable, + nvlist_t *localprops, nvlist_t *hidden_args, char *origin, + dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp); +int dmu_recv_stream(dmu_recv_cookie_t *drc, int cleanup_fd, + uint64_t *action_handlep, offset_t *voffp); +int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); +boolean_t dmu_objset_is_receiving(objset_t *os); + +#endif /* _DMU_RECV_H */ diff --git a/include/sys/dmu_redact.h b/include/sys/dmu_redact.h new file mode 100644 index 000000000000..207fdbb5cfda --- /dev/null +++ b/include/sys/dmu_redact.h @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. 
+ */ +#ifndef _DMU_REDACT_H_ +#define _DMU_REDACT_H_ + +#include +#include + +#define REDACT_BLOCK_MAX_COUNT (1ULL << 48) + +static inline uint64_t +redact_block_get_size(redact_block_phys_t *rbp) +{ + return (BF64_GET_SB((rbp)->rbp_size_count, 48, 16, SPA_MINBLOCKSHIFT, + 0)); +} + +static inline void +redact_block_set_size(redact_block_phys_t *rbp, uint64_t size) +{ + BF64_SET_SB((rbp)->rbp_size_count, 48, 16, SPA_MINBLOCKSHIFT, 0, size); +} + +static inline uint64_t +redact_block_get_count(redact_block_phys_t *rbp) +{ + return (BF64_GET_SB((rbp)->rbp_size_count, 0, 48, 0, 1)); +} + +static inline void +redact_block_set_count(redact_block_phys_t *rbp, uint64_t count) +{ + BF64_SET_SB((rbp)->rbp_size_count, 0, 48, 0, 1, count); +} + +int dmu_redact_snap(const char *, nvlist_t *, const char *); +#endif /* _DMU_REDACT_H_ */ diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h index 396710470f5f..9cbb7afa9e58 100644 --- a/include/sys/dmu_send.h +++ b/include/sys/dmu_send.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
*/ @@ -31,54 +31,41 @@ #include #include +#include #include +#include +#include +#include + +#define BEGINNV_REDACT_SNAPS "redact_snaps" +#define BEGINNV_REDACT_FROM_SNAPS "redact_from_snaps" +#define BEGINNV_RESUME_OBJECT "resume_object" +#define BEGINNV_RESUME_OFFSET "resume_offset" struct vnode; struct dsl_dataset; struct drr_begin; struct avl_tree; struct dmu_replay_record; +struct dmu_send_outparams; -extern const char *recv_clone_name; - -int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, struct vnode *vp, offset_t *off); -int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, - boolean_t stream_compressed, uint64_t *sizep); -int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg, - boolean_t stream_compressed, uint64_t *sizep); +int +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, + uint64_t resumeobj, uint64_t resumeoff, const char *redactbook, int outfd, + offset_t *off, struct dmu_send_outparams *dsop); +int dmu_send_estimate_fast(struct dsl_dataset *ds, struct dsl_dataset *fromds, + zfs_bookmark_phys_t *frombook, boolean_t stream_compressed, + uint64_t *sizep); int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - boolean_t rawok, int outfd, struct vnode *vp, offset_t *off); - -typedef struct dmu_recv_cookie { - struct dsl_dataset *drc_ds; - struct dmu_replay_record *drc_drr_begin; - struct drr_begin *drc_drrb; - const char *drc_tofs; - const char *drc_tosnap; - boolean_t drc_newfs; - boolean_t drc_byteswap; - boolean_t drc_force; - boolean_t drc_resumable; - boolean_t drc_raw; - boolean_t drc_clone; - struct avl_tree *drc_guid_to_ds_map; - nvlist_t *drc_keynvl; - zio_cksum_t drc_cksum; - uint64_t 
drc_newsnapobj; - void *drc_owner; - cred_t *drc_cred; -} dmu_recv_cookie_t; + boolean_t rawok, int outfd, offset_t *off, struct dmu_send_outparams *dso); -int dmu_recv_begin(char *tofs, char *tosnap, - struct dmu_replay_record *drr_begin, boolean_t force, boolean_t resumable, - nvlist_t *localprops, nvlist_t *hidden_args, char *origin, - dmu_recv_cookie_t *drc); -int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, - int cleanup_fd, uint64_t *action_handlep); -int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); -boolean_t dmu_objset_is_receiving(objset_t *os); +typedef int (*dmu_send_outfunc_t)(objset_t *os, void *buf, int len, void *arg); +typedef struct dmu_send_outparams { + dmu_send_outfunc_t dso_outfunc; + void *dso_arg; + boolean_t dso_dryrun; +} dmu_send_outparams_t; #endif /* _DMU_SEND_H */ diff --git a/include/sys/dmu_traverse.h b/include/sys/dmu_traverse.h index 8ceef5cf13e1..d76bfe3c9af3 100644 --- a/include/sys/dmu_traverse.h +++ b/include/sys/dmu_traverse.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_TRAVERSE_H @@ -71,6 +71,20 @@ int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +/* + * Note that this calculation cannot overflow with the current maximum indirect + * block size (128k). If that maximum is increased to 1M, however, this + * calculation can overflow, and handling would need to be added to ensure + * continued correctness. 
+ */ +static inline uint64_t +bp_span_in_blocks(uint8_t indblkshift, uint64_t level) +{ + unsigned int shift = level * (indblkshift - SPA_BLKPTRSHIFT); + ASSERT3U(shift, <, 64); + return (1ULL << shift); +} + #ifdef __cplusplus } #endif diff --git a/include/sys/dsl_bookmark.h b/include/sys/dsl_bookmark.h index 3591986d7bd7..807ca0a54698 100644 --- a/include/sys/dsl_bookmark.h +++ b/include/sys/dsl_bookmark.h @@ -13,21 +13,21 @@ * CDDL HEADER END */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_BOOKMARK_H #define _SYS_DSL_BOOKMARK_H #include +#include +#include +#include #ifdef __cplusplus extern "C" { #endif -struct dsl_pool; -struct dsl_dataset; - /* * On disk zap object. */ @@ -35,14 +35,91 @@ typedef struct zfs_bookmark_phys { uint64_t zbm_guid; /* guid of bookmarked dataset */ uint64_t zbm_creation_txg; /* birth transaction group */ uint64_t zbm_creation_time; /* bookmark creation time */ + uint64_t zbm_redaction_obj; /* redaction list object */ + uint64_t zbm_flags; /* ZBM_FLAG_* */ + uint64_t zbm_referenced_bytes_refd; + uint64_t zbm_compressed_bytes_refd; + uint64_t zbm_uncompressed_bytes_refd; + uint64_t zbm_referenced_freed_before_next_snap; + uint64_t zbm_compressed_freed_before_next_snap; + uint64_t zbm_uncompressed_freed_before_next_snap; } zfs_bookmark_phys_t; +typedef enum zbm_flags { + ZBM_FLAG_HAS_FBN = (1 << 0), + ZBM_FLAG_SNAPSHOT_EXISTS = (1 << 1), +} zbm_flags_t; + +typedef struct redaction_list_phys { + uint64_t rlp_last_object; + uint64_t rlp_last_blkid; + uint64_t rlp_num_entries; + uint64_t rlp_num_snaps; + uint64_t rlp_snaps[]; /* variable length */ +} redaction_list_phys_t; + +typedef struct redaction_list { + dmu_buf_user_t rl_dbu; + redaction_list_phys_t *rl_phys; + dmu_buf_t *rl_dbuf; + uint64_t rl_object; + refcount_t rl_longholds; + objset_t *rl_mos; +} redaction_list_t; + +/* node in ds_bookmarks */ +typedef struct dsl_bookmark_node { + 
char *dbn_name; /* free with strfree() */ + kmutex_t dbn_lock; /* protects dirty/phys in block_killed */ + boolean_t dbn_dirty; /* in currently syncing txg */ + zfs_bookmark_phys_t dbn_phys; + avl_node_t dbn_node; +} dsl_bookmark_node_t; + +typedef struct redact_block_phys { + uint64_t rbp_object; + uint64_t rbp_blkid; + /* + * The top 16 bits of this field represent the block size in sectors of + * the blocks in question; the bottom 48 bits are used to store the + * number of consecutive blocks that are in the redaction list. They + * should be accessed using the inline functions below. + */ + uint64_t rbp_size_count; + uint64_t rbp_padding; +} redact_block_phys_t; + +typedef int (*rl_traverse_callback_t)(redact_block_phys_t *, void *); + int dsl_bookmark_create(nvlist_t *, nvlist_t *); +int dsl_bookmark_create_redacted(const char *, const char *, uint64_t, + uint64_t *, void *, redaction_list_t **); int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *); int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *); +int dsl_get_bookmark_props(const char *, const char *, nvlist_t *); int dsl_bookmark_destroy(nvlist_t *, nvlist_t *); int dsl_bookmark_lookup(struct dsl_pool *, const char *, struct dsl_dataset *, zfs_bookmark_phys_t *); +int dsl_bookmark_lookup_impl(dsl_dataset_t *, const char *, + zfs_bookmark_phys_t *); +int dsl_redaction_list_hold_obj(struct dsl_pool *, uint64_t, void *, + redaction_list_t **); +void dsl_redaction_list_rele(redaction_list_t *, void *); +void dsl_redaction_list_long_hold(struct dsl_pool *, redaction_list_t *, + void *); +void dsl_redaction_list_long_rele(redaction_list_t *, void *); +boolean_t dsl_redaction_list_long_held(redaction_list_t *); +int dsl_bookmark_init_ds(dsl_dataset_t *); +void dsl_bookmark_fini_ds(dsl_dataset_t *); +boolean_t dsl_bookmark_ds_destroyed(dsl_dataset_t *, dmu_tx_t *); +void dsl_bookmark_snapshotted(dsl_dataset_t *, dmu_tx_t *); +void dsl_bookmark_block_killed(dsl_dataset_t *, const blkptr_t 
*, dmu_tx_t *); +void dsl_bookmark_sync_done(dsl_dataset_t *, dmu_tx_t *); +void dsl_bookmark_node_add(dsl_dataset_t *, dsl_bookmark_node_t *, dmu_tx_t *); +uint64_t dsl_bookmark_latest_txg(dsl_dataset_t *); +int dsl_redaction_list_traverse(redaction_list_t *, zbookmark_phys_t *, + rl_traverse_callback_t, void *); +void dsl_bookmark_next_changed(dsl_dataset_t *, dsl_dataset_t *, dmu_tx_t *); #ifdef __cplusplus } diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index dbe4cb706a1f..da23f37da1d2 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -45,10 +45,12 @@ extern "C" { #endif +extern int zfs_allow_redacted_dataset_mount; struct dsl_dataset; struct dsl_dir; struct dsl_pool; struct dsl_crypto_params; +struct zfs_bookmark_phys; #define DS_FLAG_INCONSISTENT (1ULL<<0) #define DS_IS_INCONSISTENT(ds) \ @@ -113,6 +115,13 @@ struct dsl_crypto_params; */ #define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist" +/* + * We were receiving an incremental from a redaction bookmark, and these are the + * guids of its snapshots. + */ +#define DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS \ + "com.delphix:resume_redact_book_snaps" + /* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. 
@@ -168,7 +177,8 @@ typedef struct dsl_dataset { /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; - uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */ + uint64_t ds_bookmarks_obj; /* DMU_OTN_ZAP_METADATA */ + avl_tree_t ds_bookmarks; /* dsl_bookmark_node_t */ /* has internal locking: */ dsl_deadlist_t ds_deadlist; @@ -240,13 +250,13 @@ typedef struct dsl_dataset { * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset * uses this feature. */ - uint8_t ds_feature_inuse[SPA_FEATURES]; + void *ds_feature[SPA_FEATURES]; /* * Set if we need to activate the feature on this dataset this txg * (used only in syncing context). */ - uint8_t ds_feature_activation_needed[SPA_FEATURES]; + void *ds_feature_activation[SPA_FEATURES]; /* Protected by ds_lock; keep at end of struct for better locality */ char ds_snapname[ZFS_MAX_DATASET_NAME_LEN]; @@ -305,22 +315,26 @@ int dsl_dataset_hold_flags(struct dsl_pool *dp, const char *name, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds, void *tag); -int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **); int dsl_dataset_hold_obj_flags(struct dsl_pool *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, dsl_dataset_t **); -void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **); void dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag); +void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); int dsl_dataset_own(struct dsl_pool *dp, const char *name, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_own_force(struct dsl_pool *dp, const char *name, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +int 
dsl_dataset_own_obj_force(struct dsl_pool *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); void dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag); void dsl_dataset_name(dsl_dataset_t *ds, char *name); +boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override); int dsl_dataset_namelen(dsl_dataset_t *ds); boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); -boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, struct dsl_crypto_params *, dmu_tx_t *); @@ -377,9 +391,11 @@ uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds); uint64_t dsl_get_referenced(dsl_dataset_t *ds); uint64_t dsl_get_numclones(dsl_dataset_t *ds); uint64_t dsl_get_inconsistent(dsl_dataset_t *ds); +uint64_t dsl_get_redacted(dsl_dataset_t *ds); uint64_t dsl_get_available(dsl_dataset_t *ds); int dsl_get_written(dsl_dataset_t *ds, uint64_t *written); int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap); +void dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval); int dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, char *source); @@ -393,6 +409,8 @@ void dsl_dataset_space(dsl_dataset_t *ds, uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int dsl_dataset_space_written_bookmark(struct zfs_bookmark_phys *bmp, + dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); @@ -445,10 +463,16 @@ void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); -void 
dsl_dataset_activate_feature(uint64_t dsobj, - spa_feature_t f, dmu_tx_t *tx); -void dsl_dataset_deactivate_feature(uint64_t dsobj, - spa_feature_t f, dmu_tx_t *tx); +void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg, + dmu_tx_t *tx); +void dsl_dataset_deactivate_feature(dsl_dataset_t *ds, spa_feature_t f, + dmu_tx_t *tx); +boolean_t dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f); +boolean_t dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, + spa_feature_t f, uint64_t *outlength, uint64_t **outp); + +void dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, + uint64_t num_redact_snaps, dmu_tx_t *tx); #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h index ae3ca0cfbd5e..c4dbea26b454 100644 --- a/include/sys/dsl_destroy.h +++ b/include/sys/dsl_destroy.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ @@ -45,6 +45,7 @@ int dsl_destroy_inconsistent(const char *, void *); int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t); void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *, boolean_t, struct dmu_tx *); +void dsl_dir_remove_clones_key(dsl_dir_t *, uint64_t, dmu_tx_t *); typedef struct dsl_destroy_snapshot_arg { const char *ddsa_name; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 6bbf8434619c..0af87e0c7617 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -118,11 +118,11 @@ typedef enum { ZFS_PROP_PRIVATE, /* not exposed to user, temporary */ ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, - ZFS_PROP_NAME, /* not exposed to the user */ + ZFS_PROP_NAME, ZFS_PROP_CANMOUNT, - ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ + ZFS_PROP_ISCSIOPTIONS, ZFS_PROP_XATTR, - ZFS_PROP_NUMCLONES, /* not exposed to the user */ + ZFS_PROP_NUMCLONES, ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, @@ -140,12 +140,12 @@ typedef enum { ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, - ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ - ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ + ZFS_PROP_USERACCOUNTING, + ZFS_PROP_STMF_SHAREINFO, ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, - ZFS_PROP_UNIQUE, /* not exposed to the user */ + ZFS_PROP_UNIQUE, ZFS_PROP_OBJSETID, ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, @@ -156,7 +156,7 @@ typedef enum { ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, - ZFS_PROP_INCONSISTENT, /* not exposed to the user */ + ZFS_PROP_INCONSISTENT, ZFS_PROP_VOLMODE, ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, @@ -183,6 +183,8 @@ typedef enum { ZFS_PROP_KEYSTATUS, ZFS_PROP_REMAPTXG, /* not exposed to the user */ ZFS_PROP_SPECIAL_SMALL_BLOCKS, + ZFS_PROP_REDACTED, + ZFS_PROP_REDACT_SNAPS, ZFS_NUM_PROPS } zfs_prop_t; @@ -207,8 +209,7 @@ extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by 
these constants and must be added to the * end of this list to ensure that external consumers are not affected - * by the change. If you make any changes to this list, be sure to update - * the property table in module/zcommon/zpool_prop.c. + * by the change. Properties must be registered in zfs_prop_init(). */ typedef enum { ZPOOL_PROP_INVAL = -1, @@ -1094,7 +1095,7 @@ typedef struct ddt_histogram { */ typedef enum zfs_ioc { /* - * Illumos - 71/128 numbers reserved. + * Illumos - 73/128 numbers reserved. */ ZFS_IOC_FIRST = ('Z' << 8), ZFS_IOC = ZFS_IOC_FIRST, @@ -1177,6 +1178,8 @@ typedef enum zfs_ioc { ZFS_IOC_REMAP, /* 0x5a4c */ ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ + ZFS_IOC_REDACT, /* 0x5a4f */ + ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a50 */ /* * Linux - 3/64 numbers reserved. diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h index ad881854e484..e8567933d2bc 100644 --- a/include/sys/nvpair.h +++ b/include/sys/nvpair.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
*/ #ifndef _SYS_NVPAIR_H @@ -313,20 +313,30 @@ void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); void fnvlist_remove(nvlist_t *, const char *); void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); -nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name); -boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name); -boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name); -uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name); -int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name); -int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name); -int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name); -int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name); -uint8_t fnvlist_lookup_uint8(nvlist_t *nvl, const char *name); -uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name); -uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name); -uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name); -char *fnvlist_lookup_string(nvlist_t *nvl, const char *name); -nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name); +nvpair_t *fnvlist_lookup_nvpair(nvlist_t *, const char *); +boolean_t fnvlist_lookup_boolean(nvlist_t *, const char *); +boolean_t fnvlist_lookup_boolean_value(nvlist_t *, const char *); +uchar_t fnvlist_lookup_byte(nvlist_t *, const char *); +int8_t fnvlist_lookup_int8(nvlist_t *, const char *); +int16_t fnvlist_lookup_int16(nvlist_t *, const char *); +int32_t fnvlist_lookup_int32(nvlist_t *, const char *); +int64_t fnvlist_lookup_int64(nvlist_t *, const char *); +uint8_t fnvlist_lookup_uint8(nvlist_t *, const char *); +uint16_t fnvlist_lookup_uint16(nvlist_t *, const char *); +uint32_t fnvlist_lookup_uint32(nvlist_t *, const char *); +uint64_t fnvlist_lookup_uint64(nvlist_t *, const char *); +char *fnvlist_lookup_string(nvlist_t *, const char *); +nvlist_t *fnvlist_lookup_nvlist(nvlist_t *, const char *); +boolean_t 
*fnvlist_lookup_boolean_array(nvlist_t *, const char *, uint_t *); +uchar_t *fnvlist_lookup_byte_array(nvlist_t *, const char *, uint_t *); +int8_t *fnvlist_lookup_int8_array(nvlist_t *, const char *, uint_t *); +uint8_t *fnvlist_lookup_uint8_array(nvlist_t *, const char *, uint_t *); +int16_t *fnvlist_lookup_int16_array(nvlist_t *, const char *, uint_t *); +uint16_t *fnvlist_lookup_uint16_array(nvlist_t *, const char *, uint_t *); +int32_t *fnvlist_lookup_int32_array(nvlist_t *, const char *, uint_t *); +uint32_t *fnvlist_lookup_uint32_array(nvlist_t *, const char *, uint_t *); +int64_t *fnvlist_lookup_int64_array(nvlist_t *, const char *, uint_t *); +uint64_t *fnvlist_lookup_uint64_array(nvlist_t *, const char *, uint_t *); boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); uchar_t fnvpair_value_byte(nvpair_t *nvp); diff --git a/include/sys/objlist.h b/include/sys/objlist.h new file mode 100644 index 000000000000..a124a61fdc95 --- /dev/null +++ b/include/sys/objlist.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _OBJLIST_H +#define _OBJLIST_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct objlist_node { + list_node_t on_node; + uint64_t on_object; +} objlist_node_t; + +typedef struct objlist { + list_t ol_list; /* List of struct objnode. */ + /* + * Last object looked up. Used to assert that objects are being looked + * up in ascending order. 
+ */ + uint64_t ol_last_lookup; +} objlist_t; + +objlist_t *objlist_create(void); +void objlist_destroy(objlist_t *); +boolean_t objlist_exists(objlist_t *, uint64_t); +void objlist_insert(objlist_t *, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _OBJLIST_H */ diff --git a/include/sys/spa.h b/include/sys/spa.h index b86c655575bc..16fe2889bdde 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -94,13 +94,19 @@ _NOTE(CONSTCOND) } while (0) #define BF64_GET_SB(x, low, len, shift, bias) \ ((BF64_GET(x, low, len) + (bias)) << (shift)) +/* + * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in + * the case where val is a constant. We can't fix ASSERT because it's used as + * an expression in several places in the kernel; as a result, changing it to + * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option. + */ #define BF32_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ + ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \ ASSERT3S((val) >> (shift), >=, bias); \ BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ _NOTE(CONSTCOND) } while (0) #define BF64_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ + ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \ ASSERT3S((val) >> (shift), >=, bias); \ BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ _NOTE(CONSTCOND) } while (0) @@ -402,6 +408,7 @@ _NOTE(CONSTCOND) } while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. 
*/ + BP_EMBEDDED_TYPE_REDACTED, NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED } bp_embedded_type_t; @@ -602,6 +609,14 @@ _NOTE(CONSTCOND) } while (0) #define BP_IS_HOLE(bp) \ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) +#define BP_SET_REDACTED(bp) \ +{ \ + BP_SET_EMBEDDED(bp, B_TRUE); \ + BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED); \ +} +#define BP_IS_REDACTED(bp) \ + (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED) + /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) @@ -678,6 +693,13 @@ _NOTE(CONSTCOND) } while (0) (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ + } else if (BP_IS_REDACTED(bp)) { \ + len += func(buf + len, size - len, \ + "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)bp->blk_birth); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 6f502897ec18..ae7ee5a652a2 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ @@ -303,6 +303,7 @@ typedef pthread_cond_t kcondvar_t; extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); extern void cv_destroy(kcondvar_t *cv); extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); +extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp); extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag); diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index b63ceffac57d..0090dece73cd 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. */ @@ -101,7 +101,7 @@ typedef enum drr_headertype { /* flag #18 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) -/* flag #21 is reserved for a Delphix feature */ +#define DMU_BACKUP_FEATURE_REDACTED (1 << 21) #define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) #define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) #define DMU_BACKUP_FEATURE_RAW (1 << 24) @@ -115,7 +115,7 @@ typedef enum drr_headertype { DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ - DMU_BACKUP_FEATURE_RAW) + DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_REDACTED) /* Are all features in the given flag word currently supported? 
*/ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) @@ -189,7 +189,7 @@ typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, - DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_OBJECT_RANGE, + DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_OBJECT_RANGE, DRR_REDACT, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; @@ -314,6 +314,12 @@ typedef struct dmu_replay_record { uint8_t drr_flags; uint8_t drr_pad[3]; } drr_object_range; + struct drr_redact { + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + } drr_redact; /* * Nore: drr_checksum is overlaid with all record types diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h index 0a4f52f2f5b6..706a150bbfd8 100644 --- a/include/sys/zfs_vfsops.h +++ b/include/sys/zfs_vfsops.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -195,6 +196,7 @@ extern uint_t zfs_fsyncer_key; extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valuep); extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, @@ -212,6 +214,7 @@ extern int zfsvfs_create(const char *name, boolean_t readony, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); +extern objlist_t *zfs_get_deleteq(objset_t *os); extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent); diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 84bc7f816734..f5c2d54caa14 100644 --- a/include/zfeature_common.h +++ 
b/include/zfeature_common.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. @@ -65,6 +65,9 @@ typedef enum spa_feature { SPA_FEATURE_POOL_CHECKPOINT, SPA_FEATURE_SPACEMAP_V2, SPA_FEATURE_ALLOCATION_CLASSES, + SPA_FEATURE_REDACTION_BOOKMARKS, + SPA_FEATURE_REDACTED_DATASETS, + SPA_FEATURE_BOOKMARK_WRITTEN, SPA_FEATURES } spa_feature_t; @@ -73,7 +76,10 @@ typedef enum spa_feature { typedef enum zfeature_flags { /* Can open pool readonly even if this feature is not supported. */ ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0), - /* Is this feature necessary to read the MOS? */ + /* + * Is this feature necessary to load the pool? i.e. do we need this + * feature to read the full feature list out of the MOS? + */ ZFEATURE_FLAG_MOS = (1 << 1), /* Activate this feature at the same time it is enabled. 
*/ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), @@ -81,6 +87,12 @@ typedef enum zfeature_flags { ZFEATURE_FLAG_PER_DATASET = (1 << 3) } zfeature_flags_t; +typedef enum zfeature_type { + ZFEATURE_TYPE_BOOLEAN, + ZFEATURE_TYPE_UINT64_ARRAY, + ZFEATURE_NUM_TYPES +} zfeature_type_t; + typedef struct zfeature_info { spa_feature_t fi_feature; const char *fi_uname; /* User-facing feature name */ @@ -88,6 +100,7 @@ typedef struct zfeature_info { const char *fi_desc; /* Feature description */ zfeature_flags_t fi_flags; boolean_t fi_zfs_mod_supported; /* supported by running zfs module */ + zfeature_type_t fi_type; /* Only relevant for PER_DATASET features */ /* array of dependencies, terminated by SPA_FEATURE_NONE */ const spa_feature_t *fi_depends; } zfeature_info_t; diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 76a316a9c8f4..97094883e864 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * Copyright (c) 2013 Martin Matuska. All rights reserved. 
@@ -582,7 +582,6 @@ zfs_bookmark_exists(const char *path) int err; boolean_t rv; - (void) strlcpy(fsname, path, sizeof (fsname)); pound = strchr(fsname, '#'); if (pound == NULL) @@ -2368,6 +2367,10 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, *val = zhp->zfs_dmustats.dds_inconsistent; break; + case ZFS_PROP_REDACTED: + *val = zhp->zfs_dmustats.dds_redacted; + break; + default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: @@ -2580,6 +2583,37 @@ zfs_get_clones_nvl(zfs_handle_t *zhp) return (value); } +static int +get_rsnaps_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) +{ + nvlist_t *value; + uint64_t *snaps; + uint_t nsnaps; + + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &value) != 0) + return (-1); + if (nvlist_lookup_uint64_array(value, ZPROP_VALUE, &snaps, + &nsnaps) != 0) + return (-1); + if (nsnaps == 0) { + /* There's no redaction snapshots; pass a special value back */ + (void) snprintf(propbuf, proplen, "none"); + return (0); + } + propbuf[0] = '\0'; + for (int i = 0; i < nsnaps; i++) { + char buf[128]; + if (propbuf[0] != '\0') + (void) strlcat(propbuf, ",", proplen); + (void) snprintf(buf, sizeof (buf), "%llu", + (u_longlong_t)snaps[i]); + (void) strlcat(propbuf, buf, proplen); + } + + return (0); +} + /* * Accepts a property and value and checks that the value * matches the one found by the channel program. If they are @@ -2774,6 +2808,11 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, zcp_check(zhp, prop, 0, str); break; + case ZFS_PROP_REDACT_SNAPS: + if (get_rsnaps_string(zhp, propbuf, proplen) != 0) + return (-1); + break; + case ZFS_PROP_CLONES: if (get_clones_string(zhp, propbuf, proplen) != 0) return (-1); @@ -3291,6 +3330,9 @@ zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, return (0); } +/* + * propname must start with "written@" or "written#". 
+ */ int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) @@ -3301,8 +3343,10 @@ zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - snapname = strchr(propname, '@') + 1; - if (strchr(snapname, '@')) { + assert(zfs_prop_written(propname)); + snapname = propname + strlen("written@"); + if (strchr(snapname, '@') != NULL || strchr(snapname, '#') != NULL) { + /* full snapshot or bookmark name specified */ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); } else { /* snapname is the short name, append it to zhp's fsname */ @@ -3313,8 +3357,7 @@ zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, cp = strchr(zc.zc_value, '@'); if (cp != NULL) *cp = '\0'; - (void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value)); - (void) strlcat(zc.zc_value, snapname, sizeof (zc.zc_value)); + (void) strlcat(zc.zc_value, snapname - 1, sizeof (zc.zc_value)); } err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc); diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c index 6f9b3f465785..4e6f8f9fc430 100644 --- a/lib/libzfs/libzfs_iter.c +++ b/lib/libzfs/libzfs_iter.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ @@ -188,9 +188,12 @@ zfs_iter_bookmarks(zfs_handle_t *zhp, zfs_iter_f func, void *data) /* Setup the requested properties nvlist. 
*/ props = fnvlist_alloc(); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_GUID)); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATETXG)); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATION)); + for (zfs_prop_t p = 0; p < ZFS_NUM_PROPS; p++) { + if (zfs_prop_valid_for_type(p, ZFS_TYPE_BOOKMARK, B_FALSE)) { + fnvlist_add_boolean(props, zfs_prop_to_name(p)); + } + } + fnvlist_add_boolean(props, "redact_complete"); if ((err = lzc_get_bookmarks(zhp->zfs_name, props, &bmarks)) != 0) goto out; diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 59b98a28450e..f544392673b0 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -22,7 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014, 2018 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright 2017 RackTop Systems. 
*/ @@ -300,7 +300,7 @@ zfs_is_mounted(zfs_handle_t *zhp, char **where) */ static boolean_t zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, - zprop_source_t *source) + zprop_source_t *source, int flags) { char sourceloc[MAXNAMELEN]; zprop_source_t sourcetype; @@ -323,6 +323,13 @@ zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, getzoneid() == GLOBAL_ZONEID) return (B_FALSE); + if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED) && + getzoneid() == GLOBAL_ZONEID) + return (B_FALSE); + + if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) + return (B_FALSE); + if (source) *source = sourcetype; @@ -489,8 +496,10 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL)) (void) strlcat(mntopts, "," MNTOPT_RO, sizeof (mntopts)); - if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL)) + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL, + flags)) { return (0); + } /* * Append default mount options which apply to the mount point. @@ -861,7 +870,7 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) zprop_source_t sourcetype; int ret; - if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL)) + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL, 0)) return (0); for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { @@ -1128,8 +1137,7 @@ remove_mountpoint(zfs_handle_t *zhp) char mountpoint[ZFS_MAXPROPLEN]; zprop_source_t source; - if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), - &source)) + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), &source, 0)) return; if (source == ZPROP_SRC_DEFAULT || diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index b5c91ec20a60..b897d25bc9dd 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * All rights reserved @@ -72,6 +72,9 @@ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *, const char *, nvlist_t *); +static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, + uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, + uint64_t num_redact_snaps, char *name); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); @@ -87,6 +90,8 @@ typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; + boolean_t pa_estimate; + int pa_verbosity; } progress_arg_t; typedef struct dataref { @@ -868,7 +873,7 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? 
dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); - rv = -1; + rv = EZFS_NOENT; } goto out; } @@ -1006,7 +1011,7 @@ typedef struct send_dump_data { char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; - boolean_t verbose, dryrun, parsable, progress, embed_data, std_out; + boolean_t dryrun, parsable, progress, embed_data, std_out; boolean_t large_block, compress, raw; int outfd; boolean_t err; @@ -1018,6 +1023,7 @@ typedef struct send_dump_data { nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; + int verbosity; uint64_t size; } send_dump_data_t; @@ -1176,41 +1182,73 @@ gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } +int +zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, + uint64_t *blocks_visited) +{ + zfs_cmd_t zc = { {0} }; + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_cookie = fd; + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) + return (errno); + if (bytes_written != NULL) + *bytes_written = zc.zc_cookie; + if (blocks_visited != NULL) + *blocks_visited = zc.zc_objset_type; + return (0); +} + static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; - zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = pa->pa_zhp; - libzfs_handle_t *hdl = zhp->zfs_hdl; - unsigned long long bytes; + uint64_t bytes; + uint64_t blocks; char buf[16]; time_t t; struct tm *tm; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if (!pa->pa_parsable) - (void) fprintf(stderr, "TIME SENT SNAPSHOT\n"); + boolean_t firstloop = B_TRUE; /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. 
*/ for (;;) { + int err; (void) sleep(1); + if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, + &blocks)) != 0) { + if (err == EINTR || err == ENOENT) + return ((void *)0); + return ((void *)(uintptr_t)err); + } - zc.zc_cookie = pa->pa_fd; - if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) - return ((void *)-1); + if (firstloop && !pa->pa_parsable) { + (void) fprintf(stderr, "TIME %s %sSNAPSHOT\n", + pa->pa_estimate ? "BYTES" : " SENT", + pa->pa_verbosity >= 2 ? " BLOCKS " : ""); + firstloop = B_FALSE; + } (void) time(&t); tm = localtime(&t); - bytes = zc.zc_cookie; - if (pa->pa_parsable) { + if (pa->pa_verbosity >= 2 && pa->pa_parsable) { + (void) fprintf(stderr, + "%02d:%02d:%02d\t%llu\t%llu\t%s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + (u_longlong_t)bytes, (u_longlong_t)blocks, + zhp->zfs_name); + } else if (pa->pa_verbosity >= 2) { + zfs_nicenum(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, + "%02d:%02d:%02d %5s %8llu %s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + buf, (u_longlong_t)blocks, zhp->zfs_name); + } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, - bytes, zhp->zfs_name); + (u_longlong_t)bytes, zhp->zfs_name); } else { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", @@ -1352,7 +1390,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); - if (sdd->verbose) { + if (sdd->verbosity != 0) { uint64_t size = 0; char fromds[ZFS_MAX_DATASET_NAME_LEN]; @@ -1381,6 +1419,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; + pa.pa_estimate = B_FALSE; + pa.pa_verbosity = sdd->verbosity; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { @@ -1393,8 +1433,18 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) fromorigin, sdd->outfd, flags, sdd->debugnv); if (sdd->progress) { + void 
*status = NULL; (void) pthread_cancel(tid); - (void) pthread_join(tid, NULL); + (void) pthread_join(tid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "progress thread exited nonzero")); + return (zfs_standard_error(zhp->zfs_hdl, error, + errbuf)); + } } } @@ -1650,6 +1700,183 @@ zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) } return (nv); } +static enum lzc_send_flags +lzc_flags_from_sendflags(const sendflags_t *flags) +{ + enum lzc_send_flags lzc_flags = 0; + if (flags->largeblock) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (flags->embed_data) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (flags->compress) + lzc_flags |= LZC_SEND_FLAG_COMPRESS; + if (flags->raw) + lzc_flags |= LZC_SEND_FLAG_RAW; + return (lzc_flags); +} + +static int +estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, + uint64_t resumeobj, uint64_t resumeoff, uint64_t bytes, + const char *redactbook, char *errbuf) +{ + uint64_t size; + FILE *fout = flags->dryrun ? 
stdout : stderr; + progress_arg_t pa = { 0 }; + int err = 0; + pthread_t ptid; + + if (flags->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = fd; + pa.pa_parsable = flags->parsable; + pa.pa_estimate = B_TRUE; + pa.pa_verbosity = flags->verbosity; + + err = pthread_create(&ptid, NULL, + send_progress_thread, &pa); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); + } + } + + err = lzc_send_space_resume_redacted(zhp->zfs_name, from, + lzc_flags_from_sendflags(flags), resumeobj, resumeoff, bytes, + redactbook, fd, &size); + + if (flags->progress) { + void *status = NULL; + (void) pthread_cancel(ptid); + (void) pthread_join(ptid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "progress thread exited " + "nonzero")); + return (zfs_standard_error(zhp->zfs_hdl, error, + errbuf)); + } + } + + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + send_print_verbose(fout, zhp->zfs_name, from, size, + flags->parsable); + + if (flags->parsable) { + (void) fprintf(fout, "size\t%llu\n", (longlong_t)size); + } else { + char buf[16]; + zfs_nicenum(size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "total estimated size is %s\n"), buf); + } + return (0); +} + +static boolean_t +redact_snaps_contains(const uint64_t *snaps, uint64_t num_snaps, uint64_t guid) +{ + for (int i = 0; i < num_snaps; i++) { + if (snaps[i] == guid) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +redact_snaps_equal(const uint64_t *snaps1, uint64_t num_snaps1, + const uint64_t *snaps2, uint64_t num_snaps2) +{ + if (num_snaps1 != num_snaps2) + return (B_FALSE); + for (int i = 0; i < num_snaps1; i++) { + if (!redact_snaps_contains(snaps2, num_snaps2, snaps1[i])) + return 
(B_FALSE); + } + return (B_TRUE); +} + +/* + * Check that the list of redaction snapshots in the bookmark matches the send + * we're resuming, and return whether or not it's complete. + * + * Note that the caller needs to free the contents of *bookname with free() if + * this function returns successfully. + */ +static int +find_redact_book(libzfs_handle_t *hdl, const char *path, + const uint64_t *redact_snap_guids, int num_redact_snaps, + char **bookname) +{ + char errbuf[1024]; + int error = 0; + nvlist_t *props = fnvlist_alloc(); + nvlist_t *bmarks; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot resume send")); + + fnvlist_add_boolean(props, "redact_complete"); + fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); + error = lzc_get_bookmarks(path, props, &bmarks); + nvlist_free(props); + if (error != 0) { + if (error == ESRCH) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "nonexistent redaction bookmark provided")); + } else if (error == ENOENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset to be sent no longer exists")); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "unknown error: %s"), strerror(error)); + } + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + nvpair_t *pair; + for (pair = nvlist_next_nvpair(bmarks, NULL); pair; + pair = nvlist_next_nvpair(bmarks, pair)) { + + nvlist_t *bmark = fnvpair_value_nvlist(pair); + nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); + uint_t len = 0; + uint64_t *bmarksnaps = fnvlist_lookup_uint64_array(vallist, + ZPROP_VALUE, &len); + if (redact_snaps_equal(redact_snap_guids, + num_redact_snaps, bmarksnaps, len)) { + break; + } + } + if (pair == NULL) { + fnvlist_free(bmarks); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no appropriate redaction bookmark exists")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + char *name = nvpair_name(pair); + nvlist_t *bmark = fnvpair_value_nvlist(pair); + nvlist_t 
*vallist = fnvlist_lookup_nvlist(bmark, "redact_complete"); + boolean_t complete = fnvlist_lookup_boolean_value(vallist, + ZPROP_VALUE); + if (!complete) { + fnvlist_free(bmarks); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incomplete redaction bookmark provided")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + *bookname = strndup(name, ZFS_MAX_DATASET_NAME_LEN); + ASSERT3P(*bookname, !=, NULL); + fnvlist_free(bmarks); + return (0); +} int zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, @@ -1663,7 +1890,10 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, int error = 0; char name[ZFS_MAX_DATASET_NAME_LEN]; enum lzc_send_flags lzc_flags = 0; - FILE *fout = (flags->verbose && flags->dryrun) ? stdout : stderr; + FILE *fout = (flags->verbosity > 0 && flags->dryrun) ? stdout : stderr; + uint64_t *redact_snap_guids = NULL; + int num_redact_snaps = 0; + char *redact_book = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); @@ -1677,7 +1907,7 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, */ return (zfs_error(hdl, EZFS_FAULT, errbuf)); } - if (flags->verbose) { + if (flags->verbosity != 0) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "resume token contents:\n")); nvlist_print(fout, resume_nvl); @@ -1723,8 +1953,14 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } + if (nvlist_lookup_uint64_array(resume_nvl, "book_redact_snaps", + &redact_snap_guids, (uint_t *)&num_redact_snaps) != 0) { + num_redact_snaps = -1; + } + if (fromguid != 0) { - if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) { + if (guid_to_name_redact_snaps(hdl, toname, fromguid, B_TRUE, + redact_snap_guids, num_redact_snaps, name) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source %#llx no longer exists"), (longlong_t)fromguid); @@ -1733,14 +1969,39 @@ zfs_send_resume(libzfs_handle_t *hdl, 
sendflags_t *flags, int outfd, fromname = name; } - if (flags->verbose) { - uint64_t size = 0; - error = lzc_send_space(zhp->zfs_name, fromname, - lzc_flags, &size); - if (error == 0) - size = MAX(0, (int64_t)(size - bytes)); - send_print_verbose(fout, zhp->zfs_name, fromname, - size, flags->parsable); + redact_snap_guids = NULL; + + if (nvlist_lookup_uint64_array(resume_nvl, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snap_guids, + (uint_t *)&num_redact_snaps) == 0) { + char path[ZFS_MAX_DATASET_NAME_LEN]; + + (void) strlcpy(path, toname, sizeof (path)); + char *at = strchr(path, '@'); + ASSERT3P(at, !=, NULL); + + *at = '\0'; + + if ((error = find_redact_book(hdl, path, redact_snap_guids, + num_redact_snaps, &redact_book)) != 0) { + return (error); + } + } + + if (flags->verbosity != 0) { + /* + * Some of these may have come from the resume token, set them + * here for size estimate purposes. + */ + sendflags_t tmpflags = *flags; + if (lzc_flags & LZC_SEND_FLAG_LARGE_BLOCK) + tmpflags.largeblock = B_TRUE; + if (lzc_flags & LZC_SEND_FLAG_COMPRESS) + tmpflags.compress = B_TRUE; + if (lzc_flags & LZC_SEND_FLAG_EMBED_DATA) + tmpflags.embed_data = B_TRUE; + error = estimate_size(zhp, fromname, outfd, &tmpflags, + resumeobj, resumeoff, bytes, redact_book, errbuf); } if (!flags->dryrun) { @@ -1754,21 +2015,36 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; + pa.pa_estimate = B_FALSE; + pa.pa_verbosity = flags->verbosity; error = pthread_create(&tid, NULL, send_progress_thread, &pa); if (error != 0) { + if (redact_book != NULL) + free(redact_book); zfs_close(zhp); return (error); } } - error = lzc_send_resume(zhp->zfs_name, fromname, outfd, - lzc_flags, resumeobj, resumeoff); + error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, + lzc_flags, resumeobj, resumeoff, redact_book); + if (redact_book != NULL) + free(redact_book); if (flags->progress) { + void 
*status = NULL; (void) pthread_cancel(tid); - (void) pthread_join(tid, NULL); + (void) pthread_join(tid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "progress thread exited nonzero")); + return (zfs_standard_error(hdl, error, errbuf)); + } } char errbuf[1024]; @@ -1784,6 +2060,12 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); + case ESRCH: + if (lzc_exists(zhp->zfs_name)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source could not be found")); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EXDEV: case ENOENT: @@ -1804,16 +2086,145 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, default: return (zfs_standard_error(hdl, errno, errbuf)); } + } else { + if (redact_book != NULL) + free(redact_book); } - zfs_close(zhp); return (error); } /* - * Generate a send stream for the dataset identified by the argument zhp. + * This function informs the target system that the recursive send is complete. + * The record is also expected in the case of a send -p. + */ +static int +send_conclusion_record(int fd, zio_cksum_t *zc) +{ + dmu_replay_record_t drr = { 0 }; + drr.drr_type = DRR_END; + if (zc != NULL) + drr.drr_u.drr_end.drr_checksum = *zc; + if (write(fd, &drr, sizeof (drr)) == -1) { + return (errno); + } + return (0); +} + +/* + * This function is responsible for sending the records that contain the + * necessary information for the target system's libzfs to be able to set the + * properties of the filesystem being received, or to be able to prepare for + * a recursive receive. + * + * The "zhp" argument is the handle of the snapshot we are sending + * (the "tosnap"). 
The "from" argument is the short snapshot name (the part + * after the @) of the incremental source. + */ +static int +send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, + boolean_t gather_props, boolean_t recursive, boolean_t verbose, + boolean_t dryrun, boolean_t raw, boolean_t backup, nvlist_t **fssp, + avl_tree_t **fsavlp) +{ + int err = 0; + char *packbuf = NULL; + size_t buflen = 0; + zio_cksum_t zc = { {0} }; + int featureflags = 0; + /* name of filesystem/volume that contains snapshot we are sending */ + char tofs[ZFS_MAX_DATASET_NAME_LEN]; + /* short name of snap we are sending */ + char *tosnap = ""; + + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && zfs_prop_get_int(zhp, + ZFS_PROP_VERSION) >= ZPL_VERSION_SA) { + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } + + (void) strlcpy(tofs, zhp->zfs_name, ZFS_MAX_DATASET_NAME_LEN); + char *at = strchr(tofs, '@'); + if (at != NULL) { + *at = '\0'; + tosnap = at + 1; + } + + if (gather_props) { + nvlist_t *hdrnv = fnvlist_alloc(); + nvlist_t *fss = NULL; + + if (from != NULL) + fnvlist_add_string(hdrnv, "fromsnap", from); + fnvlist_add_string(hdrnv, "tosnap", tosnap); + if (!recursive) + fnvlist_add_boolean(hdrnv, "not_recursive"); + + if (raw) { + VERIFY0(nvlist_add_boolean(hdrnv, "raw")); + } + + if ((err = gather_nvlist(zhp->zfs_hdl, tofs, + from, tosnap, recursive, raw, verbose, backup, &fss, + fsavlp)) != 0) { + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + fnvlist_add_nvlist(hdrnv, "fss", fss); + VERIFY0(nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, + 0)); + if (fssp != NULL) { + *fssp = fss; + } else { + nvlist_free(fss); + } + nvlist_free(hdrnv); + } + + if (!dryrun) { + dmu_replay_record_t drr = { 0 }; + /* write first begin record */ + drr.drr_type = DRR_BEGIN; + drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + 
DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. + drr_versioninfo, DMU_COMPOUNDSTREAM); + DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. + drr_versioninfo, featureflags); + if (snprintf(drr.drr_u.drr_begin.drr_toname, + sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", tofs, + tosnap) >= sizeof (drr.drr_u.drr_begin.drr_toname)) { + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + drr.drr_payloadlen = buflen; + + err = dump_record(&drr, packbuf, buflen, &zc, fd); + free(packbuf); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + err = send_conclusion_record(fd, &zc); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + } + return (0); +} + +/* + * Generate a send stream. The "zhp" argument is the filesystem/volume + * that contains the snapshot to send. The "fromsnap" argument is the + * short name (the part after the '@') of the snapshot that is the + * incremental source to send from (if non-NULL). The "tosnap" argument + * is the short name of the snapshot to send. * * The content of the send stream is the snapshot identified by * 'tosnap'. 
Incremental streams are requested in two ways: @@ -1891,80 +2302,22 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } if (flags->replicate || flags->doall || flags->props || flags->backup) { - dmu_replay_record_t drr = { 0 }; - char *packbuf = NULL; - size_t buflen = 0; - zio_cksum_t zc; - - ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); - - if (flags->replicate || flags->props || flags->backup) { - nvlist_t *hdrnv; - - VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); - if (fromsnap) { - VERIFY(0 == nvlist_add_string(hdrnv, - "fromsnap", fromsnap)); - } - VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap)); - if (!flags->replicate) { - VERIFY(0 == nvlist_add_boolean(hdrnv, - "not_recursive")); - } - if (flags->raw) { - VERIFY(0 == nvlist_add_boolean(hdrnv, "raw")); - } - - err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, - fromsnap, tosnap, flags->replicate, flags->raw, - flags->verbose, flags->backup, &fss, &fsavl); - if (err) - goto err_out; - VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); - err = nvlist_pack(hdrnv, &packbuf, &buflen, - NV_ENCODE_XDR, 0); - if (debugnvp) - *debugnvp = hdrnv; - else - nvlist_free(hdrnv); - if (err) - goto stderr_out; - } - - if (!flags->dryrun) { - /* write first begin record */ - drr.drr_type = DRR_BEGIN; - drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. - drr_versioninfo, DMU_COMPOUNDSTREAM); - DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. 
- drr_versioninfo, featureflags); - if (snprintf(drr.drr_u.drr_begin.drr_toname, - sizeof (drr.drr_u.drr_begin.drr_toname), - "%s@%s", zhp->zfs_name, tosnap) >= - sizeof (drr.drr_u.drr_begin.drr_toname)) { - err = EINVAL; - goto stderr_out; - } - drr.drr_payloadlen = buflen; - - err = dump_record(&drr, packbuf, buflen, &zc, outfd); - free(packbuf); - if (err != 0) - goto stderr_out; - - /* write end record */ - bzero(&drr, sizeof (drr)); - drr.drr_type = DRR_END; - drr.drr_u.drr_end.drr_checksum = zc; - err = write(outfd, &drr, sizeof (drr)); - if (err == -1) { - err = errno; - goto stderr_out; - } - - err = 0; + char full_tosnap_name[ZFS_MAX_DATASET_NAME_LEN]; + if (snprintf(full_tosnap_name, sizeof (full_tosnap_name), + "%s@%s", zhp->zfs_name, tosnap) >= + sizeof (full_tosnap_name)) { + err = EINVAL; + goto stderr_out; } + zfs_handle_t *tosnap = zfs_open(zhp->zfs_hdl, + full_tosnap_name, ZFS_TYPE_SNAPSHOT); + err = send_prelim_records(tosnap, fromsnap, outfd, + flags->replicate || flags->props, flags->replicate, + flags->verbosity > 0, flags->dryrun, flags->raw, + flags->backup, &fss, &fsavl); + zfs_close(tosnap); + if (err != 0) + goto err_out; } /* dump each stream */ @@ -1979,7 +2332,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; - sdd.verbose = flags->verbose; + sdd.verbosity = flags->verbosity; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.dryrun = flags->dryrun; @@ -1991,7 +2344,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; - if (sdd.verbose && sdd.dryrun) + if (sdd.verbosity != 0 && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? 
stdout : stderr; @@ -2019,7 +2372,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.cleanup_fd = -1; sdd.snapholds = NULL; } - if (flags->verbose || sdd.snapholds != NULL) { + if (flags->verbosity != 0 || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, @@ -2031,7 +2384,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, if (err != 0) goto stderr_out; - if (flags->verbose) { + if (flags->verbosity != 0) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); @@ -2063,7 +2416,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } sdd.dryrun = B_FALSE; - sdd.verbose = B_FALSE; + sdd.verbosity = 0; } err = dump_filesystems(zhp, &sdd); @@ -2093,12 +2446,9 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, * there was some error, because it might not be totally * failed. */ - dmu_replay_record_t drr = { 0 }; - drr.drr_type = DRR_END; - if (write(outfd, &drr, sizeof (drr)) == -1) { - return (zfs_standard_error(zhp->zfs_hdl, - errno, errbuf)); - } + err = send_conclusion_record(outfd, NULL); + if (err != 0) + return (zfs_standard_error(zhp->zfs_hdl, err, errbuf)); } return (err || sdd.err); @@ -2120,43 +2470,237 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, return (err); } +static int +get_dedup_fd(zfs_handle_t *zhp, dedup_arg_t *dda, int fd, pthread_t *tid, + int *outfd) +{ + int pipefd[2]; + char errbuf[1024]; + int err; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + if ((err = socketpair(AF_UNIX, SOCK_STREAM, 0, pipefd)) != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, + errbuf)); + } + dda->outputfd = fd; + dda->inputfd = pipefd[1]; + dda->dedup_hdl = zhp->zfs_hdl; + if ((err = pthread_create(tid, 
NULL, cksummer, dda)) != 0) { + (void) close(pipefd[0]); + (void) close(pipefd[1]); + zfs_error_aux(zhp->zfs_hdl, strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, + errbuf)); + } + *outfd = pipefd[0]; + return (0); +} + +zfs_handle_t * +name_to_dir_handle(libzfs_handle_t *hdl, const char *snapname) +{ + char dirname[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(dirname, snapname, ZFS_MAX_DATASET_NAME_LEN); + char *c = strchr(dirname, '@'); + if (c != NULL) + *c = '\0'; + return (zfs_open(hdl, dirname, ZFS_TYPE_DATASET)); +} + +/* + * Returns B_TRUE if earlier is an earlier snapshot in later's timeline; either + * an earlier snapshot in the same filesystem, or a snapshot before later's + * origin, or its origin's origin, etc. + */ +static boolean_t +snapshot_is_before(zfs_handle_t *earlier, zfs_handle_t *later) +{ + boolean_t ret; + uint64_t later_txg = + (later->zfs_type == ZFS_TYPE_FILESYSTEM || + later->zfs_type == ZFS_TYPE_VOLUME ? + UINT64_MAX : zfs_prop_get_int(later, ZFS_PROP_CREATETXG)); + uint64_t earlier_txg = zfs_prop_get_int(earlier, ZFS_PROP_CREATETXG); + + if (earlier_txg >= later_txg) + return (B_FALSE); + + zfs_handle_t *earlier_dir = name_to_dir_handle(earlier->zfs_hdl, + earlier->zfs_name); + zfs_handle_t *later_dir = name_to_dir_handle(later->zfs_hdl, + later->zfs_name); + + if (strcmp(earlier_dir->zfs_name, later_dir->zfs_name) == 0) { + zfs_close(earlier_dir); + zfs_close(later_dir); + return (B_TRUE); + } + + char clonename[ZFS_MAX_DATASET_NAME_LEN]; + if (zfs_prop_get(later_dir, ZFS_PROP_ORIGIN, clonename, + ZFS_MAX_DATASET_NAME_LEN, NULL, NULL, 0, B_TRUE) != 0) { + zfs_close(earlier_dir); + zfs_close(later_dir); + return (B_FALSE); + } + + zfs_handle_t *origin = zfs_open(earlier->zfs_hdl, clonename, + ZFS_TYPE_DATASET); + uint64_t origin_txg = zfs_prop_get_int(origin, ZFS_PROP_CREATETXG); + + /* + * If "earlier" is exactly the origin, then + * snapshot_is_before(earlier, origin) will return false (because + * they're
the same). + */ + if (origin_txg == earlier_txg && + strcmp(origin->zfs_name, earlier->zfs_name) == 0) { + zfs_close(earlier_dir); + zfs_close(later_dir); + zfs_close(origin); + return (B_TRUE); + } + zfs_close(earlier_dir); + zfs_close(later_dir); + + ret = snapshot_is_before(earlier, origin); + zfs_close(origin); + return (ret); +} + +/* + * The "zhp" argument is the handle of the dataset to send (typically a + * snapshot). The "from" argument is the full name of the snapshot or + * bookmark that is the incremental source. + */ int -zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) +zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, + const char *redactbook) { - int err = 0; + int err; libzfs_handle_t *hdl = zhp->zfs_hdl; - enum lzc_send_flags lzc_flags = 0; - FILE *fout = (flags.verbose && flags.dryrun) ? stdout : stderr; + int orig_fd = fd; + pthread_t ddtid, ptid; + progress_arg_t pa = { 0 }; + dedup_arg_t dda = { 0 }; + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); - if (flags.largeblock) - lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; - if (flags.embed_data) - lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; - if (flags.compress) - lzc_flags |= LZC_SEND_FLAG_COMPRESS; - if (flags.raw) - lzc_flags |= LZC_SEND_FLAG_RAW; + if (from != NULL && strchr(from, '@')) { + zfs_handle_t *from_zhp = zfs_open(hdl, from, + ZFS_TYPE_DATASET); + if (!snapshot_is_before(from_zhp, zhp)) { + zfs_close(from_zhp); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + } + zfs_close(from_zhp); + } - if (flags.verbose) { - uint64_t size = 0; - err = lzc_send_space(zhp->zfs_name, from, lzc_flags, &size); - if (err == 0) { - send_print_verbose(fout, zhp->zfs_name, from, size, - flags.parsable); - } else { - (void) fprintf(stderr, "Cannot estimate send size: " - "%s\n", 
strerror(errno)); + /* + * Send fs properties + */ + if (flags->props || flags->backup) { + /* + * Note: the header generated by send_prelim_records() + * assumes that the incremental source is in the same + * filesystem/volume as the target (which is a requirement + * when doing "zfs send -R"). But that isn't always the + * case here (e.g. send from snap in origin, or send from + * bookmark). We pass from=NULL, which will omit this + * information from the prelim records; it isn't used + * when receiving this type of stream. + */ + err = send_prelim_records(zhp, NULL, fd, B_TRUE, B_FALSE, + flags->verbosity > 0, flags->dryrun, flags->raw, + flags->backup, NULL, NULL); + if (err != 0) + return (err); + } + + /* + * Perform size estimate if verbose was specified. + */ + if (flags->verbosity != 0) { + err = estimate_size(zhp, from, fd, flags, 0, 0, 0, redactbook, + errbuf); + if (err != 0) + return (err); + } + + if (flags->dryrun) + return (0); + + /* + * If deduplication is requested, spawn a thread that will deduplicate + * the data coming out of the kernel. + */ + if (flags->dedup) { + err = get_dedup_fd(zhp, &dda, fd, &ddtid, &fd); + if (err != 0) + return (err); + } + + /* + * If progress reporting is requested, spawn a new thread to poll + * ZFS_IOC_SEND_PROGRESS at a regular interval. 
+ */ + if (flags->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = fd; + pa.pa_parsable = flags->parsable; + pa.pa_estimate = B_FALSE; + pa.pa_verbosity = flags->verbosity; + + err = pthread_create(&ptid, NULL, + send_progress_thread, &pa); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + if (flags->dedup) { + (void) pthread_cancel(ddtid); + (void) close(fd); + (void) pthread_join(ddtid, NULL); + } + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); } } - if (flags.dryrun) - return (err); + err = lzc_send_redacted(zhp->zfs_name, from, fd, + lzc_flags_from_sendflags(flags), redactbook); - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "warning: cannot send '%s'"), zhp->zfs_name); + if (flags->progress) { + void *status = NULL; + if (err != 0) + (void) pthread_cancel(ptid); + (void) pthread_join(ptid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "progress thread exited " + "nonzero")); + return (zfs_standard_error(hdl, error, errbuf)); + } + } + if (flags->dedup) { + if (err != 0) + (void) pthread_cancel(ddtid); + (void) close(fd); + (void) pthread_join(ddtid, NULL); + } - err = lzc_send(zhp->zfs_name, from, fd, lzc_flags); + if (flags->props || flags->backup) { + /* Write the final end record. 
*/ + err = send_conclusion_record(orig_fd, NULL); + if (err != 0) + return (zfs_standard_error(hdl, err, errbuf)); + } if (err != 0) { switch (errno) { case EXDEV: @@ -2516,8 +3060,38 @@ typedef struct guid_to_name_data { boolean_t bookmark_ok; char *name; char *skip; + uint64_t *redact_snap_guids; + uint64_t num_redact_snaps; } guid_to_name_data_t; +boolean_t +redact_snaps_match(zfs_handle_t *zhp, guid_to_name_data_t *gtnd) +{ + uint64_t *bmark_snaps; + uint_t bmark_num_snaps; + nvlist_t *nvl; + if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) + return (B_FALSE); + + nvl = fnvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); + bmark_snaps = fnvlist_lookup_uint64_array(nvl, ZPROP_VALUE, + &bmark_num_snaps); + if (bmark_num_snaps != gtnd->num_redact_snaps) + return (B_FALSE); + int i = 0; + for (; i < bmark_num_snaps; i++) { + int j = 0; + for (; j < bmark_num_snaps; j++) { + if (bmark_snaps[i] == gtnd->redact_snap_guids[j]) + break; + } + if (j == bmark_num_snaps) + break; + } + return (i == bmark_num_snaps); +} + static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { @@ -2532,7 +3106,8 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) return (0); } - if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) { + if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid && + (gtnd->num_redact_snaps == -1 || redact_snaps_match(zhp, gtnd))) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); @@ -2551,10 +3126,19 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) * progressively larger portions of the hierarchy. This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. + * + * If num_redact_snaps is not -1, we attempt to find a redaction bookmark with + * the specified number of redaction snapshots. 
If num_redact_snaps isn't 0 or + * -1, then redact_snap_guids will be an array of the guids of the snapshots the + * redaction bookmark was created with. If num_redact_snaps is -1, then we will + * attempt to find a snapshot or bookmark (if bookmark_ok is passed) with the + * given guid. Note that a redaction bookmark can be returned if + * num_redact_snaps == -1. */ static int -guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, - boolean_t bookmark_ok, char *name) +guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, + uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, + uint64_t num_redact_snaps, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; @@ -2563,6 +3147,8 @@ guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; + gtnd.redact_snap_guids = redact_snap_guids; + gtnd.num_redact_snaps = num_redact_snaps; /* * Search progressively larger portions of the hierarchy, starting @@ -2601,6 +3187,14 @@ guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, return (ENOENT); } +static int +guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, + boolean_t bookmark_ok, char *name) +{ + return (guid_to_name_redact_snaps(hdl, parent, guid, bookmark_ok, NULL, + -1, name)); +} + /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2. 
@@ -3400,6 +3994,7 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; + case DRR_OBJECT_RANGE: case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: @@ -3675,6 +4270,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, boolean_t toplevel = B_FALSE; boolean_t zoned = B_FALSE; boolean_t hastoken = B_FALSE; + boolean_t redacted; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; @@ -3900,6 +4496,9 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) strcpy(name, destsnap); *strchr(name, '@') = '\0'; + redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_REDACTED; + if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp; @@ -4306,7 +4905,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * receive (indicated by stream_avl being non-NULL). */ cp = strchr(destsnap, '@'); - if (cp && (ioctl_err == 0 || !newfs)) { + if (cp && (ioctl_err == 0 || !newfs) && !redacted) { zfs_handle_t *h; *cp = '\0'; diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 440ed3bc0828..1fc484b755c7 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -283,6 +283,8 @@ libzfs_error_description(libzfs_handle_t *hdl) "different host")); case EZFS_CRYPTOFAILED: return (dgettext(TEXT_DOMAIN, "encryption failure")); + case EZFS_TOOMANY: + return (dgettext(TEXT_DOMAIN, "argument list too long")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index f24581db83e0..15a42b062ee2 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -608,12 +608,42 @@ int lzc_send(const char *snapname, const char *from, int fd, enum lzc_send_flags flags) { - return (lzc_send_resume(snapname, from, fd, flags, 0, 0)); + return 
(lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, + NULL)); +} + +int +lzc_send_redacted(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, const char *redactbook) +{ + return (lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, + redactbook)); } int lzc_send_resume(const char *snapname, const char *from, int fd, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff) +{ + return (lzc_send_resume_redacted(snapname, from, fd, flags, resumeobj, + resumeoff, NULL)); +} + +/* + * snapname: The name of the "tosnap", or the snapshot whose contents we are + * sending. + * from: The name of the "fromsnap", or the incremental source. + * fd: File descriptor to write the stream to. + * flags: flags that determine features to be used by the stream. + * resumeobj: Object to resume from, for resuming send + * resumeoff: Offset to resume from, for resuming send. + * redactnv: nvlist of string -> boolean(ignored) containing the names of all + * the snapshots that we should redact with respect to. + * redactbook: Name of the redaction bookmark to create. + */ +int +lzc_send_resume_redacted(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, + const char *redactbook) { nvlist_t *args; int err; @@ -634,6 +664,9 @@ lzc_send_resume(const char *snapname, const char *from, int fd, fnvlist_add_uint64(args, "resume_object", resumeobj); fnvlist_add_uint64(args, "resume_offset", resumeoff); } + if (redactbook != NULL) + fnvlist_add_string(args, "redactbook", redactbook); + err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); nvlist_free(args); return (err); @@ -652,11 +685,13 @@ lzc_send_resume(const char *snapname, const char *from, int fd, * are traversed, looking for blocks with a birth time since the creation TXG of * the snapshot this bookmark was created from. 
This will result in * significantly more I/O and be less efficient than a send space estimation on - * an equivalent snapshot. + * an equivalent snapshot. This process is also used if redact_snaps is + * non-null. */ int -lzc_send_space(const char *snapname, const char *from, - enum lzc_send_flags flags, uint64_t *spacep) +lzc_send_space_resume_redacted(const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, + uint64_t resume_bytes, const char *redactbook, int fd, uint64_t *spacep) { nvlist_t *args; nvlist_t *result; @@ -673,6 +708,16 @@ lzc_send_space(const char *snapname, const char *from, fnvlist_add_boolean(args, "compressok"); if (flags & LZC_SEND_FLAG_RAW) fnvlist_add_boolean(args, "rawok"); + if (resumeobj != 0 || resumeoff != 0) { + fnvlist_add_uint64(args, "resume_object", resumeobj); + fnvlist_add_uint64(args, "resume_offset", resumeoff); + fnvlist_add_uint64(args, "bytes", resume_bytes); + } + if (redactbook != NULL) + fnvlist_add_string(args, "redactbook", redactbook); + if (fd != -1) + fnvlist_add_int32(args, "fd", fd); + err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); nvlist_free(args); if (err == 0) @@ -681,6 +726,14 @@ lzc_send_space(const char *snapname, const char *from, return (err); } +int +lzc_send_space(const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t *spacep) +{ + return (lzc_send_space_resume_redacted(snapname, from, flags, 0, 0, 0, + NULL, -1, spacep)); +} + static int recv_read(int fd, void *buf, int ilen) { @@ -720,6 +773,7 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, char fsname[MAXPATHLEN]; char *atp; int error; + boolean_t payload = B_FALSE; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); @@ -750,13 +804,13 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, return (error); } else { drr = *begin_record; + payload = (begin_record->drr_payloadlen != 0); } /* - * Raw receives, 
resumable receives, and receives that include a - * wrapping key all use the new interface. + * All receives with a payload should use the new interface. */ - if (resumable || raw || wkeydata != NULL) { + if (resumable || raw || wkeydata != NULL || payload) { nvlist_t *outnvl = NULL; nvlist_t *innvl = fnvlist_alloc(); @@ -1094,18 +1148,32 @@ lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist) * parameter is an nvlist of property names (with no values) that will be * returned for each bookmark. * - * The following are valid properties on bookmarks, all of which are numbers - * (represented as uint64 in the nvlist) + * The following are valid properties on bookmarks, most of which are numbers + * (represented as uint64 in the nvlist), except redact_snaps, which is a + * uint64 array, and redact_complete, which is a boolean * * "guid" - globally unique identifier of the snapshot it refers to * "createtxg" - txg when the snapshot it refers to was created * "creation" - timestamp when the snapshot it refers to was created + * "redact_snaps" - list of guids of the redaction snapshots for the specified + * bookmark. If the bookmark is not a redaction bookmark, the nvlist will + * not contain an entry for this value. If it is redacted with respect to + * no snapshots, it will contain value -> NULL uint64 array + * "redact_complete" - boolean value; true if the redaction bookmark is + * complete, false otherwise. * * The format of the returned nvlist as follows: * -> { * -> { * "value" -> uint64 * } + * ... + * "redact_snaps" -> { + * "value" -> uint64 array + * } + * "redact_complete" -> { + * "value" -> boolean value + * } * } */ int @@ -1114,6 +1182,33 @@ lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks) { return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks)); } +/* + * Get bookmark properties. + * + * Given a bookmark's full name, retrieve all properties for the bookmark.
+ * + * The format of the returned property list is as follows: + * { + * -> { + * "value" -> uint64 + * } + * ... + * "redact_snaps" -> { + * "value" -> uint64 array + * } + */ +int +lzc_get_bookmark_props(const char *bookmark, nvlist_t **props) +{ + int error; + + nvlist_t *innvl = fnvlist_alloc(); + error = lzc_ioctl(ZFS_IOC_GET_BOOKMARK_PROPS, bookmark, innvl, props); + fnvlist_free(innvl); + + return (error); +} + /* * Destroys bookmarks. * @@ -1373,3 +1468,18 @@ lzc_reopen(const char *pool_name, boolean_t scrub_restart) nvlist_free(args); return (error); } + +/* + * Create a redaction bookmark named bookname by redacting snapshot with respect + * to all the snapshots in snapnv. + */ +int +lzc_redact(const char *snapshot, const char *bookname, nvlist_t *snapnv) +{ + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_string(args, "bookname", bookname); + fnvlist_add_nvlist(args, "snapnv", snapnv); + int error = lzc_ioctl(ZFS_IOC_REDACT, snapshot, args, NULL); + fnvlist_free(args); + return (error); +} diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 149706a27cb3..a0f60aabe2b7 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -58,6 +58,8 @@ KERNEL_C = \ dmu_diff.c \ dmu_object.c \ dmu_objset.c \ + dmu_recv.c \ + dmu_redact.c \ dmu_send.c \ dmu_traverse.c \ dmu_tx.c \ @@ -85,6 +87,7 @@ KERNEL_C = \ metaslab.c \ mmp.c \ multilist.c \ + objlist.c \ pathname.c \ range_tree.c \ refcount.c \ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 341548ac300a..0e3e7785fed6 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
*/ diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 25e4a41d1844..885dcb9459cd 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1664,6 +1664,18 @@ regardless of this setting. Default value: \fB1,048,576\fR. .RE +.sp +.ne 2 +.na +\fBzfs_allow_redacted_dataset_mount\fR (int) +.ad +.RS 12n +Allow datasets received with redacted send/receive to be mounted. Normally +disabled because these datasets may be missing key data. +.sp +Default value: \fB0\fR. +.RE + .sp .ne 2 .na @@ -2148,18 +2160,65 @@ Allow sending of corrupt data (ignore read/checksum errors when sending data) Use \fB1\fR for yes and \fB0\fR for no (default). .RE +.sp +.ne 2 +.na +\fBzfs_send_no_prefetch_queue_ff\fR (int) +.ad +.RS 12n +The fill fraction of the \fBzfs send\fR internal queues. The fill fraction +controls the timing with which internal threads are woken up. +.sp +Default value: \fB20\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_send_no_prefetch_queue_length\fR (int) +.ad +.RS 12n +The maximum number of bytes allowed in \fBzfs send\fR's internal queues. +.sp +Default value: \fB1,048,576\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_send_queue_ff\fR (int) +.ad +.RS 12n +The fill fraction of the \fBzfs send\fR prefetch queue. The fill fraction +controls the timing with which internal threads are woken up. +.sp +Default value: \fB20\fR. +.RE + .sp .ne 2 .na \fBzfs_send_queue_length\fR (int) .ad .RS 12n -The maximum number of bytes allowed in the \fBzfs send\fR queue. This value -must be at least twice the maximum block size in use. +The maximum number of bytes allowed that will be prefetched by \fBzfs send\fR. +This value must be at least twice the maximum block size in use. .sp Default value: \fB16,777,216\fR. .RE +.sp +.ne 2 +.na +\fBzfs_recv_queue_ff\fR (int) +.ad +.RS 12n +The fill fraction of the \fBzfs receive\fR queue. The fill fraction +controls the timing with which internal threads are woken up. +.sp +Default value: \fB20\fR. 
+.RE + .sp .ne 2 .na @@ -2173,6 +2232,21 @@ must be at least twice the maximum block size in use. Default value: \fB16,777,216\fR. .RE +.sp +.ne 2 +.na +\fBzfs_override_estimate_recordsize\fR (ulong) +.ad +.RS 12n +Setting this variable overrides the default logic for estimating block +sizes when doing a zfs send. The default heuristic is that the average +block size will be the current recordsize. Override this value if most data +in your dataset is not of that size and you require accurate zfs send size +estimates. +.sp +Default value: \fB0\fR. +.RE + .sp .ne 2 .na diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index 5cc4db45e0ed..efba04fd2556 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -1,5 +1,5 @@ '\" te -.\" Copyright (c) 2013, 2017 by Delphix. All rights reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" The contents of this file are subject to the terms of the Common Development @@ -780,5 +780,44 @@ vdevs from an allocation class are removed. .RE +.sp +.ne 2 +.na +\fB\fBredaction_bookmarks\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:redaction_bookmarks +READ\-ONLY COMPATIBLE no +DEPENDENCIES bookmarks, extensible_dataset +.TE + +This feature enables the use of the redacted zfs send. Redacted zfs sends +create redaction bookmarks, which store the list of blocks redacted by the +send that created them. For more information about redacted send, +see \fBzfs\fR(1M). + +.RE + +.sp +.ne 2 +.na +\fB\fBredacted_datasets\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:redacted_datasets +READ\-ONLY COMPATIBLE no +DEPENDENCIES extensible_dataset +.TE + +This feature enables the receiving of redacted zfs send streams. Redacted zfs +send streams create redacted datasets when received.
These datasets are +missing some of their blocks, and so cannot be safely mounted, and their +contents cannot be safely read. For more information about redacted receive, +see \fBzfs\fR(1M). + .SH "SEE ALSO" \fBzpool\fR(8) diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index da7ee83ae8d5..040de43c4f78 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -21,7 +21,7 @@ .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow -.\" Copyright (c) 2011, 2017 by Delphix. All rights reserved. +.\" Copyright (c) 2011, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. @@ -180,7 +180,7 @@ .Cm mount .Nm .Cm mount -.Op Fl Olv +.Op Fl Oflv .Op Fl o Ar options .Fl a | Ar filesystem .Nm @@ -203,17 +203,24 @@ .Ar snapshot .Nm .Cm send -.Op Fl LPcenvw -.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Op Fl DLPcenpvw +.Oo Fl i Ar snapshot Ns | Ns Ar bookmark +.Oc .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm send +.Fl -redact Ar redaction_bookmark +.Op Fl DLPcenpv +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar snapshot +.Nm +.Cm send .Op Fl Penv .Fl t Ar receive_resume_token .Nm .Cm receive -.Op Fl Fnsuv -.Op Fl o Sy origin Ns = Ns Ar snapshot +.Op Fl Fnuvs +.Op Fl o Ar origin= .Op Fl o Ar property Ns = Ns Ar value .Op Fl x Ar property .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot @@ -230,6 +237,10 @@ .Fl A .Ar filesystem Ns | Ns Ar volume .Nm +.Cm redact +.Ar snapshot redaction_bookmark +.Op Ar redaction_snapshot Ns ... +.Nm .Cm allow .Ar filesystem Ns | Ns Ar volume .Nm @@ -741,6 +752,11 @@ this opaque token can be provided to .Sy zfs send -t to resume and complete the .Sy zfs receive . +.It Sy redact_snaps +For bookmarks, this is the list of snapshot guids the bookmark contains a redaction +list for. 
+For snapshots, this is the list of snapshot guids the snapshot is redacted with +respect to. .It Sy referenced The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. @@ -3310,7 +3326,7 @@ Displays all ZFS file systems currently mounted. .It Xo .Nm .Cm mount -.Op Fl Olv +.Op Fl Oflv .Op Fl o Ar options .Fl a | Ar filesystem .Xc @@ -3343,6 +3359,8 @@ of this will cause the terminal to interactively block after asking for the key. .It Fl v Report mount progress. +.It Fl f +Attempt to force mounting of all filesystems, even those that couldn't normally be mounted. .El .It Xo .Nm @@ -3613,7 +3631,7 @@ You will be able to receive your streams on future versions of ZFS. .It Xo .Nm .Cm send -.Op Fl LPcenvw +.Op Fl DLPRcenpvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc @@ -3738,6 +3756,94 @@ This information includes a per-second report of how much data has been sent. .It Xo .Nm .Cm send +.Fl -redact Ar redaction_bookmark +.Op Fl DLPcenpv +.br +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar snapshot +.Xc +Generate a redacted send stream. +This send stream contains all blocks from the snapshot being sent that aren't +included in the redaction list contained in the bookmark specified by the +.Fl -redact +flag. +The resulting send stream is said to be redacted with respect to the snapshots +the bookmark specified by the +.Fl -redact No flag was created with. +The bookmark must have been created by running +.Sy zfs redact +on the snapshot being sent. +.sp +This feature can be used to allow clones of a filesystem to be made available on +a remote system, in the case where their parent need not (or needs to not) be +usable. 
+For example, if a filesystem contains sensitive data, and it has clones where +that sensitive data has been secured or replaced with dummy data, redacted sends +can be used to replicate the secured data without replicating the original +sensitive data, while still sharing all possible blocks. +A snapshot that has been redacted with respect to a set of snapshots will +contain all blocks referenced by at least one snapshot in the set, but will +contain none of the blocks referenced by none of the snapshots in the set. +In other words, if all snapshots in the set have modified a given block in the +parent, that block will not be sent; but if one or more snapshots have not +modified a block in the parent, they will still reference the parent's block, so +that block will be sent. +Note that only user data will be redacted. +.sp +When the redacted send stream is received, we will generate a redacted +snapshot. +Due to the nature of redaction, a redacted dataset can only be used in the +following ways: +.sp +1. To receive, as a clone, an incremental send from the original snapshot to one +of the snapshots it was redacted with respect to. +In this case, the stream will produce a valid dataset when received because all +blocks that were redacted in the parent are guaranteed to be present in the +child's send stream. +This use case will produce a normal snapshot, which can be used just like other +snapshots. +.sp +2. To receive an incremental send from the original snapshot to something +redacted with respect to a subset of the set of snapshots the initial snapshot +was redacted with respect to. +In this case, each block that was redacted in the original is still redacted +(redacting with respect to additional snapshots causes less data to be redacted +(because the snapshots define what is permitted, and everything else is +redacted)). +This use case will produce a new redacted snapshot. +.sp +3. 
To receive an incremental send from a redaction bookmark on the original +snapshot that was created when redacting with respect to a subset of the set of +snapshots the initial snapshot was created with respect to to +anything else. +A send stream from such a redaction bookmark will contain all of the blocks +necessary to fill in any redacted data, should it be needed, because the sending +system is aware of what blocks were originally redacted. +This will either produce a normal snapshot or a redacted one, depending on +whether the new send stream is redacted. +.sp +4. To receive an incremental send from a redacted version of the initial +snapshot that is redacted with respect to a subset of the set of snapshots the +initial snapshot was created with respect to. +A send stream from a compatible redacted dataset will contain all of the blocks +necessary to fill in any redacted data. +This will either produce a normal snapshot or a redacted one, depending on +whether the new send stream is redacted. +.sp +5. To receive a full send as a clone of the redacted snapshot. +Since the stream is a full send, it definitionally contains all the data needed +to create a new dataset. +This use case will either produce a normal snapshot or a redacted one, depending +on whether the full send stream was redacted. +.sp +These restrictions are detected and enforced by \fBzfs receive\fR; a +redacted send stream will contain the list of snapshots that the stream is +redacted with respect to. +These are stored with the redacted snapshot, and are used to detect and +correctly handle the cases above. +.It Xo +.Nm +.Cm send .Op Fl Penv .Fl t .Ar receive_resume_token @@ -4021,6 +4127,24 @@ Abort an interrupted deleting its saved partially received state. .It Xo .Nm
+In addition to the typical bookmark information, a redaction bookmark contains +the list of redacted blocks and the list of redaction snapshots specified. +The redacted blocks are blocks in the snapshot which are not referenced by any +of the redaction snapshots. +These blocks are found by iterating over the metadata in each redaction snapshot +to determine what has been changed since the target snapshot. +Redaction is designed to support redacted zfs sends; see the entry for +.Sy zfs send +for more information on the purpose of this operation. +If a redact operation fails partway through (due to an error or a system +failure), the redaction can be resumed by rerunning the same command. +.It Xo +.Nm .Cm allow .Ar filesystem Ns | Ns Ar volume .Xc diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c index ce151d6920f1..dc8257e48594 100644 --- a/module/nvpair/fnvpair.c +++ b/module/nvpair/fnvpair.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
*/ #include @@ -411,6 +411,85 @@ fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name) VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv)); return (rv); } +boolean_t * +fnvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + boolean_t *rv; + VERIFY0(nvlist_lookup_boolean_array(nvl, name, &rv, n)); + return (rv); +} + +uchar_t * +fnvlist_lookup_byte_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uchar_t *rv; + VERIFY0(nvlist_lookup_byte_array(nvl, name, &rv, n)); + return (rv); +} + +int8_t * +fnvlist_lookup_int8_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + int8_t *rv; + VERIFY0(nvlist_lookup_int8_array(nvl, name, &rv, n)); + return (rv); +} + +uint8_t * +fnvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uint8_t *rv; + VERIFY0(nvlist_lookup_uint8_array(nvl, name, &rv, n)); + return (rv); +} + +int16_t * +fnvlist_lookup_int16_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + int16_t *rv; + VERIFY0(nvlist_lookup_int16_array(nvl, name, &rv, n)); + return (rv); +} + +uint16_t * +fnvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uint16_t *rv; + VERIFY0(nvlist_lookup_uint16_array(nvl, name, &rv, n)); + return (rv); +} + +int32_t * +fnvlist_lookup_int32_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + int32_t *rv; + VERIFY0(nvlist_lookup_int32_array(nvl, name, &rv, n)); + return (rv); +} + +uint32_t * +fnvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uint32_t *rv; + VERIFY0(nvlist_lookup_uint32_array(nvl, name, &rv, n)); + return (rv); +} + +int64_t * +fnvlist_lookup_int64_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + int64_t *rv; + VERIFY0(nvlist_lookup_int64_array(nvl, name, &rv, n)); + return (rv); +} + +uint64_t * +fnvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uint64_t *rv; + VERIFY0(nvlist_lookup_uint64_array(nvl, name, &rv, n)); + return (rv); +} boolean_t fnvpair_value_boolean_value(nvpair_t *nvp) diff 
--git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index e630481cbff3..f403e1894f50 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -203,7 +203,8 @@ zfs_mod_supported_feature(const char *name) static void zfeature_register(spa_feature_t fid, const char *guid, const char *name, - const char *desc, zfeature_flags_t flags, const spa_feature_t *deps) + const char *desc, zfeature_flags_t flags, zfeature_type_t type, + const spa_feature_t *deps) { zfeature_info_t *feature = &spa_feature_table[fid]; static spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; @@ -226,6 +227,7 @@ zfeature_register(spa_feature_t fid, const char *guid, const char *name, feature->fi_uname = name; feature->fi_desc = desc; feature->fi_flags = flags; + feature->fi_type = type; feature->fi_depends = deps; feature->fi_zfs_mod_supported = zfs_mod_supported_feature(guid); } @@ -236,32 +238,32 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_ASYNC_DESTROY, "com.delphix:async_destroy", "async_destroy", "Destroy filesystems asynchronously.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, "com.delphix:empty_bpobj", "empty_bpobj", "Snapshots use less space.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_LZ4_COMPRESS, "org.illumos:lz4_compress", "lz4_compress", "LZ4 compression algorithm support.", - ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL); + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", "Crash dumps to multiple vdev pools.", - 0, NULL); + 0, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, "com.delphix:spacemap_histogram", "spacemap_histogram", "Spacemaps maintain space histograms.", - 
ZFEATURE_FLAG_READONLY_COMPAT, NULL); + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_ENABLED_TXG, "com.delphix:enabled_txg", "enabled_txg", "Record txg at which a feature is enabled", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); { static const spa_feature_t hole_birth_deps[] = { @@ -272,24 +274,24 @@ zpool_feature_init(void) "com.delphix:hole_birth", "hole_birth", "Retain hole birth txg for more precise zfs send", ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, - hole_birth_deps); + ZFEATURE_TYPE_BOOLEAN, hole_birth_deps); } zfeature_register(SPA_FEATURE_POOL_CHECKPOINT, "com.delphix:zpool_checkpoint", "zpool_checkpoint", "Pool state can be checkpointed, allowing rewind later.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_SPACEMAP_V2, "com.delphix:spacemap_v2", "spacemap_v2", "Space maps representing large segments are more efficient.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, - NULL); + ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", - 0, NULL); + 0, ZFEATURE_TYPE_BOOLEAN, NULL); { static const spa_feature_t bookmarks_deps[] = { @@ -300,7 +302,8 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_BOOKMARKS, "com.delphix:bookmarks", "bookmarks", "\"zfs bookmark\" command", - ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps); + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + bookmarks_deps); } { @@ -311,14 +314,15 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_FS_SS_LIMIT, "com.joyent:filesystem_limits", "filesystem_limits", "Filesystem and snapshot limits.", - ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps); + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + 
filesystem_limits_deps); } zfeature_register(SPA_FEATURE_EMBEDDED_DATA, "com.delphix:embedded_data", "embedded_data", "Blocks which compress very well use even less space.", ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, - NULL); + ZFEATURE_TYPE_BOOLEAN, NULL); { static const spa_feature_t large_blocks_deps[] = { @@ -328,7 +332,8 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_LARGE_BLOCKS, "org.open-zfs:large_blocks", "large_blocks", "Support for blocks larger than 128KB.", - ZFEATURE_FLAG_PER_DATASET, large_blocks_deps); + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + large_blocks_deps); } { @@ -339,7 +344,8 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_LARGE_DNODE, "org.zfsonlinux:large_dnode", "large_dnode", "Variable on-disk size of dnodes.", - ZFEATURE_FLAG_PER_DATASET, large_dnode_deps); + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + large_dnode_deps); } { @@ -350,8 +356,10 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_SHA512, "org.illumos:sha512", "sha512", "SHA-512/256 hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, sha512_deps); + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + sha512_deps); } + { static const spa_feature_t skein_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, @@ -360,7 +368,8 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_SKEIN, "org.illumos:skein", "skein", "Skein hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, skein_deps); + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + skein_deps); } { @@ -371,12 +380,54 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_EDONR, "org.illumos:edonr", "edonr", "Edon-R hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, edonr_deps); + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + edonr_deps); + } + + { + static const spa_feature_t redact_books_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_REDACTION_BOOKMARKS, + "com.delphix:redaction_bookmarks", 
"redaction_bookmarks", + "Support for bookmarks which store redaction lists for zfs " + "redacted send/recv.", 0, ZFEATURE_TYPE_BOOLEAN, + redact_books_deps); + } + + { + static const spa_feature_t redact_datasets_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_REDACTED_DATASETS, + "com.delphix:redacted_datasets", "redacted_datasets", "Support for " + "redacted datasets, produced by receiving a redacted zfs send " + "stream.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_UINT64_ARRAY, + redact_datasets_deps); } + + { + static const spa_feature_t bookmark_written_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_BOOKMARK_WRITTEN, + "com.delphix:bookmark_written", "bookmark_written", + "Additional accounting, enabling the written# property" + "(space written since a bookmark), and estimates of send stream " + "sizes for incrementals from bookmarks.", + 0, ZFEATURE_TYPE_BOOLEAN, bookmark_written_deps); + } + zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, "com.delphix:device_removal", "device_removal", "Top-level vdevs can be removed, reducing logical pool size.", - ZFEATURE_FLAG_MOS, NULL); + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); + { static const spa_feature_t obsolete_counts_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, @@ -387,8 +438,10 @@ zpool_feature_init(void) "com.delphix:obsolete_counts", "obsolete_counts", "Reduce memory used by removed devices when their blocks are " "freed or remapped.", - ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps); + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + obsolete_counts_deps); } + { static const spa_feature_t userobj_accounting_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, @@ -398,7 +451,7 @@ zpool_feature_init(void) "org.zfsonlinux:userobj_accounting", "userobj_accounting", "User/Group object accounting.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, - 
userobj_accounting_deps); + ZFEATURE_TYPE_BOOLEAN, userobj_accounting_deps); } { @@ -409,7 +462,8 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_ENCRYPTION, "com.datto:encryption", "encryption", "Support for dataset level encryption", - ZFEATURE_FLAG_PER_DATASET, encryption_deps); + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + encryption_deps); } { @@ -421,14 +475,14 @@ zpool_feature_init(void) "org.zfsonlinux:project_quota", "project_quota", "space/object accounting based on project ID.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, - project_quota_deps); + ZFEATURE_TYPE_BOOLEAN, project_quota_deps); } { zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES, "org.zfsonlinux:allocation_classes", "allocation_classes", "Support for separate allocation classes.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); } } diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index 58b23b0e00b0..5ab81884d7d0 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -397,6 +397,7 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) } #if defined(_KERNEL) +EXPORT_SYMBOL(entity_namecheck); EXPORT_SYMBOL(pool_namecheck); EXPORT_SYMBOL(dataset_namecheck); EXPORT_SYMBOL(zfs_component_namecheck); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 4d5bc39e5a4a..d23b750348f7 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright 2016, Joyent, Inc. 
*/ @@ -458,6 +458,10 @@ zfs_prop_init(void) zprop_register_string(ZFS_PROP_KEYLOCATION, "keylocation", "none", PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "prompt | ", "KEYLOCATION"); + zprop_register_string(ZFS_PROP_REDACT_SNAPS, + "redact_snaps", NULL, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "[,...]", + "RSNAPS"); /* readonly number properties */ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, @@ -465,9 +469,10 @@ zfs_prop_init(void) zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, - PROP_READONLY, ZFS_TYPE_DATASET, "", "REFER"); + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", + "REFER"); zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, - PROP_READONLY, ZFS_TYPE_DATASET, + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<1.00x or higher if compressed>", "RATIO"); zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, @@ -495,7 +500,8 @@ zfs_prop_init(void) PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "LUSED"); zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", - 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); + 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", + "LREFER"); zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "", "FSCOUNT"); @@ -566,6 +572,8 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PBKDF2SALT"); zprop_register_hidden(ZFS_PROP_KEY_GUID, "keyguid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "KEYGUID"); + zprop_register_hidden(ZFS_PROP_REDACTED, "redacted", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "REDACTED"); /* * Property to be removed once libbe is integrated @@ -665,8 +673,10 @@ zfs_prop_userquota(const char *name) boolean_t zfs_prop_written(const 
char *name) { - static const char *prefix = "written@"; - return (strncmp(name, prefix, strlen(prefix)) == 0); + static const char *prop_prefix = "written@"; + static const char *book_prefix = "written#"; + return (strncmp(name, prop_prefix, strlen(prop_prefix)) == 0 || + strncmp(name, book_prefix, strlen(book_prefix)) == 0); } /* diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 6fd24757c88c..864b2144cb69 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -34,6 +34,8 @@ $(MODULE)-objs += dmu.o $(MODULE)-objs += dmu_diff.o $(MODULE)-objs += dmu_object.o $(MODULE)-objs += dmu_objset.o +$(MODULE)-objs += dmu_recv.o +$(MODULE)-objs += dmu_redact.o $(MODULE)-objs += dmu_send.o $(MODULE)-objs += dmu_traverse.o $(MODULE)-objs += dmu_tx.o @@ -59,6 +61,7 @@ $(MODULE)-objs += lz4.o $(MODULE)-objs += metaslab.o $(MODULE)-objs += mmp.o $(MODULE)-objs += multilist.o +$(MODULE)-objs += objlist.o $(MODULE)-objs += pathname.o $(MODULE)-objs += policy.o $(MODULE)-objs += range_tree.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5e53f987961a..ba767716fef1 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -6079,6 +6079,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!BP_IS_REDACTED(bp)); top: if (!BP_IS_EMBEDDED(bp)) { diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c index 8f78e8de594e..164216213322 100644 --- a/module/zfs/bptree.c +++ b/module/zfs/bptree.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
*/ #include @@ -156,7 +156,8 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int err; struct bptree_args *ba = arg; - if (bp == NULL || BP_IS_HOLE(bp)) + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_REDACTED(bp)) return (0); err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c index f30253d24bfb..045fdde6779e 100644 --- a/module/zfs/bqueue.c +++ b/module/zfs/bqueue.c @@ -13,7 +13,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2018 by Delphix. All rights reserved. */ #include @@ -27,13 +27,27 @@ obj2node(bqueue_t *q, void *data) /* * Initialize a blocking queue The maximum capacity of the queue is set to - * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, - * and offset should give its offset from the start of the struct. Return 0 on - * success, or -1 on failure. + * size. Types that are stored in a bqueue must contain a bqueue_node_t, + * and node_offset must be its offset from the start of the struct. + * fill_fraction is a performance tuning value; when the queue is full, any + * threads attempting to enqueue records will block. They will block until + * they're signaled, which will occur when the queue is at least 1/fill_fraction + * empty. Similar behavior occurs on dequeue; if the queue is empty, threads + * block. They will be signalled when the queue has 1/fill_fraction full, or + * when bqueue_flush is called. As a result, you must call bqueue_flush when + * you enqueue your final record on a thread, in case the dequeueing threads are + * currently blocked and that enqueue does not cause them to be awoken. + * Alternatively, this behavior can be disabled (causing signaling to happen + * immediately) by setting fill_fraction to any value larger than size. + * Return 0 on success, or -1 on failure. 
*/ int -bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) +bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size, + size_t node_offset) { + if (fill_fraction == 0) { + return (-1); + } list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), node_offset + offsetof(bqueue_node_t, bqn_node)); cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); @@ -42,6 +56,7 @@ bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) q->bq_node_offset = node_offset; q->bq_size = 0; q->bq_maxsize = size; + q->bq_fill_fraction = fill_fraction; return (0); } @@ -53,20 +68,18 @@ bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) void bqueue_destroy(bqueue_t *q) { + mutex_enter(&q->bq_lock); ASSERT0(q->bq_size); cv_destroy(&q->bq_add_cv); cv_destroy(&q->bq_pop_cv); - mutex_destroy(&q->bq_lock); list_destroy(&q->bq_list); + mutex_exit(&q->bq_lock); + mutex_destroy(&q->bq_lock); } -/* - * Add data to q, consuming size units of capacity. If there is insufficient - * capacity to consume size units, block until capacity exists. Asserts size is - * > 0. - */ -void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +static void +bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, + boolean_t flush) { ASSERT3U(item_size, >, 0); ASSERT3U(item_size, <=, q->bq_maxsize); @@ -77,9 +90,38 @@ bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) } q->bq_size += item_size; list_insert_tail(&q->bq_list, data); - cv_signal(&q->bq_pop_cv); + if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) + cv_signal(&q->bq_pop_cv); + if (flush) + cv_broadcast(&q->bq_pop_cv); mutex_exit(&q->bq_lock); } + +/* + * Add data to q, consuming size units of capacity. If there is insufficient + * capacity to consume size units, block until capacity exists. Asserts size is + * > 0. + */ +void +bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +{ + bqueue_enqueue_impl(q, data, item_size, B_FALSE); +} + +/* + * Enqueue an entry, and then flush the queue. 
This forces the popping threads + * to wake up, even if we're below the fill fraction. We have this in a single + * function, rather than having a separate call, because it prevents race + * conditions between the enqueuing thread and the dequeueing thread, where the + * enqueueing thread will wake up the dequeueing thread, that thread will + * destroy the condvar before the enqueuing thread is done. + */ +void +bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) +{ + bqueue_enqueue_impl(q, data, item_size, B_TRUE); +} + /* * Take the first element off of q. If there are no elements on the queue, wait * until one is put there. Return the removed element. @@ -97,7 +139,8 @@ bqueue_dequeue(bqueue_t *q) ASSERT3P(ret, !=, NULL); item_size = obj2node(q, ret)->bqn_size; q->bq_size -= item_size; - cv_signal(&q->bq_add_cv); + if (q->bq_size <= q->bq_maxsize - (q->bq_maxsize / q->bq_fill_fraction)) + cv_signal(&q->bq_add_cv); mutex_exit(&q->bq_lock); return (ret); } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index f7376875afe9..8e42faa95931 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1358,6 +1358,20 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) return (0); } + /* + * Any attempt to read a redacted block should result in an error. This + * will never happen under normal conditions, but can be useful for + * debugging purposes. 
+ */ + if (BP_IS_REDACTED(db->db_blkptr)) { + ASSERT(dsl_dataset_feature_is_active( + db->db_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + DB_DNODE_EXIT(db); + mutex_exit(&db->db_mtx); + return (SET_ERROR(EIO)); + } + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); @@ -2367,11 +2381,23 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN); } -#pragma weak dmu_buf_fill_done = dbuf_fill_done +static void +dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct dirty_leaf *dl; + + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + dl = &db->db_last_dirty->dt.dl; + dl->dr_overridden_by = *bp; + dl->dr_override_state = DR_OVERRIDDEN; + dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; +} + /* ARGSUSED */ void -dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) +dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); @@ -2426,6 +2452,31 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; } +void +dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + dmu_object_type_t type; + ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + + DB_DNODE_ENTER(db); + type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + ASSERT0(db->db_level); + dmu_buf_will_not_fill(dbuf, tx); + + blkptr_t bp = { { { {0} } } }; + BP_SET_TYPE(&bp, type); + BP_SET_LEVEL(&bp, 0); + BP_SET_BIRTH(&bp, tx->tx_txg, 0); + BP_SET_REDACTED(&bp); + BPE_SET_LSIZE(&bp, dbuf->db_size); + + dbuf_override_impl(db, &bp, tx); +} + /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
@@ -2792,6 +2843,36 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, return (db); } +/* + * This function returns a block pointer and information about the object, + * given a dnode and a block. This is a publicly accessible version of + * dbuf_findbp that only returns some information, rather than the + * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock + * should be locked as (at least) a reader. + */ +int +dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, + blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift) +{ + dmu_buf_impl_t *dbp = NULL; + blkptr_t *bp2; + int err = 0; + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2); + if (err == 0) { + *bp = *bp2; + if (dbp != NULL) + dbuf_rele(dbp, NULL); + if (datablkszsec != NULL) + *datablkszsec = dn->dn_phys->dn_datablkszsec; + if (indblkshift != NULL) + *indblkshift = dn->dn_phys->dn_indblkshift; + } + + return (err); +} + typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. 
*/ @@ -2809,7 +2890,12 @@ typedef struct dbuf_prefetch_arg { static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + ASSERT(!BP_IS_REDACTED(bp) || + dsl_dataset_feature_is_active( + dpa->dpa_dnode->dn_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; @@ -2887,7 +2973,11 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - if (BP_IS_HOLE(bp)) { + ASSERT(!BP_IS_REDACTED(bp) || + dsl_dataset_feature_is_active( + dpa->dpa_dnode->dn_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); @@ -2991,7 +3081,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } - if (BP_IS_HOLE(&bp)) + ASSERT(!BP_IS_REDACTED(&bp) || + dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp)) return; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8779eb3586ca..cf51579dd3c9 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. 
@@ -1227,6 +1227,20 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, dmu_buf_rele(db, FTAG); } +void +dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) +{ + int numbufs, i; + dmu_buf_t **dbp; + + VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, + &numbufs, &dbp)); + for (i = 0; i < numbufs; i++) + dmu_buf_redact(dbp[i], tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + /* * DMU support for xuio */ diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 76c32b126423..180f90f94949 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include @@ -115,7 +115,8 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); - if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) + if (zb->zb_level == ZB_DNODE_LEVEL || + zb->zb_object != DMU_META_DNODE_OBJECT) return (0); if (BP_IS_HOLE(bp)) { diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index afee869b6d76..9b5cf125f397 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -337,7 +337,8 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) if (*objectp == 0) { start_obj = 1; - } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { + } else if (ds && dsl_dataset_feature_is_active(ds, + SPA_FEATURE_LARGE_DNODE)) { uint64_t i = *objectp + 1; uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); dmu_object_info_t doi; diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 3c9a817f7bec..4856e602c785 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -58,7 +58,7 @@ #include #include #include -#include +#include #include #include "zfs_namecheck.h" @@ -411,6 +411,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); + ASSERT(!BP_IS_REDACTED(bp)); /* * The $ORIGIN dataset (if it exists) doesn't have an associated @@ -1073,14 +1074,14 @@ dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, (!os->os_encrypted || !dmu_objset_is_receiving(os))) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; if (dmu_objset_userobjused_enabled(os)) { - ds->ds_feature_activation_needed[ - SPA_FEATURE_USEROBJ_ACCOUNTING] = B_TRUE; + ds->ds_feature_activation[ + SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; os->os_phys->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; } if (dmu_objset_projectquota_enabled(os)) { - ds->ds_feature_activation_needed[ - SPA_FEATURE_PROJECT_QUOTA] = B_TRUE; + ds->ds_feature_activation[ + SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; os->os_phys->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE; } @@ -2371,11 +2372,11 @@ dmu_objset_id_quota_upgrade_cb(objset_t *os) dmu_objset_userobjspace_present(os)) return (SET_ERROR(ENOTSUP)); - dmu_objset_ds(os)->ds_feature_activation_needed[ - SPA_FEATURE_USEROBJ_ACCOUNTING] = B_TRUE; + dmu_objset_ds(os)->ds_feature_activation[ + SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; if (dmu_objset_projectquota_enabled(os)) - dmu_objset_ds(os)->ds_feature_activation_needed[ - SPA_FEATURE_PROJECT_QUOTA] = B_TRUE; + dmu_objset_ds(os)->ds_feature_activation[ + SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; err = 
dmu_objset_space_upgrade(os); if (err) diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c new file mode 100644 index 000000000000..31731ad40e1c --- /dev/null +++ b/module/zfs/dmu_recv.c @@ -0,0 +1,3011 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif + +int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; +int zfs_recv_queue_ff = 20; + +static char *dmu_recv_tag = "dmu_recv_tag"; +const char *recv_clone_name = "%recv"; + +static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, + void *buf); + +struct receive_record_arg { + dmu_replay_record_t header; + void *payload; /* Pointer to a buffer containing the payload */ + /* + * If the record is a write, pointer to the arc_buf_t containing the + * payload. + */ + arc_buf_t *arc_buf; + int payload_size; + uint64_t bytes_read; /* bytes read from stream when record created */ + boolean_t eos_marker; /* Marks the end of the stream */ + bqueue_node_t node; +}; + +struct receive_writer_arg { + objset_t *os; + boolean_t byteswap; + bqueue_t q; + + /* + * These three args are used to signal to the main thread that we're + * done. + */ + kmutex_t mutex; + kcondvar_t cv; + boolean_t done; + + int err; + /* A map from guid to dataset to help handle dedup'd streams. 
*/ + avl_tree_t *guid_to_ds_map; + boolean_t resumable; + boolean_t raw; + uint64_t last_object; + uint64_t last_offset; + uint64_t max_object; /* highest object ID referenced in stream */ + uint64_t bytes_read; /* bytes read when current record created */ + + /* Encryption parameters for the last received DRR_OBJECT_RANGE */ + boolean_t or_crypt_params_present; + uint64_t or_firstobj; + uint64_t or_numslots; + uint8_t or_salt[ZIO_DATA_SALT_LEN]; + uint8_t or_iv[ZIO_DATA_IV_LEN]; + uint8_t or_mac[ZIO_DATA_MAC_LEN]; + boolean_t or_byteorder; +}; + +typedef struct guid_map_entry { + uint64_t guid; + boolean_t raw; + dsl_dataset_t *gme_ds; + avl_node_t avlnode; +} guid_map_entry_t; + +typedef struct dmu_recv_begin_arg { + const char *drba_origin; + dmu_recv_cookie_t *drba_cookie; + cred_t *drba_cred; + dsl_crypto_params_t *drba_dcp; + uint64_t drba_snapobj; +} dmu_recv_begin_arg_t; + +static void +byteswap_record(dmu_replay_record_t *drr) +{ +#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) +#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); + + switch (drr->drr_type) { + case DRR_BEGIN: + DO64(drr_begin.drr_magic); + DO64(drr_begin.drr_versioninfo); + DO64(drr_begin.drr_creation_time); + DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); + DO64(drr_begin.drr_toguid); + DO64(drr_begin.drr_fromguid); + break; + case DRR_OBJECT: + DO64(drr_object.drr_object); + DO32(drr_object.drr_type); + DO32(drr_object.drr_bonustype); + DO32(drr_object.drr_blksz); + DO32(drr_object.drr_raw_bonuslen); + DO64(drr_object.drr_toguid); + DO64(drr_object.drr_maxblkid); + break; + case DRR_FREEOBJECTS: + DO64(drr_freeobjects.drr_firstobj); + DO64(drr_freeobjects.drr_numobjs); + DO64(drr_freeobjects.drr_toguid); + break; + case DRR_WRITE: + DO64(drr_write.drr_object); + DO32(drr_write.drr_type); + DO64(drr_write.drr_offset); + DO64(drr_write.drr_logical_size); + 
DO64(drr_write.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); + DO64(drr_write.drr_key.ddk_prop); + DO64(drr_write.drr_compressed_size); + break; + case DRR_WRITE_BYREF: + DO64(drr_write_byref.drr_object); + DO64(drr_write_byref.drr_offset); + DO64(drr_write_byref.drr_length); + DO64(drr_write_byref.drr_toguid); + DO64(drr_write_byref.drr_refguid); + DO64(drr_write_byref.drr_refobject); + DO64(drr_write_byref.drr_refoffset); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. + drr_key.ddk_cksum); + DO64(drr_write_byref.drr_key.ddk_prop); + break; + case DRR_WRITE_EMBEDDED: + DO64(drr_write_embedded.drr_object); + DO64(drr_write_embedded.drr_offset); + DO64(drr_write_embedded.drr_length); + DO64(drr_write_embedded.drr_toguid); + DO32(drr_write_embedded.drr_lsize); + DO32(drr_write_embedded.drr_psize); + break; + case DRR_FREE: + DO64(drr_free.drr_object); + DO64(drr_free.drr_offset); + DO64(drr_free.drr_length); + DO64(drr_free.drr_toguid); + break; + case DRR_SPILL: + DO64(drr_spill.drr_object); + DO64(drr_spill.drr_length); + DO64(drr_spill.drr_toguid); + DO64(drr_spill.drr_compressed_size); + DO32(drr_spill.drr_type); + break; + case DRR_OBJECT_RANGE: + DO64(drr_object_range.drr_firstobj); + DO64(drr_object_range.drr_numslots); + DO64(drr_object_range.drr_toguid); + break; + case DRR_REDACT: + DO64(drr_redact.drr_object); + DO64(drr_redact.drr_offset); + DO64(drr_redact.drr_length); + DO64(drr_redact.drr_toguid); + break; + case DRR_END: + DO64(drr_end.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); + break; + default: + break; + } + + if (drr->drr_type != DRR_BEGIN) { + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); + } + +#undef DO64 +#undef DO32 +} + +static boolean_t +redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) +{ + for (int i = 0; i < num_snaps; i++) { + if (snaps[i] == guid) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Check that the new stream we're trying 
to receive is redacted with respect to + * a subset of the snapshots that the origin was redacted with respect to. For + * the reasons behind this, see the man page on redacted zfs sends and receives. + */ +static boolean_t +compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps, + uint64_t *redact_snaps, uint64_t num_redact_snaps) +{ + /* + * Short circuit the comparison; if we are redacted with respect to + * more snapshots than the origin, we can't be redacted with respect + * to a subset. + */ + if (num_redact_snaps > origin_num_snaps) { + return (B_FALSE); + } + + for (int i = 0; i < num_redact_snaps; i++) { + if (!redact_snaps_contains(origin_snaps, origin_num_snaps, + redact_snaps[i])) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +static boolean_t +redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin) +{ + uint64_t *origin_snaps; + uint64_t origin_num_snaps; + dmu_recv_cookie_t *drc = drba->drba_cookie; + struct drr_begin *drrb = drc->drc_drrb; + int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + int err = 0; + boolean_t ret = B_TRUE; + uint64_t *redact_snaps; + uint_t numredactsnaps; + + /* + * If this is a full send stream, we're safe no matter what. + */ + if (drrb->drr_fromguid == 0) + return (ret); + + VERIFY(dsl_dataset_get_uint64_array_feature(origin, + SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps)); + + if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) == + 0) { + /* + * If the send stream was sent from the redaction bookmark or + * the redacted version of the dataset, then we're safe. Verify + * that this is from a compatible redaction bookmark or + * redacted dataset. 
+ */ + if (!compatible_redact_snaps(origin_snaps, origin_num_snaps, + redact_snaps, numredactsnaps)) { + err = EINVAL; + } + } else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { + /* + * If the stream is redacted, it must be redacted with respect + * to a subset of what the origin is redacted with respect to. + * See case number 2 in the zfs man page section on redacted zfs + * send. + */ + err = nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps); + + if (err != 0 || !compatible_redact_snaps(origin_snaps, + origin_num_snaps, redact_snaps, numredactsnaps)) { + err = EINVAL; + } + } else if (!redact_snaps_contains(origin_snaps, origin_num_snaps, + drrb->drr_toguid)) { + /* + * If the stream isn't redacted but the origin is, this must be + * one of the snapshots the origin is redacted with respect to. + * See case number 1 in the zfs man page section on redacted zfs + * send. + */ + err = EINVAL; + } + + if (err != 0) + ret = B_FALSE; + return (ret); +} + +static int +recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, + uint64_t fromguid, uint64_t featureflags) +{ + uint64_t val; + int error; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; + boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; + boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0; + + /* temporary clone name must not exist */ + error = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, + 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? SET_ERROR(EBUSY) : error); + + /* new snapshot name must not exist */ + error = zap_lookup(dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, + drba->drba_cookie->drc_tosnap, 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? SET_ERROR(EEXIST) : error); + + /* + * Check snapshot limit before receiving. 
We'll recheck again at the + * end, but might as well abort before receiving if we're already over + * the limit. + * + * Note that we do not check the file system limit with + * dsl_dir_fscount_check because the temporary %clones don't count + * against that limit. + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, + NULL, drba->drba_cred); + if (error != 0) + return (error); + + if (fromguid != 0) { + dsl_dataset_t *snap; + uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + + /* Can't perform a raw receive on top of a non-raw receive */ + if (!encrypted && raw) + return (SET_ERROR(EINVAL)); + + /* Encryption is incompatible with embedded data */ + if (encrypted && embed) + return (SET_ERROR(EINVAL)); + + /* Find snapshot in this dir that matches fromguid. */ + while (obj != 0) { + error = dsl_dataset_hold_obj(dp, obj, FTAG, + &snap); + if (error != 0) + return (SET_ERROR(ENODEV)); + if (snap->ds_dir != ds->ds_dir) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ENODEV)); + } + if (dsl_dataset_phys(snap)->ds_guid == fromguid) + break; + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + } + if (obj == 0) + return (SET_ERROR(ENODEV)); + + if (drba->drba_cookie->drc_force) { + drba->drba_snapobj = obj; + } else { + /* + * If we are not forcing, there must be no + * changes since fromsnap. + */ + if (dsl_dataset_modified_since_snap(ds, snap)) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ETXTBSY)); + } + drba->drba_snapobj = ds->ds_prev->ds_object; + } + + if (dsl_dataset_feature_is_active(snap, + SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba, + snap)) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(EINVAL)); + } + + dsl_dataset_rele(snap, FTAG); + } else { + /* if full, then must be forced */ + if (!drba->drba_cookie->drc_force) + return (SET_ERROR(EEXIST)); + + /* + * We don't support using zfs recv -F to blow away + * encrypted filesystems. 
This would require the + * dsl dir to point to the old encryption key and + * the new one at the same time during the receive. + */ + if ((!encrypted && raw) || encrypted) + return (SET_ERROR(EINVAL)); + + /* + * Perform the same encryption checks we would if + * we were creating a new dataset from scratch. + */ + if (!raw) { + boolean_t will_encrypt; + + error = dmu_objset_create_crypt_check( + ds->ds_dir->dd_parent, drba->drba_dcp, + &will_encrypt); + if (error != 0) + return (error); + + if (will_encrypt && embed) + return (SET_ERROR(EINVAL)); + } + } + + return (0); + +} + +/* + * Check that any feature flags used in the data stream we're receiving are + * supported by the pool we are receiving into. + * + * Note that some of the features we explicitly check here have additional + * (implicit) features they depend on, but those dependencies are enforced + * through the zfeature_register() calls declaring the features that we + * explicitly check. + */ +static int +recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa) +{ + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * LZ4 compressed, embedded, mooched, large blocks, and large_dnodes + * in the stream can only be used if those pool features are enabled + * because we don't attempt to decompress / un-embed / un-mooch / + * split up the blocks / dnodes during the receive process. 
+ */ + if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) + return (SET_ERROR(ENOTSUP)); + + /* + * Receiving redacted streams requires that redacted datasets are + * enabled. + */ + if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) && + !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS)) + return (SET_ERROR(ENOTSUP)); + + return (0); +} + +static int +dmu_recv_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + uint64_t fromguid = drrb->drr_fromguid; + int flags = drrb->drr_flags; + ds_hold_flags_t dsflags = 0; + int error; + uint64_t featureflags = drba->drba_cookie->drc_featureflags; + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) + return (SET_ERROR(EINVAL)); + + error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa); + if (error != 0) + return (error); + + /* Resumable receives require extensible datasets */ + if (drba->drba_cookie->drc_resumable && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) + return (SET_ERROR(ENOTSUP)); + + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + /* raw 
receives require the encryption feature */ + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) + return (SET_ERROR(ENOTSUP)); + + /* embedded data is incompatible with encryption and raw recv */ + if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + return (SET_ERROR(EINVAL)); + } else { + dsflags |= DS_HOLD_FLAG_DECRYPT; + } + + error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); + if (error == 0) { + /* target fs already exists; recv into temp clone */ + + /* Can't recv a clone into an existing fs */ + if (flags & DRR_FLAG_CLONE || drba->drba_origin) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = recv_begin_check_existing_impl(drba, ds, fromguid, + featureflags); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + } else if (error == ENOENT) { + /* target fs does not exist; must be a full backup or clone */ + char buf[ZFS_MAX_DATASET_NAME_LEN]; + + /* + * If it's a non-clone incremental, we are missing the + * target fs, so fail the recv. + */ + if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) || + drba->drba_origin)) + return (SET_ERROR(ENOENT)); + + /* + * If we're receiving a full send as a clone, and it doesn't + * contain all the necessary free records and freeobject + * records, reject it. + */ + if (fromguid == 0 && drba->drba_origin != NULL && + !(flags & DRR_FLAG_FREERECORDS)) + return (SET_ERROR(EINVAL)); + + /* Open the parent of tofs */ + ASSERT3U(strlen(tofs), <, sizeof (buf)); + (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); + error = dsl_dataset_hold_flags(dp, buf, dsflags, FTAG, &ds); + if (error != 0) + return (error); + + if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && + drba->drba_origin == NULL) { + boolean_t will_encrypt; + + /* + * Check that we aren't breaking any encryption rules + * and that we have all the parameters we need to + * create an encrypted dataset if necessary. If we are + * making an encrypted dataset the stream can't have + * embedded data. 
+ */ + error = dmu_objset_create_crypt_check(ds->ds_dir, + drba->drba_dcp, &will_encrypt); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + + if (will_encrypt && + (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + } + + /* + * Check filesystem and snapshot limits before receiving. We'll + * recheck snapshot limits again at the end (we create the + * filesystems and increment those counts during begin_sync). + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + + if (drba->drba_origin != NULL) { + dsl_dataset_t *origin; + error = dsl_dataset_hold_flags(dp, drba->drba_origin, + dsflags, FTAG, &origin); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + if (!origin->ds_is_snapshot) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + if (dsl_dataset_phys(origin)->ds_guid != fromguid && + fromguid != 0) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(ENODEV)); + } + + if (origin->ds_dir->dd_crypto_obj != 0 && + (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * If the origin is redacted we need to verify that this + * send stream can safely be received on top of the + * origin. 
+ */ + if (dsl_dataset_feature_is_active(origin, + SPA_FEATURE_REDACTED_DATASETS)) { + if (!redact_check(drba, origin)) { + dsl_dataset_rele_flags(origin, dsflags, + FTAG); + dsl_dataset_rele_flags(ds, dsflags, + FTAG); + return (SET_ERROR(EINVAL)); + } + } + + dsl_dataset_rele_flags(origin, dsflags, FTAG); + } + dsl_dataset_rele_flags(ds, dsflags, FTAG); + error = 0; + } + return (error); +} + +static void +dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dmu_recv_cookie_t *drc = drba->drba_cookie; + struct drr_begin *drrb = drc->drc_drrb; + const char *tofs = drc->drc_tofs; + uint64_t featureflags = drc->drc_featureflags; + dsl_dataset_t *ds, *newds; + objset_t *os; + uint64_t dsobj; + ds_hold_flags_t dsflags = 0; + int error; + uint64_t crflags = 0; + dsl_crypto_params_t dummy_dcp = { 0 }; + dsl_crypto_params_t *dcp = drba->drba_dcp; + + if (drrb->drr_flags & DRR_FLAG_CI_DATA) + crflags |= DS_FLAG_CI_DATASET; + + if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0) + dsflags |= DS_HOLD_FLAG_DECRYPT; + + /* + * Raw, non-incremental recvs always use a dummy dcp with + * the raw cmd set. Raw incremental recvs do not use a dcp + * since the encryption parameters are already set in stone. 
+ */ + if (dcp == NULL && drrb->drr_fromguid == 0 && + drba->drba_origin == NULL) { + ASSERT3P(dcp, ==, NULL); + dcp = &dummy_dcp; + + if (featureflags & DMU_BACKUP_FEATURE_RAW) + dcp->cp_cmd = DCP_CMD_RAW_RECV; + } + + error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); + if (error == 0) { + /* create temporary clone */ + dsl_dataset_t *snap = NULL; + + if (drba->drba_snapobj != 0) { + VERIFY0(dsl_dataset_hold_obj(dp, + drba->drba_snapobj, FTAG, &snap)); + ASSERT3P(dcp, ==, NULL); + } + dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, + snap, crflags, drba->drba_cred, dcp, tx); + if (drba->drba_snapobj != 0) + dsl_dataset_rele(snap, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + } else { + dsl_dir_t *dd; + const char *tail; + dsl_dataset_t *origin = NULL; + + VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); + + if (drba->drba_origin != NULL) { + VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin)); + ASSERT3P(dcp, ==, NULL); + } + + /* Create new dataset. */ + dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, + origin, crflags, drba->drba_cred, dcp, tx); + if (origin != NULL) + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(dd, FTAG); + drc->drc_newfs = B_TRUE; + } + VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag, + &newds)); + if (dsl_dataset_feature_is_active(newds, + SPA_FEATURE_REDACTED_DATASETS)) { + /* + * If the origin dataset is redacted, the child will be redacted + * when we create it. We clear the new dataset's + * redaction info; if it should be redacted, we'll fill + * in its information later. 
+ */ + dsl_dataset_deactivate_feature(newds, + SPA_FEATURE_REDACTED_DATASETS, tx); + } + VERIFY0(dmu_objset_from_ds(newds, &os)); + + if (drc->drc_resumable) { + dsl_dataset_zapify(newds, tx); + if (drrb->drr_fromguid != 0) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, + 8, 1, &drrb->drr_fromguid, tx)); + } + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, + 8, 1, &drrb->drr_toguid, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, + 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); + uint64_t one = 1; + uint64_t zero = 0; + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, + 8, 1, &one, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, + 8, 1, &zero, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, + 8, 1, &zero, tx)); + if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, + 8, 1, &one, tx)); + } + if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, + 8, 1, &one, tx)); + } + if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, + 8, 1, &one, tx)); + } + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK, + 8, 1, &one, tx)); + } + + uint64_t *redact_snaps; + uint_t numredactsnaps; + if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, + &numredactsnaps) == 0) { + VERIFY0(zap_add(mos, dsobj, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, + sizeof (*redact_snaps), numredactsnaps, + redact_snaps, tx)); + } + } + + /* + * Usually the os->os_encrypted value is tied to the presence of a + * DSL Crypto Key object in the dd. However, that will not be received + * until dmu_recv_stream(), so we set the value manually for now. 
+ */
+	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+		os->os_encrypted = B_TRUE;
+		drba->drba_cookie->drc_raw = B_TRUE;
+	}
+
+
+	if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
+		uint64_t *redact_snaps;
+		uint_t numredactsnaps;
+		/* A redacted stream must carry its redaction snapshot list. */
+		VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+		    BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps));
+		dsl_dataset_activate_redaction(newds, redact_snaps,
+		    numredactsnaps, tx);
+	}
+
+	dmu_buf_will_dirty(newds->ds_dbuf, tx);
+	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	/*
+	 * If we actually created a non-clone, we need to create the objset
+	 * in our new dataset. If this is a raw send we postpone this until
+	 * dmu_recv_stream() so that we can allocate the metadnode with the
+	 * properties from the DRR_BEGIN payload.
+	 */
+	rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
+	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
+	    (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) {
+		(void) dmu_objset_create_impl(dp->dp_spa,
+		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
+	}
+	rrw_exit(&newds->ds_bp_rwlock, FTAG);
+
+	drba->drba_cookie->drc_ds = newds;
+
+	spa_history_log_internal_ds(newds, "receive", tx, "");
+}
+
+/*
+ * Check phase of the sync task for resuming an interrupted receive:
+ * verify that the stream's feature flags are supported, that the
+ * partially-received dataset (%recv child or tofs itself) exists, is
+ * marked inconsistent and unowned, that its resume ZAP entries match
+ * the stream's toguid/fromguid, and that a redacted stream is redacted
+ * with respect to the same snapshots as the original send.
+ */
+static int
+dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dmu_recv_cookie_t *drc = drba->drba_cookie;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	struct drr_begin *drrb = drc->drc_drrb;
+	int error;
+	ds_hold_flags_t dsflags = 0;
+	dsl_dataset_t *ds;
+	const char *tofs = drc->drc_tofs;
+
+	/* already checked */
+	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+	ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING);
+
+	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+	    DMU_COMPOUNDSTREAM ||
+	    drrb->drr_type >= DMU_OST_NUMTYPES)
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * This is mostly a sanity check since we should have already done these
+	 * checks during a previous attempt to receive the data.
+	 */
+	error = recv_begin_check_feature_flags_impl(drc->drc_featureflags,
+	    dp->dp_spa);
+	if (error != 0)
+		return (error);
+
+	/* 6 extra bytes for /%recv */
+	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
+	    tofs, recv_clone_name);
+
+	if ((drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
+		dsflags |= DS_HOLD_FLAG_DECRYPT;
+
+	if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
+		/* %recv does not exist; continue in tofs */
+		error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
+		if (error != 0)
+			return (error);
+	}
+
+	/* check that ds is marked inconsistent */
+	if (!DS_IS_INCONSISTENT(ds)) {
+		dsl_dataset_rele_flags(ds, dsflags, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/* check that there is resuming data, and that the toguid matches */
+	if (!dsl_dataset_is_zapified(ds)) {
+		dsl_dataset_rele_flags(ds, dsflags, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	uint64_t val;
+	error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+	    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
+	if (error != 0 || drrb->drr_toguid != val) {
+		dsl_dataset_rele_flags(ds, dsflags, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Check if the receive is still running. If so, it will be owned.
+	 * Note that nothing else can own the dataset (e.g. after the receive
+	 * fails) because it will be marked inconsistent.
+	 */
+	if (dsl_dataset_has_owner(ds)) {
+		dsl_dataset_rele_flags(ds, dsflags, FTAG);
+		return (SET_ERROR(EBUSY));
+	}
+
+	/* There should not be any snapshots of this fs yet. */
+	if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
+		dsl_dataset_rele_flags(ds, dsflags, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Note: resume point will be checked when we process the first WRITE
+	 * record.
+	 */
+
+	/* check that the origin matches */
+	val = 0;
+	(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
+	    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
+	if (drrb->drr_fromguid != val) {
+		dsl_dataset_rele_flags(ds, dsflags, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * If we're resuming, and the send is redacted, then the original send
+	 * must have been redacted, and must have been redacted with respect to
+	 * the same snapshots.
+	 */
+	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) {
+		uint64_t num_ds_redact_snaps;
+		uint64_t *ds_redact_snaps;
+
+		uint_t num_stream_redact_snaps;
+		uint64_t *stream_redact_snaps;
+
+		if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+		    BEGINNV_REDACT_SNAPS, &stream_redact_snaps,
+		    &num_stream_redact_snaps) != 0) {
+			dsl_dataset_rele_flags(ds, dsflags, FTAG);
+			return (SET_ERROR(EINVAL));
+		}
+
+		if (!dsl_dataset_get_uint64_array_feature(ds,
+		    SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps,
+		    &ds_redact_snaps)) {
+			dsl_dataset_rele_flags(ds, dsflags, FTAG);
+			return (SET_ERROR(EINVAL));
+		}
+
+		/*
+		 * NOTE(review): 'int i' is compared against the uint64_t
+		 * num_ds_redact_snaps — mixed signedness. Counts here are
+		 * small in practice, but confirm.
+		 */
+		for (int i = 0; i < num_ds_redact_snaps; i++) {
+			if (!redact_snaps_contains(ds_redact_snaps,
+			    num_ds_redact_snaps, stream_redact_snaps[i])) {
+				dsl_dataset_rele_flags(ds, dsflags, FTAG);
+				return (SET_ERROR(EINVAL));
+			}
+		}
+	}
+	dsl_dataset_rele_flags(ds, dsflags, FTAG);
+	return (0);
+}
+
+/*
+ * Sync phase for a resuming receive: (re)own the partially-received
+ * dataset — the %recv child if it exists, otherwise tofs itself — on
+ * behalf of the receive, and record it in the receive cookie.
+ */
+static void
+dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	const char *tofs = drba->drba_cookie->drc_tofs;
+	uint64_t featureflags = drba->drba_cookie->drc_featureflags;
+	dsl_dataset_t *ds;
+	ds_hold_flags_t dsflags = 0;
+	/* 6 extra bytes for /%recv */
+	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+	(void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs,
+	    recv_clone_name);
+
+	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+		drba->drba_cookie->drc_raw = B_TRUE;
+	} else {
+		dsflags |= DS_HOLD_FLAG_DECRYPT;
+	}
+
+	if
(dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds)
+	    != 0) {
+		/* %recv does not exist; continue in tofs */
+		VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag,
+		    &ds));
+		drba->drba_cookie->drc_newfs = B_TRUE;
+	}
+
+	ASSERT(DS_IS_INCONSISTENT(ds));
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) ||
+	    drba->drba_cookie->drc_raw);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+	drba->drba_cookie->drc_ds = ds;
+
+	spa_history_log_internal_ds(ds, "resume receive", tx, "");
+}
+
+/*
+ * Open-context entry point for receive: validate and (if needed)
+ * byteswap the BEGIN record, fold it into the running stream checksum,
+ * read the BEGIN payload nvlist, and run the appropriate begin sync
+ * task (fresh receive vs. resuming receive).
+ *
+ * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
+ * succeeds; otherwise we will leak the holds on the datasets.
+ */
+int
+dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
+    boolean_t force, boolean_t resumable, nvlist_t *localprops,
+    nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc,
+    vnode_t *vp, offset_t *voffp)
+{
+	dmu_recv_begin_arg_t drba = { 0 };
+	int err;
+
+	bzero(drc, sizeof (dmu_recv_cookie_t));
+	drc->drc_drr_begin = drr_begin;
+	drc->drc_drrb = &drr_begin->drr_u.drr_begin;
+	drc->drc_tosnap = tosnap;
+	drc->drc_tofs = tofs;
+	drc->drc_force = force;
+	drc->drc_resumable = resumable;
+	drc->drc_cred = CRED();
+	drc->drc_clone = (origin != NULL);
+
+	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+		drc->drc_byteswap = B_TRUE;
+		(void) fletcher_4_incremental_byteswap(drr_begin,
+		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+		byteswap_record(drr_begin);
+	} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
+		(void) fletcher_4_incremental_native(drr_begin,
+		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+	} else {
+		/* not a send stream at all */
+		return (SET_ERROR(EINVAL));
+	}
+
+	drc->drc_vp = vp;
+	drc->drc_voff = *voffp;
+	drc->drc_featureflags =
+	    DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+
+	uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
+	void *payload = NULL;
+	if (payloadlen != 0)
+		payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+
+	err = receive_read_payload_and_next_header(drc, payloadlen,
+	    payload);
+	if (err != 0) {
+		/*
+		 * NOTE(review): when payloadlen == 0, payload is still NULL
+		 * here — confirm kmem_free(NULL, 0) is accepted by the SPL.
+		 */
+		kmem_free(payload, payloadlen);
+		return (err);
+	}
+	if (payloadlen != 0) {
+		err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl,
+		    KM_SLEEP);
+		kmem_free(payload, payloadlen);
+		if (err != 0) {
+			kmem_free(drc->drc_next_rrd,
+			    sizeof (*drc->drc_next_rrd));
+			return (err);
+		}
+	}
+
+	drba.drba_origin = origin;
+	drba.drba_cookie = drc;
+	drba.drba_cred = CRED();
+
+	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+		err = dsl_sync_task(tofs,
+		    dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
+		    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
+	} else {
+		/*
+		 * NOTE(review): this inner 'err' shadows the function-scope
+		 * 'err', and this branch returns directly below, so the
+		 * drc_next_rrd/drc_begin_nvl cleanup at the bottom of this
+		 * function never runs when the non-resuming sync task fails
+		 * — verify that is intentional (possible leak).
+		 */
+		int err;
+
+		/*
+		 * For non-raw, non-incremental, non-resuming receives the
+		 * user can specify encryption parameters on the command line
+		 * with "zfs recv -o". For these receives we create a dcp and
+		 * pass it to the sync task. Creating the dcp will implicitly
+		 * remove the encryption params from the localprops nvlist,
+		 * which avoids errors when trying to set these normally
+		 * read-only properties. Any other kind of receive that
+		 * attempts to set these properties will fail as a result.
+		 */
+		if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
+		    DMU_BACKUP_FEATURE_RAW) == 0 &&
+		    origin == NULL && drc->drc_drrb->drr_fromguid == 0) {
+			err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
+			    localprops, hidden_args, &drba.drba_dcp);
+			if (err != 0)
+				return (err);
+		}
+
+		err = dsl_sync_task(tofs,
+		    dmu_recv_begin_check, dmu_recv_begin_sync,
+		    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
+		dsl_crypto_params_free(drba.drba_dcp, !!err);
+
+		return (err);
+	}
+
+	if (err != 0) {
+		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+		nvlist_free(drc->drc_begin_nvl);
+	}
+	return (err);
+}
+
+/* AVL comparator: order guid_map entries by their snapshot GUID. */
+static int
+guid_compare(const void *arg1, const void *arg2)
+{
+	const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1;
+	const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2;
+
+	return (AVL_CMP(gmep1->guid, gmep2->guid));
+}
+
+/*
+ * On-exit callback: tear down the guid->dataset map, disowning each
+ * held dataset (without DECRYPT for raw receives) and freeing the map.
+ */
+static void
+free_guid_map_onexit(void *arg)
+{
+	avl_tree_t *ca = arg;
+	void *cookie = NULL;
+	guid_map_entry_t *gmep;
+
+	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
+		ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT;
+
+		if (gmep->raw) {
+			gmep->gme_ds->ds_objset->os_raw_receive = B_FALSE;
+			dsflags &= ~DS_HOLD_FLAG_DECRYPT;
+		}
+
+		dsl_dataset_disown(gmep->gme_ds, dsflags, gmep);
+		kmem_free(gmep, sizeof (guid_map_entry_t));
+	}
+	avl_destroy(ca);
+	kmem_free(ca, sizeof (avl_tree_t));
+}
+
+/*
+ * Read exactly len bytes from the stream's vnode into buf, advancing
+ * the stream offset and byte counters; a short read maps to ECKSUM
+ * (stream interrupted, potentially resumable).
+ */
+static int
+receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
+{
+	int done = 0;
+
+	/*
+	 * The code doesn't rely on this (lengths being multiples of 8). See
+	 * comment in dump_bytes.
+	 */
+	ASSERT(len % 8 == 0 ||
+	    (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
+
+	while (done < len) {
+		ssize_t resid;
+
+		drc->drc_err = vn_rdwr(UIO_READ, drc->drc_vp,
+		    (char *)buf + done, len - done,
+		    drc->drc_voff, UIO_SYSSPACE, FAPPEND,
+		    RLIM64_INFINITY, CRED(), &resid);
+
+		if (resid == len - done) {
+			/*
+			 * Note: ECKSUM indicates that the receive
+			 * was interrupted and can potentially be resumed.
+ */ + drc->drc_err = SET_ERROR(ECKSUM); + } + drc->drc_voff += len - done - resid; + done = len - resid; + if (drc->drc_err != 0) + return (drc->drc_err); + } + + drc->drc_bytes_read += len; + + ASSERT3U(done, ==, len); + return (0); +} + +static inline uint8_t +deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) +{ + if (bonus_type == DMU_OT_SA) { + return (1); + } else { + return (1 + + ((DN_OLD_MAX_BONUSLEN - + MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); + } +} + +static void +save_resume_state(struct receive_writer_arg *rwa, + uint64_t object, uint64_t offset, dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + if (!rwa->resumable) + return; + + /* + * We use ds_resume_bytes[] != 0 to indicate that we need to + * update this on disk, so it must not be 0. + */ + ASSERT(rwa->bytes_read != 0); + + /* + * We only resume from write records, which have a valid + * (non-meta-dnode) object number. + */ + ASSERT(object != 0); + + /* + * For resuming to work correctly, we must receive records in order, + * sorted by object,offset. This is checked by the callers, but + * assert it here for good measure. + */ + ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); + ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || + offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); + ASSERT3U(rwa->bytes_read, >=, + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); + + rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; + rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; +} + +noinline static int +receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, + void *data) +{ + dmu_object_info_t doi; + dmu_tx_t *tx; + uint64_t object; + int err; + uint8_t dn_slots = drro->drr_dn_slots != 0 ? 
+ drro->drr_dn_slots : DNODE_MIN_SLOTS; + + if (drro->drr_type == DMU_OT_NONE || + !DMU_OT_IS_VALID(drro->drr_type) || + !DMU_OT_IS_VALID(drro->drr_bonustype) || + drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || + P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || + drro->drr_blksz < SPA_MINBLOCKSIZE || + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || + drro->drr_bonuslen > + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || + dn_slots > + (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { + return (SET_ERROR(EINVAL)); + } + + if (rwa->raw) { + /* + * We should have received a DRR_OBJECT_RANGE record + * containing this block and stored it in rwa. + */ + if (drro->drr_object < rwa->or_firstobj || + drro->drr_object >= rwa->or_firstobj + rwa->or_numslots || + drro->drr_raw_bonuslen < drro->drr_bonuslen || + drro->drr_indblkshift > SPA_MAXBLOCKSHIFT || + drro->drr_nlevels > DN_MAX_LEVELS || + drro->drr_nblkptr > DN_MAX_NBLKPTR || + DN_SLOTS_TO_BONUSLEN(dn_slots) < + drro->drr_raw_bonuslen) + return (SET_ERROR(EINVAL)); + } else { + if (drro->drr_flags != 0 || drro->drr_raw_bonuslen != 0 || + drro->drr_indblkshift != 0 || drro->drr_nlevels != 0 || + drro->drr_nblkptr != 0) + return (SET_ERROR(EINVAL)); + } + + err = dmu_object_info(rwa->os, drro->drr_object, &doi); + + if (err != 0 && err != ENOENT && err != EEXIST) + return (SET_ERROR(EINVAL)); + + if (drro->drr_object > rwa->max_object) + rwa->max_object = drro->drr_object; + + /* + * If we are losing blkptrs or changing the block size this must + * be a new file instance. We must clear out the previous file + * contents before we can change this type of metadata in the dnode. + * Raw receives will also check that the indirect structure of the + * dnode hasn't changed. + */ + if (err == 0) { + uint32_t indblksz = drro->drr_indblkshift ? 
+ 1ULL << drro->drr_indblkshift : 0; + int nblkptr = deduce_nblkptr(drro->drr_bonustype, + drro->drr_bonuslen); + + object = drro->drr_object; + + /* nblkptr will be bounded by the bonus size and type */ + if (rwa->raw && nblkptr != drro->drr_nblkptr) + return (SET_ERROR(EINVAL)); + + if (drro->drr_blksz != doi.doi_data_block_size || + nblkptr < doi.doi_nblkptr || + dn_slots != doi.doi_dnodesize >> DNODE_SHIFT || + (rwa->raw && + (indblksz != doi.doi_metadata_block_size || + drro->drr_nlevels < doi.doi_indirection))) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + 0, DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + + /* + * The dmu does not currently support decreasing nlevels + * on an object. For non-raw sends, this does not matter + * and the new object can just use the previous one's nlevels. + * For raw sends, however, the structure of the received dnode + * (including nlevels) must match that of the send side. + * Therefore, instead of using dmu_object_reclaim(), we must + * free the object completely and call dmu_object_claim_dnsize() + * instead. + */ + if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) || + dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { + err = dmu_free_long_object(rwa->os, drro->drr_object); + if (err != 0) + return (SET_ERROR(EINVAL)); + + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + object = DMU_NEW_OBJECT; + } + } else if (err == EEXIST) { + /* + * The object requested is currently an interior slot of a + * multi-slot dnode. This will be resolved when the next txg + * is synced out, since the send stream will have told us + * to free this slot when we freed the associated dnode + * earlier in the stream. 
+ */ + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + object = drro->drr_object; + } else { + /* object is free and we are about to allocate a new one */ + object = DMU_NEW_OBJECT; + } + + /* + * If this is a multi-slot dnode there is a chance that this + * object will expand into a slot that is already used by + * another object from the previous snapshot. We must free + * these objects before we attempt to allocate the new dnode. + */ + if (dn_slots > 1) { + boolean_t need_sync = B_FALSE; + + for (uint64_t slot = drro->drr_object + 1; + slot < drro->drr_object + dn_slots; + slot++) { + dmu_object_info_t slot_doi; + + err = dmu_object_info(rwa->os, slot, &slot_doi); + if (err == ENOENT || err == EEXIST) + continue; + else if (err != 0) + return (err); + + err = dmu_free_long_object(rwa->os, slot); + + if (err != 0) + return (err); + + need_sync = B_TRUE; + } + + if (need_sync) + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + } + + tx = dmu_tx_create(rwa->os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_write(tx, object, 0, 0); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + if (object == DMU_NEW_OBJECT) { + /* currently free, want to be allocated */ + err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, + dn_slots << DNODE_SHIFT, tx); + } else if (drro->drr_type != doi.doi_type || + drro->drr_blksz != doi.doi_data_block_size || + drro->drr_bonustype != doi.doi_bonus_type || + drro->drr_bonuslen != doi.doi_bonus_size) { + /* currently allocated, but with different properties */ + err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, + dn_slots << DNODE_SHIFT, tx); + } + if (err != 0) { + dmu_tx_commit(tx); + return (SET_ERROR(EINVAL)); + } + + if (rwa->or_crypt_params_present) { + /* + * Set the crypt params for the buffer associated with this + * 
range of dnodes. This causes the blkptr_t to have the + * same crypt params (byteorder, salt, iv, mac) as on the + * sending side. + * + * Since we are committing this tx now, it is possible for + * the dnode block to end up on-disk with the incorrect MAC, + * if subsequent objects in this block are received in a + * different txg. However, since the dataset is marked as + * inconsistent, no code paths will do a non-raw read (or + * decrypt the block / verify the MAC). The receive code and + * scrub code can safely do raw reads and verify the + * checksum. They don't need to verify the MAC. + */ + dmu_buf_t *db = NULL; + uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE; + + err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os), + offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT); + if (err != 0) { + dmu_tx_commit(tx); + return (SET_ERROR(EINVAL)); + } + + dmu_buf_set_crypt_params(db, rwa->or_byteorder, + rwa->or_salt, rwa->or_iv, rwa->or_mac, tx); + + dmu_buf_rele(db, FTAG); + + rwa->or_crypt_params_present = B_FALSE; + } + + dmu_object_set_checksum(rwa->os, drro->drr_object, + drro->drr_checksumtype, tx); + dmu_object_set_compress(rwa->os, drro->drr_object, + drro->drr_compress, tx); + + /* handle more restrictive dnode structuring for raw recvs */ + if (rwa->raw) { + /* + * Set the indirect block shift and nlevels. This will not fail + * because we ensured all of the blocks were free earlier if + * this is a new object. 
+ */ + VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, + drro->drr_blksz, drro->drr_indblkshift, tx)); + VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, + drro->drr_nlevels, tx)); + VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object, + drro->drr_maxblkid, tx)); + } + + if (data != NULL) { + dmu_buf_t *db; + uint32_t flags = DMU_READ_NO_PREFETCH; + + if (rwa->raw) + flags |= DMU_READ_NO_DECRYPT; + + VERIFY0(dmu_bonus_hold_impl(rwa->os, drro->drr_object, + FTAG, flags, &db)); + dmu_buf_will_dirty(db, tx); + + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro)); + + /* + * Raw bonus buffers have their byteorder determined by the + * DRR_OBJECT_RANGE record. + */ + if (rwa->byteswap && !rwa->raw) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drro->drr_bonustype); + dmu_ot_byteswap[byteswap].ob_func(db->db_data, + DRR_OBJECT_PAYLOAD_SIZE(drro)); + } + dmu_buf_rele(db, FTAG); + } + dmu_tx_commit(tx); + + return (0); +} + +/* ARGSUSED */ +noinline static int +receive_freeobjects(struct receive_writer_arg *rwa, + struct drr_freeobjects *drrfo) +{ + uint64_t obj; + int next_err = 0; + + if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) + return (SET_ERROR(EINVAL)); + + for (obj = drrfo->drr_firstobj == 0 ? 
1 : drrfo->drr_firstobj; + obj < drrfo->drr_firstobj + drrfo->drr_numobjs && + obj < DN_MAX_OBJECT && next_err == 0; + next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { + dmu_object_info_t doi; + int err; + + err = dmu_object_info(rwa->os, obj, &doi); + if (err == ENOENT) + continue; + else if (err != 0) + return (err); + + err = dmu_free_long_object(rwa->os, obj); + + if (err != 0) + return (err); + } + if (next_err != ESRCH) + return (next_err); + return (0); +} + +noinline static int +receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, + arc_buf_t *abuf) +{ + int err; + dmu_tx_t *tx; + dnode_t *dn; + + if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || + !DMU_OT_IS_VALID(drrw->drr_type)) + return (SET_ERROR(EINVAL)); + + /* + * For resuming to work, records must be in increasing order + * by (object, offset). + */ + if (drrw->drr_object < rwa->last_object || + (drrw->drr_object == rwa->last_object && + drrw->drr_offset < rwa->last_offset)) { + return (SET_ERROR(EINVAL)); + } + rwa->last_object = drrw->drr_object; + rwa->last_offset = drrw->drr_offset; + + if (rwa->last_object > rwa->max_object) + rwa->max_object = rwa->last_object; + + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + tx = dmu_tx_create(rwa->os); + dmu_tx_hold_write(tx, drrw->drr_object, + drrw->drr_offset, drrw->drr_logical_size); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + if (rwa->byteswap && !arc_is_encrypted(abuf) && + arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } + + /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ + VERIFY0(dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn)); + dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx); + dnode_rele(dn, FTAG); + + /* + * Note: If 
the receive fails, we want the resume stream to start + * with the same record that we last successfully received (as opposed + * to the next record), so that we can verify that we are + * resuming from the correct location. + */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); + dmu_tx_commit(tx); + + return (0); +} + +/* + * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed + * streams to refer to a copy of the data that is already on the + * system because it came in earlier in the stream. This function + * finds the earlier copy of the data, and uses that copy instead of + * data from the stream to fulfill this write. + */ +noinline static int +receive_write_byref(struct receive_writer_arg *rwa, + struct drr_write_byref *drrwbr) +{ + dmu_tx_t *tx; + int err; + guid_map_entry_t gmesrch; + guid_map_entry_t *gmep; + avl_index_t where; + objset_t *ref_os = NULL; + int flags = DMU_READ_PREFETCH; + dmu_buf_t *dbp; + + if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) + return (SET_ERROR(EINVAL)); + + /* + * If the GUID of the referenced dataset is different from the + * GUID of the target dataset, find the referenced dataset. 
+ */ + if (drrwbr->drr_toguid != drrwbr->drr_refguid) { + gmesrch.guid = drrwbr->drr_refguid; + if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, + &where)) == NULL) { + return (SET_ERROR(EINVAL)); + } + if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) + return (SET_ERROR(EINVAL)); + } else { + ref_os = rwa->os; + } + + if (drrwbr->drr_object > rwa->max_object) + rwa->max_object = drrwbr->drr_object; + + if (rwa->raw) + flags |= DMU_READ_NO_DECRYPT; + + /* may return either a regular db or an encrypted one */ + err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, + drrwbr->drr_refoffset, FTAG, &dbp, flags); + if (err != 0) + return (err); + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_write(tx, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + if (rwa->raw) { + dmu_copy_from_buf(rwa->os, drrwbr->drr_object, + drrwbr->drr_offset, dbp, tx); + } else { + dmu_write(rwa->os, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + } + dmu_buf_rele(dbp, FTAG); + + /* See comment in restore_write. 
*/ + save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); + dmu_tx_commit(tx); + return (0); +} + +static int +receive_write_embedded(struct receive_writer_arg *rwa, + struct drr_write_embedded *drrwe, void *data) +{ + dmu_tx_t *tx; + int err; + + if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) + return (SET_ERROR(EINVAL)); + + if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) + return (SET_ERROR(EINVAL)); + + if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) + return (SET_ERROR(EINVAL)); + if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) + return (SET_ERROR(EINVAL)); + if (rwa->raw) + return (SET_ERROR(EINVAL)); + + if (drrwe->drr_object > rwa->max_object) + rwa->max_object = drrwe->drr_object; + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_write(tx, drrwe->drr_object, + drrwe->drr_offset, drrwe->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + dmu_write_embedded(rwa->os, drrwe->drr_object, + drrwe->drr_offset, data, drrwe->drr_etype, + drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, + rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); + + /* See comment in restore_write. 
*/ + save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); + dmu_tx_commit(tx); + return (0); +} + +static int +receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, + arc_buf_t *abuf) +{ + dmu_tx_t *tx; + dmu_buf_t *db, *db_spill; + int err; + + if (drrs->drr_length < SPA_MINBLOCKSIZE || + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) + return (SET_ERROR(EINVAL)); + + if (rwa->raw) { + if (!DMU_OT_IS_VALID(drrs->drr_type) || + drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS || + drrs->drr_compressed_size == 0) + return (SET_ERROR(EINVAL)); + } + + if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + if (drrs->drr_object > rwa->max_object) + rwa->max_object = drrs->drr_object; + + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); + if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG, + &db_spill)) != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_spill(tx, db->db_object); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + dmu_tx_abort(tx); + return (err); + } + + if (db_spill->db_size < drrs->drr_length) + VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); + + if (rwa->byteswap && !arc_is_encrypted(abuf) && + arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrs->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + DRR_SPILL_PAYLOAD_SIZE(drrs)); + } + + dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); + + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +noinline static int +receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) +{ + int err; + + if (drrf->drr_length != -1ULL && + drrf->drr_offset + drrf->drr_length < drrf->drr_offset) + return (SET_ERROR(EINVAL)); + + if 
(dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + if (drrf->drr_object > rwa->max_object) + rwa->max_object = drrf->drr_object; + + err = dmu_free_long_range(rwa->os, drrf->drr_object, + drrf->drr_offset, drrf->drr_length); + + return (err); +} + +static int +receive_object_range(struct receive_writer_arg *rwa, + struct drr_object_range *drror) +{ + /* + * By default, we assume this block is in our native format + * (ZFS_HOST_BYTEORDER). We then take into account whether + * the send stream is byteswapped (rwa->byteswap). Finally, + * we need to byteswap again if this particular block was + * in non-native format on the send side. + */ + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^ + !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags); + + /* + * Since dnode block sizes are constant, we should not need to worry + * about making sure that the dnode block size is the same on the + * sending and receiving sides for the time being. For non-raw sends, + * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE + * record at all). Raw sends require this record type because the + * encryption parameters are used to protect an entire block of bonus + * buffers. If the size of dnode blocks ever becomes variable, + * handling will need to be added to ensure that dnode block sizes + * match on the sending and receiving side. + */ + if (drror->drr_numslots != DNODES_PER_BLOCK || + P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 || + !rwa->raw) + return (SET_ERROR(EINVAL)); + + if (drror->drr_firstobj > rwa->max_object) + rwa->max_object = drror->drr_firstobj; + + /* + * The DRR_OBJECT_RANGE handling must be deferred to receive_object() + * so that the block of dnodes is not written out when it's empty, + * and converted to a HOLE BP. 
+ */
+	rwa->or_crypt_params_present = B_TRUE;
+	rwa->or_firstobj = drror->drr_firstobj;
+	rwa->or_numslots = drror->drr_numslots;
+	bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN);
+	bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN);
+	bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN);
+	rwa->or_byteorder = byteorder;
+
+	return (0);
+}
+
+/*
+ * Until we have the ability to redact large ranges of data efficiently, we
+ * process these records as frees.
+ */
+/* ARGSUSED */
+noinline static int
+receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr)
+{
+	/* Translate the REDACT record into an equivalent FREE record. */
+	struct drr_free drrf = {0};
+	drrf.drr_length = drrr->drr_length;
+	drrf.drr_object = drrr->drr_object;
+	drrf.drr_offset = drrr->drr_offset;
+	drrf.drr_toguid = drrr->drr_toguid;
+	return (receive_free(rwa, &drrf));
+}
+
+/* used to destroy the drc_ds on error */
+static void
+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
+{
+	dsl_dataset_t *ds = drc->drc_ds;
+	ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
+
+	/*
+	 * Wait for the txg sync before cleaning up the receive. For
+	 * resumable receives, this ensures that our resume state has
+	 * been written out to disk. For raw receives, this ensures
+	 * that the user accounting code will not attempt to do anything
+	 * after we stopped receiving the dataset.
+	 */
+	txg_wait_synced(ds->ds_dir->dd_pool, 0);
+	ds->ds_objset->os_raw_receive = B_FALSE;
+
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	if (drc->drc_resumable && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
+		/* Keep the partial state on disk so the receive can resume. */
+		rrw_exit(&ds->ds_bp_rwlock, FTAG);
+		dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
+	} else {
+		/* Non-resumable (or nothing received yet): destroy it. */
+		char name[ZFS_MAX_DATASET_NAME_LEN];
+		rrw_exit(&ds->ds_bp_rwlock, FTAG);
+		dsl_dataset_name(ds, name);
+		dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
+		(void) dsl_destroy_head(name);
+	}
+}
+
+/*
+ * Fold len bytes at buf into the running stream checksum, using the
+ * byteswapped fletcher4 variant when the stream is opposite-endian.
+ */
+static void
+receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf)
+{
+	if (drc->drc_byteswap) {
+		(void) fletcher_4_incremental_byteswap(buf, len,
+		    &drc->drc_cksum);
+	} else {
+		(void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum);
+	}
+}
+
+/*
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate drc->drc_next_rrd and read the next record's header into
+ * drc->drc_next_rrd->header.
+ * Verify checksum of payload and next record.
 */
static int
receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf)
{
	int err;

	if (len != 0) {
		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
		err = receive_read(drc, len, buf);
		if (err != 0)
			return (err);
		receive_cksum(drc, len, buf);

		/* note: rrd is NULL when reading the begin record's payload */
		if (drc->drc_rrd != NULL) {
			drc->drc_rrd->payload = buf;
			drc->drc_rrd->payload_size = len;
			drc->drc_rrd->bytes_read = drc->drc_bytes_read;
		}
	}

	/*
	 * Snapshot the checksum-so-far; DRR_END records are verified against
	 * this value (the checksum up to, but not including, the END record).
	 */
	drc->drc_prev_cksum = drc->drc_cksum;

	drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP);
	err = receive_read(drc, sizeof (drc->drc_next_rrd->header),
	    &drc->drc_next_rrd->header);
	drc->drc_next_rrd->bytes_read = drc->drc_bytes_read;

	if (err != 0) {
		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
		drc->drc_next_rrd = NULL;
		return (err);
	}
	/* A BEGIN record mid-stream means the stream is malformed. */
	if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) {
		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
		drc->drc_next_rrd = NULL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Note: checksum is of everything up to but not including the
	 * checksum itself.
	 */
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	receive_cksum(drc,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &drc->drc_next_rrd->header);

	/*
	 * cksum_orig preserves the on-the-wire (pre-byteswap) checksum bytes
	 * so they can be folded back into the running checksum below; cksump
	 * aliases the header field so the comparison sees the (possibly
	 * byteswapped) value.
	 */
	zio_cksum_t cksum_orig =
	    drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
	zio_cksum_t *cksump =
	    &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;

	if (drc->drc_byteswap)
		byteswap_record(&drc->drc_next_rrd->header);

	/* An all-zero checksum means the sender did not compute one. */
	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
	    !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) {
		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
		drc->drc_next_rrd = NULL;
		return (SET_ERROR(ECKSUM));
	}

	receive_cksum(drc, sizeof (cksum_orig), &cksum_orig);

	return (0);
}

/*
 * Issue the prefetch reads for any necessary indirect blocks.
 *
 * We use the object ignore list to tell us whether or not to issue prefetches
 * for a given object. We do this for both correctness (in case the blocksize
 * of an object has changed) and performance (if the object doesn't exist, don't
 * needlessly try to issue prefetches). We also trim the list as we go through
 * the stream to prevent it from growing to an unbounded size.
 *
 * The object numbers within will always be in sorted order, and any write
 * records we see will also be in sorted order, but they're not sorted with
 * respect to each other (i.e. we can get several object records before
 * receiving each object's write records). As a result, once we've reached a
 * given object number, we can safely remove any reference to lower object
 * numbers in the ignore list. In practice, we receive up to 32 object records
 * before receiving write records, so the list can have up to 32 nodes in it.
 */
/* ARGSUSED */
static void
receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset,
    uint64_t length)
{
	if (!objlist_exists(drc->drc_ignore_objlist, object)) {
		dmu_prefetch(drc->drc_os, object, 1, offset, length,
		    ZIO_PRIORITY_SYNC_READ);
	}
}

/*
 * Read records off the stream, issuing any necessary prefetches.
 */
static int
receive_read_record(dmu_recv_cookie_t *drc)
{
	int err;

	switch (drc->drc_rrd->header.drr_type) {
	case DRR_OBJECT:
	{
		struct drr_object *drro =
		    &drc->drc_rrd->header.drr_u.drr_object;
		uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
		void *buf = kmem_zalloc(size, KM_SLEEP);
		dmu_object_info_t doi;

		err = receive_read_payload_and_next_header(drc, size, buf);
		if (err != 0) {
			kmem_free(buf, size);
			return (err);
		}
		err = dmu_object_info(drc->drc_os, drro->drr_object, &doi);
		/*
		 * See receive_read_prefetch for an explanation why we're
		 * storing this object in the ignore_obj_list.
		 */
		if (err == ENOENT || err == EEXIST ||
		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
			objlist_insert(drc->drc_ignore_objlist,
			    drro->drr_object);
			err = 0;
		}
		return (err);
	}
	case DRR_FREEOBJECTS:
	{
		err = receive_read_payload_and_next_header(drc, 0, NULL);
		return (err);
	}
	case DRR_WRITE:
	{
		struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
		arc_buf_t *abuf;
		boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);

		/*
		 * Loan the appropriate kind of ARC buffer (raw / compressed /
		 * plain) so the payload can be read directly into it.
		 */
		if (drc->drc_raw) {
			boolean_t byteorder = ZFS_HOST_BYTEORDER ^
			    !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
			    drc->drc_byteswap;

			abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os),
			    drrw->drr_object, byteorder, drrw->drr_salt,
			    drrw->drr_iv, drrw->drr_mac, drrw->drr_type,
			    drrw->drr_compressed_size, drrw->drr_logical_size,
			    drrw->drr_compressiontype);
		} else if (DRR_WRITE_COMPRESSED(drrw)) {
			ASSERT3U(drrw->drr_compressed_size, >, 0);
			ASSERT3U(drrw->drr_logical_size, >=,
			    drrw->drr_compressed_size);
			ASSERT(!is_meta);
			abuf = arc_loan_compressed_buf(
			    dmu_objset_spa(drc->drc_os),
			    drrw->drr_compressed_size, drrw->drr_logical_size,
			    drrw->drr_compressiontype);
		} else {
			abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os),
			    is_meta, drrw->drr_logical_size);
		}

		err = receive_read_payload_and_next_header(drc,
		    DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data);
		if (err != 0) {
			dmu_return_arcbuf(abuf);
			return (err);
		}
		/* Ownership of abuf passes to the writer thread via rrd. */
		drc->drc_rrd->arc_buf = abuf;
		receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset,
		    drrw->drr_logical_size);
		return (err);
	}
	case DRR_WRITE_BYREF:
	{
		struct drr_write_byref *drrwb =
		    &drc->drc_rrd->header.drr_u.drr_write_byref;
		err = receive_read_payload_and_next_header(drc, 0, NULL);
		receive_read_prefetch(drc, drrwb->drr_object, drrwb->drr_offset,
		    drrwb->drr_length);
		return (err);
	}
	case DRR_WRITE_EMBEDDED:
	{
		struct drr_write_embedded *drrwe =
		    &drc->drc_rrd->header.drr_u.drr_write_embedded;
		/* Embedded payloads are padded out to 8-byte alignment. */
		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
		void *buf = kmem_zalloc(size, KM_SLEEP);

		err = receive_read_payload_and_next_header(drc, size, buf);
		if (err != 0) {
			kmem_free(buf, size);
			return (err);
		}

		receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset,
		    drrwe->drr_length);
		return (err);
	}
	case DRR_FREE:
	case DRR_REDACT:
	{
		/*
		 * It might be beneficial to prefetch indirect blocks here, but
		 * we don't really have the data to decide for sure.
		 */
		err = receive_read_payload_and_next_header(drc, 0, NULL);
		return (err);
	}
	case DRR_END:
	{
		struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end;
		/*
		 * Verify the END record against the checksum accumulated up
		 * to (but not including) this record.
		 */
		if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum,
		    drre->drr_checksum))
			return (SET_ERROR(ECKSUM));
		return (0);
	}
	case DRR_SPILL:
	{
		struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
		arc_buf_t *abuf;
		/* DRR_SPILL records are either raw or uncompressed */
		if (drc->drc_raw) {
			boolean_t byteorder = ZFS_HOST_BYTEORDER ^
			    !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
			    drc->drc_byteswap;

			abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os),
			    drrs->drr_object, byteorder, drrs->drr_salt,
			    drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
			    drrs->drr_compressed_size, drrs->drr_length,
			    drrs->drr_compressiontype);
		} else {
			abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os),
			    DMU_OT_IS_METADATA(drrs->drr_type),
			    drrs->drr_length);
		}
		err = receive_read_payload_and_next_header(drc,
		    DRR_SPILL_PAYLOAD_SIZE(drrs), abuf->b_data);
		if (err != 0)
			dmu_return_arcbuf(abuf);
		else
			drc->drc_rrd->arc_buf = abuf;
		return (err);
	}
	case DRR_OBJECT_RANGE:
	{
		err = receive_read_payload_and_next_header(drc, 0, NULL);
		return (err);

	}
	default:
		return (SET_ERROR(EINVAL));
	}
}

/*
 * Debug-log a summary of the given record and the error it produced.
 * Compiled to a no-op body unless ZFS_DEBUG is defined.
 */
static void
dprintf_drr(struct receive_record_arg *rrd, int err)
{
#ifdef ZFS_DEBUG
	switch (rrd->header.drr_type) {
	case DRR_OBJECT:
	{
		struct drr_object *drro = &rrd->header.drr_u.drr_object;
		dprintf("drr_type = OBJECT obj = %llu type = %u "
		    "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u "
		    "compress = %u dn_slots = %u err = %d\n",
		    drro->drr_object, drro->drr_type, drro->drr_bonustype,
		    drro->drr_blksz, drro->drr_bonuslen,
		    drro->drr_checksumtype, drro->drr_compress,
		    drro->drr_dn_slots, err);
		break;
	}
	case DRR_FREEOBJECTS:
	{
		struct drr_freeobjects *drrfo =
		    &rrd->header.drr_u.drr_freeobjects;
		dprintf("drr_type = FREEOBJECTS firstobj = %llu "
		    "numobjs = %llu err = %d\n",
		    drrfo->drr_firstobj, drrfo->drr_numobjs, err);
		break;
	}
	case DRR_WRITE:
	{
		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
		dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu "
		    "lsize = %llu cksumtype = %u cksumflags = %u "
		    "compress = %u psize = %llu err = %d\n",
		    drrw->drr_object, drrw->drr_type, drrw->drr_offset,
		    drrw->drr_logical_size, drrw->drr_checksumtype,
		    drrw->drr_flags, drrw->drr_compressiontype,
		    drrw->drr_compressed_size, err);
		break;
	}
	case DRR_WRITE_BYREF:
	{
		struct drr_write_byref *drrwbr =
		    &rrd->header.drr_u.drr_write_byref;
		dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu "
		    "length = %llu toguid = %llx refguid = %llx "
		    "refobject = %llu refoffset = %llu cksumtype = %u "
		    "cksumflags = %u err = %d\n",
		    drrwbr->drr_object, drrwbr->drr_offset,
		    drrwbr->drr_length, drrwbr->drr_toguid,
		    drrwbr->drr_refguid, drrwbr->drr_refobject,
		    drrwbr->drr_refoffset, drrwbr->drr_checksumtype,
		    drrwbr->drr_flags, err);
		break;
	}
	case DRR_WRITE_EMBEDDED:
	{
		struct drr_write_embedded *drrwe =
		    &rrd->header.drr_u.drr_write_embedded;
		dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu "
		    "length = %llu compress = %u etype = %u lsize = %u "
		    "psize = %u err = %d\n",
		    drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length,
		    drrwe->drr_compression, drrwe->drr_etype,
		    drrwe->drr_lsize, drrwe->drr_psize, err);
		break;
	}
	case DRR_FREE:
	{
		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
		dprintf("drr_type = FREE obj = %llu offset = %llu "
		    "length = %lld err = %d\n",
		    drrf->drr_object, drrf->drr_offset, drrf->drr_length,
		    err);
		break;
	}
	case DRR_SPILL:
	{
		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
		dprintf("drr_type = SPILL obj = %llu length = %llu "
		    "err = %d\n", drrs->drr_object, drrs->drr_length, err);
		break;
	}
	default:
		return;
	}
#endif
}

/*
 * Commit the records to the pool.
 */
static int
receive_process_record(struct receive_writer_arg *rwa,
    struct receive_record_arg *rrd)
{
	int err;

	/* Processing in order, therefore bytes_read should be increasing. */
	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
	rwa->bytes_read = rrd->bytes_read;

	switch (rrd->header.drr_type) {
	case DRR_OBJECT:
	{
		struct drr_object *drro = &rrd->header.drr_u.drr_object;
		err = receive_object(rwa, drro, rrd->payload);
		kmem_free(rrd->payload, rrd->payload_size);
		rrd->payload = NULL;
		break;
	}
	case DRR_FREEOBJECTS:
	{
		struct drr_freeobjects *drrfo =
		    &rrd->header.drr_u.drr_freeobjects;
		err = receive_freeobjects(rwa, drrfo);
		break;
	}
	case DRR_WRITE:
	{
		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
		err = receive_write(rwa, drrw, rrd->arc_buf);
		/* if receive_write() is successful, it consumes the arc_buf */
		if (err != 0)
			dmu_return_arcbuf(rrd->arc_buf);
		rrd->arc_buf = NULL;
		rrd->payload = NULL;
		break;
	}
	case DRR_WRITE_BYREF:
	{
		struct drr_write_byref *drrwbr =
		    &rrd->header.drr_u.drr_write_byref;
		err = receive_write_byref(rwa, drrwbr);
		break;
	}
	case DRR_WRITE_EMBEDDED:
	{
		struct drr_write_embedded *drrwe =
		    &rrd->header.drr_u.drr_write_embedded;
		err = receive_write_embedded(rwa, drrwe, rrd->payload);
		kmem_free(rrd->payload, rrd->payload_size);
		rrd->payload = NULL;
		break;
	}
	case DRR_FREE:
	{
		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
		err = receive_free(rwa, drrf);
		break;
	}
	case DRR_SPILL:
	{
		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
		err = receive_spill(rwa, drrs, rrd->arc_buf);
		/* like DRR_WRITE, success consumes the loaned arc_buf */
		if (err != 0)
			dmu_return_arcbuf(rrd->arc_buf);
		rrd->arc_buf = NULL;
		rrd->payload = NULL;
		break;
	}
	case DRR_OBJECT_RANGE:
	{
		struct drr_object_range *drror =
		    &rrd->header.drr_u.drr_object_range;
		err = receive_object_range(rwa, drror);
		break;
	}
	case DRR_REDACT:
	{
		struct drr_redact *drrr = &rrd->header.drr_u.drr_redact;
		err = receive_redact(rwa, drrr);
		break;
	}
	default:
		return (SET_ERROR(EINVAL));
	}

	if (err != 0)
		dprintf_drr(rrd, err);

	return (err);
}

/*
 * dmu_recv_stream's worker thread; pull records off the queue, and then call
 * receive_process_record When we're done, signal the main thread and exit.
 */
static void
receive_writer_thread(void *arg)
{
	struct receive_writer_arg *rwa = arg;
	struct receive_record_arg *rrd;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
	    rrd = bqueue_dequeue(&rwa->q)) {
		/*
		 * If there's an error, the main thread will stop putting things
		 * on the queue, but we need to clear everything in it before we
		 * can exit.
		 */
		if (rwa->err == 0) {
			rwa->err = receive_process_record(rwa, rrd);
		} else if (rrd->arc_buf != NULL) {
			/* Draining after an error: release loaned buffers. */
			dmu_return_arcbuf(rrd->arc_buf);
			rrd->arc_buf = NULL;
			rrd->payload = NULL;
		} else if (rrd->payload != NULL) {
			kmem_free(rrd->payload, rrd->payload_size);
			rrd->payload = NULL;
		}
		kmem_free(rrd, sizeof (*rrd));
	}
	/* Free the eos marker itself. */
	kmem_free(rrd, sizeof (*rrd));
	mutex_enter(&rwa->mutex);
	rwa->done = B_TRUE;
	cv_signal(&rwa->cv);
	mutex_exit(&rwa->mutex);
	spl_fstrans_unmark(cookie);
	thread_exit();
}

/*
 * Verify that a resuming stream's (object, offset) matches the resume
 * state recorded on the dataset; EINVAL if they disagree.
 */
static int
resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl)
{
	uint64_t val;
	objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset;
	uint64_t dsobj = dmu_objset_id(drc->drc_os);
	uint64_t resume_obj, resume_off;

	if (nvlist_lookup_uint64(begin_nvl,
	    "resume_object", &resume_obj) != 0 ||
	    nvlist_lookup_uint64(begin_nvl,
	    "resume_offset", &resume_off) != 0) {
		return (SET_ERROR(EINVAL));
	}
	VERIFY0(zap_lookup(mos, dsobj,
	    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
	if (resume_obj != val)
		return (SET_ERROR(EINVAL));
	VERIFY0(zap_lookup(mos, dsobj,
	    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
	if (resume_off != val)
		return (SET_ERROR(EINVAL));

	return (0);
}

/*
 * Read in the
stream's records, one by one, and apply them to the pool. There
 * are two threads involved; the thread that calls this function will spin up a
 * worker thread, read the records off the stream one by one, and issue
 * prefetches for any necessary indirect blocks. It will then push the records
 * onto an internal blocking queue. The worker thread will pull the records off
 * the queue, and actually write the data into the DMU. This way, the worker
 * thread doesn't have to wait for reads to complete, since everything it needs
 * (the indirect blocks) will be prefetched.
 *
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, int cleanup_fd,
    uint64_t *action_handlep, offset_t *voffp)
{
	int err = 0;
	struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);

	/* Resuming: account for the bytes already received last time. */
	if (dsl_dataset_is_zapified(drc->drc_ds)) {
		uint64_t bytes;
		(void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
		    drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
		    sizeof (bytes), 1, &bytes);
		drc->drc_bytes_read += bytes;
	}

	drc->drc_ignore_objlist = objlist_create();

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &drc->drc_os));
	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
	ASSERT0(drc->drc_os->os_encrypted &&
	    (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			err = SET_ERROR(EBADF);
			goto out;
		}
		err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (err != 0) {
			/* avoid releasing a fd we never held at "out:" */
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			rwa->guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(rwa->guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, rwa->guid_to_ds_map,
			    action_handlep);
			if (err != 0)
				goto out;
		} else {
			err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&rwa->guid_to_ds_map);
			if (err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = rwa->guid_to_ds_map;
	}

	/* handle DSL encryption key payload */
	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
		nvlist_t *keynvl = NULL;

		ASSERT(drc->drc_os->os_encrypted);
		ASSERT(drc->drc_raw);

		err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata",
		    &keynvl);
		if (err != 0)
			goto out;

		err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
		    drc->drc_ds->ds_object, drc->drc_drrb->drr_type,
		    keynvl, drc->drc_newfs);
		if (err != 0)
			goto out;

		/* keynvl is applied at dmu_recv_end for existing datasets */
		if (!drc->drc_newfs)
			drc->drc_keynvl = fnvlist_dup(keynvl);
	}

	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
		err = resume_check(drc, drc->drc_begin_nvl);
		if (err != 0)
			goto out;
	}

	(void) bqueue_init(&rwa->q, zfs_recv_queue_ff,
	    MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct receive_record_arg, node));
	cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
	rwa->os = drc->drc_os;
	rwa->os->os_raw_receive = drc->drc_raw;
	rwa->byteswap = drc->drc_byteswap;
	rwa->resumable = drc->drc_resumable;
	rwa->raw = drc->drc_raw;

	(void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
	    TS_RUN, minclsyspri);
	/*
	 * We're reading rwa->err without locks, which is safe since we are the
	 * only reader, and the worker thread is the only writer. It's ok if we
	 * miss a write for an iteration or two of the loop, since the writer
	 * thread will keep freeing records we send it until we send it an eos
	 * marker.
	 *
	 * We can leave this loop in 3 ways: First, if rwa->err is
	 * non-zero. In that case, the writer thread will free the rrd we just
	 * pushed. Second, if we're interrupted; in that case, either it's the
	 * first loop and drc->drc_rrd was never allocated, or it's later, and
	 * drc->drc_rrd has been handed off to the writer thread who will free
	 * it. Finally, if receive_read_record fails or we're at the end of the
	 * stream, then we free drc->drc_rrd and exit.
	 */
	while (rwa->err == 0) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
			break;
		}

		ASSERT3P(drc->drc_rrd, ==, NULL);
		drc->drc_rrd = drc->drc_next_rrd;
		drc->drc_next_rrd = NULL;
		/* Allocates and loads header into drc->drc_next_rrd */
		err = receive_read_record(drc);

		if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) {
			kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd));
			drc->drc_rrd = NULL;
			break;
		}

		bqueue_enqueue(&rwa->q, drc->drc_rrd,
		    sizeof (struct receive_record_arg) +
		    drc->drc_rrd->payload_size);
		drc->drc_rrd = NULL;
	}
	/* Reuse (or allocate) next_rrd as the end-of-stream marker. */
	if (drc->drc_next_rrd == NULL) {
		drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd),
		    KM_SLEEP);
	}
	drc->drc_next_rrd->eos_marker = B_TRUE;
	bqueue_enqueue_flush(&rwa->q, drc->drc_next_rrd, 1);

	mutex_enter(&rwa->mutex);
	while (!rwa->done) {
		/*
		 * We need to use cv_wait_sig() so that any process that may
		 * be sleeping here can still fork.
		 */
		(void) cv_wait_sig(&rwa->cv, &rwa->mutex);
	}
	mutex_exit(&rwa->mutex);

	/*
	 * If we are receiving a full stream as a clone, all object IDs which
	 * are greater than the maximum ID referenced in the stream are
	 * by definition unused and must be freed.
	 */
	if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
		uint64_t obj = rwa->max_object + 1;
		int free_err = 0;
		int next_err = 0;

		while (next_err == 0) {
			free_err = dmu_free_long_object(rwa->os, obj);
			if (free_err != 0 && free_err != ENOENT)
				break;

			next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
		}

		if (err == 0) {
			if (free_err != 0 && free_err != ENOENT)
				err = free_err;
			else if (next_err != ESRCH)
				err = next_err;
		}
	}

	cv_destroy(&rwa->cv);
	mutex_destroy(&rwa->mutex);
	bqueue_destroy(&rwa->q);
	if (err == 0)
		err = rwa->err;

out:
	kmem_free(rwa, sizeof (*rwa));
	nvlist_free(drc->drc_begin_nvl);
	if ((drc->drc_featureflags & DMU_BACKUP_FEATURE_DEDUP) &&
	    (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (err != 0) {
		/*
		 * Clean up references. If receive is not resumable,
		 * destroy what we created, so we don't leave it in
		 * the inconsistent state.
		 */
		dmu_recv_cleanup_ds(drc);
		nvlist_free(drc->drc_keynvl);
	}

	objlist_destroy(drc->drc_ignore_objlist);
	drc->drc_ignore_objlist = NULL;
	*voffp = drc->drc_voff;
	return (err);
}

/*
 * Sync-task check function for finishing a receive; validates the clone
 * swap / snapshot creation that dmu_recv_end_sync will perform.
 */
static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
		if (error != 0)
			return (error);
		if (drc->drc_force) {
			/*
			 * We will destroy any snapshots in tofs (i.e. before
			 * origin_head) that are after the origin (which is
			 * the snap before drc_ds, because drc_ds can not
			 * have any snaps of its own).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					break;
				if (snap->ds_dir != origin_head->ds_dir)
					error = SET_ERROR(EINVAL);
				if (error == 0) {
					error = dsl_destroy_snapshot_check_impl(
					    snap, B_FALSE);
				}
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
				if (error != 0)
					break;
			}
			if (error != 0) {
				dsl_dataset_rele(origin_head, FTAG);
				return (error);
			}
		}
		if (drc->drc_keynvl != NULL) {
			error = dsl_crypto_recv_raw_key_check(drc->drc_ds,
			    drc->drc_keynvl, tx);
			if (error != 0) {
				dsl_dataset_rele(origin_head, FTAG);
				return (error);
			}
		}

		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
	}
	return (error);
}

/*
 * Sync-task function for finishing a receive: swaps the received clone
 * into place (existing fs) or snapshots the new dataset, clears the
 * inconsistent flag and any resume state, and releases the dataset.
 */
static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);
	drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));

		if (drc->drc_force) {
			/*
			 * Destroy any snapshots of drc_tofs (origin_head)
			 * after the origin (the snap before drc_ds).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap));
				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_destroy_snapshot_sync_impl(snap,
				    B_FALSE, tx);
				dsl_dataset_rele(snap, FTAG);
			}
		}
		if (drc->drc_keynvl != NULL) {
			dsl_crypto_recv_raw_key_sync(drc->drc_ds,
			    drc->drc_keynvl, tx);
			nvlist_free(drc->drc_keynvl);
			drc->drc_keynvl = NULL;
		}

		VERIFY3P(drc->drc_ds->ds_prev, ==,
		    origin_head->ds_prev);

		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		dsl_dataset_phys(origin_head)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		drc->drc_newsnapobj =
		    dsl_dataset_phys(origin_head)->ds_prev_snap_obj;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);

		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(ds->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
		/* The receive completed; drop any resume state. */
		if (dsl_dataset_has_resume_receive_state(ds)) {
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_FROMGUID, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_OBJECT, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_OFFSET, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_BYTES, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_TOGUID, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_TONAME, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx);
		}
		drc->drc_newsnapobj =
		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
	}
	zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);

	/*
	 * Release the hold from dmu_recv_begin. This must be done before
	 * we return to open context, so that when we free the dataset's dnode
	 * we can evict its bonus buffer. Since the dataset may be destroyed
	 * at this point (and therefore won't have a valid pointer to the spa)
	 * we release the key mapping manually here while we do have a valid
	 * pointer, if it exists.
	 */
	if (!drc->drc_raw && encrypted) {
		(void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
		    drc->drc_ds->ds_object, drc->drc_ds);
	}
	dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
	drc->drc_ds = NULL;
}

/*
 * Add the snapshot identified by snapobj to the dedup guid map, taking a
 * long-term hold on it so later WRITE_BYREF records can reference it.
 */
static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj,
    boolean_t raw)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	objset_t *os;
	ds_hold_flags_t dsflags = (raw) ?
	    0 : DS_HOLD_FLAG_DECRYPT;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
	err = dsl_dataset_own_obj(dp, snapobj, dsflags, gmep, &snapds);

	if (err == 0) {
		/*
		 * If this is a deduplicated raw send stream, we need
		 * to make sure that we can still read raw blocks from
		 * earlier datasets in the stream, so we set the
		 * os_raw_receive flag now.
		 */
		if (raw) {
			err = dmu_objset_from_ds(snapds, &os);
			if (err != 0) {
				dsl_dataset_disown(snapds, dsflags, FTAG);
				dsl_pool_rele(dp, FTAG);
				kmem_free(gmep, sizeof (*gmep));
				return (err);
			}
			os->os_raw_receive = B_TRUE;
		}

		gmep->raw = raw;
		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
	} else {
		kmem_free(gmep, sizeof (*gmep));
	}

	dsl_pool_rele(dp, FTAG);
	return (err);
}

static int dmu_recv_end_modified_blocks = 3;

/*
 * Finish a receive into an existing filesystem (clone-swap path).
 */
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	char name[ZFS_MAX_DATASET_NAME_LEN];
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	return (dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}

/*
 * Finish a receive into a newly-created filesystem.
 */
static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	return (dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}

/*
 * Complete the receive started by dmu_recv_begin()/dmu_recv_stream().
 * On failure the partially-received dataset is cleaned up; on success a
 * dedup stream's new snapshot is added to the guid map.
 */
int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	int error;

	drc->drc_owner = owner;

	if (drc->drc_newfs)
		error = dmu_recv_new_end(drc);
	else
		error = dmu_recv_existing_end(drc);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
		nvlist_free(drc->drc_keynvl);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		(void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj, drc->drc_raw);
	}
	return (error);
}

/*
 * Return TRUE if this objset is currently being received into.
 */
boolean_t
dmu_objset_is_receiving(objset_t *os)
{
	return (os->os_dsl_dataset != NULL &&
	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}

#if defined(_KERNEL)
module_param(zfs_recv_queue_length, int, 0644);
MODULE_PARM_DESC(zfs_recv_queue_length, "Maximum receive queue length");

module_param(zfs_recv_queue_ff, int, 0644);
MODULE_PARM_DESC(zfs_recv_queue_ff, "Receive queue fill fraction");
#endif
diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c
new file mode 100644
index 000000000000..583c33dc8276
--- /dev/null
+++ b/module/zfs/dmu_redact.c
@@ -0,0 +1,1108 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2018 by Delphix. All rights reserved.
 */

/*
 * NOTE(review): the #include targets below were stripped in transit (the
 * angle-bracketed header names are missing) — restore them from the
 * original patch before building.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef _KERNEL
#include
#endif

/*
 * This controls the number of entries in the buffer the redaction_list_update
 * synctask uses to buffer writes to the redaction list.
 */
int redact_sync_bufsize = 1024;

/*
 * Controls how often to update the redaction list when creating a redaction
 * list.
 */
uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */

/*
 * This tunable controls the length of the queues that zfs redact worker threads
 * use to communicate. If the dmu_redact_snap thread is blocking on these
 * queues, this variable may need to be increased. If there is a significant
 * slowdown at the start of a redact operation as these threads consume all the
 * available IO resources, or the queues are consuming too much memory, this
 * variable may need to be decreased.
 */
int zfs_redact_queue_length = 1024 * 1024;

/*
 * These tunables control the fill fraction of the queues by zfs redact. The
 * fill fraction controls the frequency with which threads have to be
 * cv_signaled. If a lot of cpu time is being spent on cv_signal, then these
 * should be tuned down. If the queues empty before the signalled thread can
 * catch up, then these should be tuned up.
 */
uint64_t zfs_redact_queue_ff = 20;

/*
 * A contiguous range of blocks (possibly spanning objects) that has been
 * modified by a redaction snapshot and therefore must be redacted.
 */
struct redact_record {
	bqueue_node_t ln;
	boolean_t eos_marker; /* Marks the end of the stream */
	uint64_t start_object;
	uint64_t start_blkid;
	uint64_t end_object;
	uint64_t end_blkid;
	uint8_t indblkshift;
	uint32_t datablksz;
};

/*
 * Per-traversal-thread state: one of these exists for each redaction
 * snapshot being traversed.
 */
struct redact_thread_arg {
	bqueue_t q;
	dsl_dataset_t *ds;		/* Dataset to traverse */
	struct redact_record *current_record;
	int error_code;
	boolean_t cancel;
	zbookmark_phys_t resume;
	objlist_t *deleted_objs;
	uint64_t *num_blocks_visited;
	uint64_t ignore_object;	/* ignore further callbacks on this */
	uint64_t txg; /* txg to traverse since */
};

/*
 * The redaction node is a wrapper around the redaction record that is used
 * by the redaction merging thread to sort the records and determine overlaps.
 *
 * It contains two nodes; one sorts the records by their start_zb, and the other
 * sorts the records by their end_zb.
 */
struct redact_node {
	avl_node_t avl_node_start;
	avl_node_t avl_node_end;
	struct redact_record *record;
	struct redact_thread_arg *rt_arg;
	uint32_t thread_num;
};

/*
 * State for the thread that merges the per-snapshot record streams and
 * writes the resulting redaction list, one synctask per txg slot.
 */
struct merge_data {
	list_t md_redact_block_pending;
	redact_block_phys_t md_coalesce_block;
	uint64_t md_last_time;
	redact_block_phys_t md_furthest[TXG_SIZE];
	/* Lists of struct redact_block_list_node. */
	list_t md_blocks[TXG_SIZE];
	boolean_t md_synctask_txg[TXG_SIZE];
	uint64_t md_latest_synctask_txg;
	redaction_list_t *md_redaction_list;
};

/*
 * A wrapper around struct redact_block so it can be stored in a list_t.
 */
struct redact_block_list_node {
	redact_block_phys_t block;
	list_node_t node;
};

/*
 * We've found a new redaction candidate. In order to improve performance, we
 * coalesce these blocks when they're adjacent to each other. This function
 * handles that. If the new candidate block range is immediately after the
 * range we're building, coalesce it into the range we're building.
Otherwise, + * put the record we're building on the queue, and update the build pointer to + * point to the new record. + */ +static void +record_merge_enqueue(bqueue_t *q, struct redact_record **build, + struct redact_record *new) +{ + if (new->eos_marker) { + if (*build != NULL) + bqueue_enqueue(q, *build, sizeof (*build)); + bqueue_enqueue_flush(q, new, sizeof (*new)); + return; + } + if (*build == NULL) { + *build = new; + return; + } + struct redact_record *curbuild = *build; + if ((curbuild->end_object == new->start_object && + curbuild->end_blkid + 1 == new->start_blkid) || + (curbuild->end_object + 1 == new->start_object && + curbuild->end_blkid == UINT64_MAX && new->start_blkid == 0)) { + curbuild->end_object = new->end_object; + curbuild->end_blkid = new->end_blkid; + kmem_free(new, sizeof (*new)); + } else { + bqueue_enqueue(q, curbuild, sizeof (*curbuild)); + *build = new; + } +} + +/* + * This is the callback function to traverse_dataset for the redaction threads + * for dmu_redact_snap. This thread is responsible for creating redaction + * records for all the data that is modified by the snapshots we're redacting + * with respect to. Redaction records represent ranges of data that have been + * modified by one of the redaction snapshots, and are stored in the + * redact_record struct. We need to create redaction records for three + * cases: + * + * First, if there's a normal write, we need to create a redaction record for + * that block. + * + * Second, if there's a hole, we need to create a redaction record that covers + * the whole range of the hole. If the hole is in the meta-dnode, it must cover + * every block in all of the objects in the hole. + * + * Third, if there is a deleted object, we need to create a redaction record for + * all of the blocks in that object. 
+ */ +/*ARGSUSED*/ +static int +redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) +{ + struct redact_thread_arg *rta = arg; + struct redact_record *record; + + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= rta->resume.zb_object); + + if (rta->cancel) + return (SET_ERROR(EINTR)); + + if (rta->ignore_object == zb->zb_object) + return (0); + + /* + * If we're visiting a dnode, we need to handle the case where the + * object has been deleted. + */ + if (zb->zb_level == ZB_DNODE_LEVEL) { + ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); + + if (zb->zb_object == 0) + return (0); + + /* + * If the object has been deleted, redact all of the blocks in + * it. + */ + if (dnp->dn_type == DMU_OT_NONE || + objlist_exists(rta->deleted_objs, zb->zb_object)) { + rta->ignore_object = zb->zb_object; + record = kmem_zalloc(sizeof (struct redact_record), + KM_SLEEP); + + record->eos_marker = B_FALSE; + record->start_object = record->end_object = + zb->zb_object; + record->start_blkid = 0; + record->end_blkid = UINT64_MAX; + record_merge_enqueue(&rta->q, + &rta->current_record, record); + } + return (0); + } else if (zb->zb_level < 0) { + return (0); + } else if (zb->zb_level > 0 && !BP_IS_HOLE(bp)) { + /* + * If this is an indirect block, but not a hole, it doesn't + * provide any useful information for redaction, so ignore it. + */ + return (0); + } + + /* + * At this point, there are two options left for the type of block we're + * looking at. Either this is a hole (which could be in the dnode or + * the meta-dnode), or it's a level 0 block of some sort. If it's a + * hole, we create a redaction record that covers the whole range. If + * the hole is in a dnode, we need to redact all the blocks in that + * hole. If the hole is in the meta-dnode, we instead need to redact + * all blocks in every object covered by that hole. 
If it's a level 0 + * block, we only need to redact that single block. + */ + record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); + record->eos_marker = B_FALSE; + + record->start_object = record->end_object = zb->zb_object; + if (BP_IS_HOLE(bp)) { + record->start_blkid = zb->zb_blkid * + bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level); + + record->end_blkid = ((zb->zb_blkid + 1) * + bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level)) - 1; + + if (zb->zb_object == DMU_META_DNODE_OBJECT) { + record->start_object = record->start_blkid * + ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / + sizeof (dnode_phys_t)); + record->start_blkid = 0; + record->end_object = ((record->end_blkid + + 1) * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / + sizeof (dnode_phys_t))) - 1; + record->end_blkid = UINT64_MAX; + } + } else if (zb->zb_level != 0 || + zb->zb_object == DMU_META_DNODE_OBJECT) { + kmem_free(record, sizeof (*record)); + return (0); + } else { + record->start_blkid = record->end_blkid = zb->zb_blkid; + } + record->indblkshift = dnp->dn_indblkshift; + record->datablksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + record_merge_enqueue(&rta->q, &rta->current_record, record); + + return (0); +} + +static void +redact_traverse_thread(void *arg) +{ + struct redact_thread_arg *rt_arg = arg; + int err; + struct redact_record *data; + objset_t *os; + VERIFY0(dmu_objset_from_ds(rt_arg->ds, &os)); +#ifdef _KERNEL + if (os->os_phys->os_type == DMU_OST_ZFS) + rt_arg->deleted_objs = zfs_get_deleteq(os); + else + rt_arg->deleted_objs = objlist_create(); +#else + rt_arg->deleted_objs = objlist_create(); +#endif + + err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg, + &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, + redact_cb, rt_arg); + + if (err != EINTR) + rt_arg->error_code = err; + objlist_destroy(rt_arg->deleted_objs); + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + data->eos_marker = B_TRUE; + record_merge_enqueue(&rt_arg->q, 
&rt_arg->current_record, data); + thread_exit(); +} + +static inline void +create_zbookmark_from_obj_off(zbookmark_phys_t *zb, uint64_t object, + uint64_t blkid) +{ + zb->zb_object = object; + zb->zb_level = 0; + zb->zb_blkid = blkid; +} + +/* + * This is a utility function that can do the comparison for the start or ends + * of the ranges in a redact_record. + */ +static int +redact_range_compare(uint64_t obj1, uint64_t off1, uint32_t dbss1, + uint64_t obj2, uint64_t off2, uint32_t dbss2) +{ + zbookmark_phys_t z1, z2; + create_zbookmark_from_obj_off(&z1, obj1, off1); + create_zbookmark_from_obj_off(&z2, obj2, off2); + + return (zbookmark_compare(dbss1 >> SPA_MINBLOCKSHIFT, 0, + dbss2 >> SPA_MINBLOCKSHIFT, 0, &z1, &z2)); +} + +/* + * Compare two redaction records by their range's start location. Also makes + * eos records always compare last. We use the thread number in the redact_node + * to ensure that records do not compare equal (which is not allowed in our avl + * trees). + */ +static int +redact_node_compare_start(const void *arg1, const void *arg2) +{ + const struct redact_node *rn1 = arg1; + const struct redact_node *rn2 = arg2; + const struct redact_record *rr1 = rn1->record; + const struct redact_record *rr2 = rn2->record; + if (rr1->eos_marker) + return (1); + if (rr2->eos_marker) + return (-1); + + int cmp = redact_range_compare(rr1->start_object, rr1->start_blkid, + rr1->datablksz, rr2->start_object, rr2->start_blkid, + rr2->datablksz); + if (cmp == 0) + cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); + return (cmp); +} + +/* + * Compare two redaction records by their range's end location. Also makes + * eos records always compare last. We use the thread number in the redact_node + * to ensure that records do not compare equal (which is not allowed in our avl + * trees). 
+ */ +static int +redact_node_compare_end(const void *arg1, const void *arg2) +{ + const struct redact_node *rn1 = arg1; + const struct redact_node *rn2 = arg2; + const struct redact_record *srr1 = rn1->record; + const struct redact_record *srr2 = rn2->record; + if (srr1->eos_marker) + return (1); + if (srr2->eos_marker) + return (-1); + + int cmp = redact_range_compare(srr1->end_object, srr1->end_blkid, + srr1->datablksz, srr2->end_object, srr2->end_blkid, + srr2->datablksz); + if (cmp == 0) + cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); + return (cmp); +} + +/* + * Utility function that compares two redaction records to determine if any part + * of the "from" record is before any part of the "to" record. Also causes End + * of Stream redaction records to compare after all others, so that the + * redaction merging logic can stay simple. + */ +static boolean_t +redact_record_before(const struct redact_record *from, + const struct redact_record *to) +{ + if (from->eos_marker == B_TRUE) + return (B_FALSE); + else if (to->eos_marker == B_TRUE) + return (B_TRUE); + return (redact_range_compare(from->start_object, from->start_blkid, + from->datablksz, to->end_object, to->end_blkid, + to->datablksz) <= 0); +} + +/* + * Pop a new redaction record off the queue, check that the records are in the + * right order, and free the old data. + */ +static struct redact_record * +get_next_redact_record(bqueue_t *bq, struct redact_record *prev) +{ + struct redact_record *next = bqueue_dequeue(bq); + ASSERT(redact_record_before(prev, next)); + kmem_free(prev, sizeof (*prev)); + return (next); +} + +/* + * Remove the given redaction node from both trees, pull a new redaction record + * off the queue, free the old redaction record, update the redaction node, and + * reinsert the node into the trees. 
+ */ +static int +update_avl_trees(avl_tree_t *start_tree, avl_tree_t *end_tree, + struct redact_node *redact_node) +{ + avl_remove(start_tree, redact_node); + avl_remove(end_tree, redact_node); + redact_node->record = get_next_redact_record(&redact_node->rt_arg->q, + redact_node->record); + avl_add(end_tree, redact_node); + avl_add(start_tree, redact_node); + return (redact_node->rt_arg->error_code); +} + +/* + * Synctask for updating redaction lists. We first take this txg's list of + * redacted blocks and append those to the redaction list. We then update the + * redaction list's bonus buffer. We store the furthest blocks we visited and + * the list of snapshots that we're redacting with respect to. We need these so + * that redacted sends and receives can be correctly resumed. + */ +static void +redaction_list_update_sync(void *arg, dmu_tx_t *tx) +{ + struct merge_data *md = arg; + uint64_t txg = dmu_tx_get_txg(tx); + list_t *list = &md->md_blocks[txg & TXG_MASK]; + redact_block_phys_t *furthest_visited = + &md->md_furthest[txg & TXG_MASK]; + objset_t *mos = tx->tx_pool->dp_meta_objset; + redaction_list_t *rl = md->md_redaction_list; + int bufsize = redact_sync_bufsize; + redact_block_phys_t *buf = kmem_alloc(bufsize * sizeof (*buf), + KM_SLEEP); + int index = 0; + + dmu_buf_will_dirty(rl->rl_dbuf, tx); + + for (struct redact_block_list_node *rbln = list_remove_head(list); + rbln != NULL; rbln = list_remove_head(list)) { + ASSERT3U(rbln->block.rbp_object, <=, + furthest_visited->rbp_object); + ASSERT(rbln->block.rbp_object < furthest_visited->rbp_object || + rbln->block.rbp_blkid <= furthest_visited->rbp_blkid); + buf[index] = rbln->block; + index++; + if (index == bufsize) { + dmu_write(mos, rl->rl_object, + rl->rl_phys->rlp_num_entries * sizeof (*buf), + bufsize * sizeof (*buf), buf, tx); + rl->rl_phys->rlp_num_entries += bufsize; + index = 0; + } + kmem_free(rbln, sizeof (*rbln)); + } + if (index > 0) { + dmu_write(mos, rl->rl_object, 
rl->rl_phys->rlp_num_entries * + sizeof (*buf), index * sizeof (*buf), buf, tx); + rl->rl_phys->rlp_num_entries += index; + } + kmem_free(buf, bufsize * sizeof (*buf)); + + md->md_synctask_txg[txg & TXG_MASK] = B_FALSE; + rl->rl_phys->rlp_last_object = furthest_visited->rbp_object; + rl->rl_phys->rlp_last_blkid = furthest_visited->rbp_blkid; +} + +void +commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object, + uint64_t blkid) +{ + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir); + dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node)); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + if (!md->md_synctask_txg[txg & TXG_MASK]) { + dsl_sync_task_nowait(dmu_tx_pool(tx), + redaction_list_update_sync, md, 5, ZFS_SPACE_CHECK_NONE, + tx); + md->md_synctask_txg[txg & TXG_MASK] = B_TRUE; + md->md_latest_synctask_txg = txg; + } + md->md_furthest[txg & TXG_MASK].rbp_object = object; + md->md_furthest[txg & TXG_MASK].rbp_blkid = blkid; + list_move_tail(&md->md_blocks[txg & TXG_MASK], + &md->md_redact_block_pending); + dmu_tx_commit(tx); + md->md_last_time = gethrtime(); +} + +/* + * We want to store the list of blocks that we're redacting in the bookmark's + * redaction list. However, this list is stored in the MOS, which means it can + * only be written to in syncing context. To get around this, we create a + * synctask that will write to the mos for us. We tell it what to write by + * a linked list for each current transaction group; every time we decide to + * redact a block, we append it to the transaction group that is currently in + * open context. We also update some progress information that the synctask + * will store to enable resumable redacted sends. 
+ */ +static void +update_redaction_list(struct merge_data *md, objset_t *os, + uint64_t object, uint64_t blkid, uint64_t endblkid, uint32_t blksz) +{ + boolean_t enqueue = B_FALSE; + redact_block_phys_t cur = {0}; + uint64_t count = endblkid - blkid + 1; + while (count > REDACT_BLOCK_MAX_COUNT) { + update_redaction_list(md, os, object, blkid, + blkid + REDACT_BLOCK_MAX_COUNT - 1, blksz); + blkid += REDACT_BLOCK_MAX_COUNT; + count -= REDACT_BLOCK_MAX_COUNT; + } + redact_block_phys_t *coalesce = &md->md_coalesce_block; + boolean_t new; + if (coalesce->rbp_size_count == 0) { + new = B_TRUE; + enqueue = B_FALSE; + } else { + uint64_t old_count = redact_block_get_count(coalesce); + if (coalesce->rbp_object == object && + coalesce->rbp_blkid + old_count == blkid && + old_count + count <= REDACT_BLOCK_MAX_COUNT) { + ASSERT3U(redact_block_get_size(coalesce), ==, blksz); + redact_block_set_count(coalesce, old_count + count); + new = B_FALSE; + enqueue = B_FALSE; + } else { + new = B_TRUE; + enqueue = B_TRUE; + } + } + + if (new) { + cur = *coalesce; + coalesce->rbp_blkid = blkid; + coalesce->rbp_object = object; + + redact_block_set_count(coalesce, count); + redact_block_set_size(coalesce, blksz); + } + + if (enqueue && redact_block_get_size(&cur) != 0) { + struct redact_block_list_node *rbln = + kmem_alloc(sizeof (struct redact_block_list_node), + KM_SLEEP); + rbln->block = cur; + list_insert_tail(&md->md_redact_block_pending, rbln); + } + + if (gethrtime() > md->md_last_time + + redaction_list_update_interval_ns) { + commit_rl_updates(os, md, object, blkid); + } +} + +/* + * This thread merges all the redaction records provided by the worker threads, + * and determines which blocks are redacted by all the snapshots. The algorithm + * for doing so is similar to performing a merge in mergesort with n sub-lists + * instead of 2, with some added complexity due to the fact that the entries are + * ranges, not just single blocks. 
This algorithm relies on the fact that the + * queues are sorted, which is ensured by the fact that traverse_dataset + * traverses the dataset in a consistent order. We pull one entry off the front + * of the queues of each secure dataset traversal thread. Then we repeat the + * following: each record represents a range of blocks modified by one of the + * redaction snapshots, and each block in that range may need to be redacted in + * the send stream. Find the record with the latest start of its range, and the + * record with the earliest end of its range. If the last start is before the + * first end, then we know that the blocks in the range [last_start, first_end] + * are covered by all of the ranges at the front of the queues, which means + * every thread redacts that whole range. For example, let's say the ranges on + * each queue look like this: + * + * Block Id 1 2 3 4 5 6 7 8 9 10 11 + * Thread 1 | [====================] + * Thread 2 | [========] + * Thread 3 | [=================] + * + * Thread 3 has the last start (5), and the thread 2 has the last end (6). All + * three threads modified the range [5,6], so that data should not be sent over + * the wire. After we've determined whether or not to redact anything, we take + * the record with the first end. We discard that record, and pull a new one + * off the front of the queue it came from. In the above example, we would + * discard Thread 2's record, and pull a new one. Let's say the next record we + * pulled from Thread 2 covered range [10,11]. The new layout would look like + * this: + * + * Block Id 1 2 3 4 5 6 7 8 9 10 11 + * Thread 1 | [====================] + * Thread 2 | [==] + * Thread 3 | [=================] + * + * When we compare the last start (10, from Thread 2) and the first end (9, from + * Thread 1), we see that the last start is greater than the first end. + * Therefore, we do not redact anything from these records. We'll iterate by + * replacing the record from Thread 1. 
+ * + * We iterate by replacing the record with the lowest end because we know + * that the record with the lowest end has helped us as much as it can. All the + * ranges before it that we will ever redact have been redacted. In addition, + * by replacing the one with the lowest end, we guarantee we catch all ranges + * that need to be redacted. For example, if in the case above we had replaced + * the record from Thread 1 instead, we might have ended up with the following: + * + * Block Id 1 2 3 4 5 6 7 8 9 10 11 12 + * Thread 1 | [==] + * Thread 2 | [========] + * Thread 3 | [=================] + * + * If the next record from Thread 2 had been [8,10], for example, we should have + * redacted part of that range, but because we updated Thread 1's record, we + * missed it. + * + * We implement this algorithm by using two trees. The first sorts the + * redaction records by their start_zb, and the second sorts them by their + * end_zb. We use these to find the record with the last start and the record + * with the first end. We create a record with that start and end, and send it + * on. The overall runtime of this implementation is O(n log m), where n is the + * total number of redaction records from all the different redaction snapshots, + * and m is the number of redaction snapshots. + * + * If we redact with respect to zero snapshots, we create a redaction + * record with the start object and blkid to 0, and the end object and blkid to + * UINT64_MAX. This will result in us redacting every block. 
+ */
+static int
+perform_thread_merge(bqueue_t *q, uint32_t num_threads,
+    struct redact_thread_arg *thread_args, boolean_t *cancel)
+{
+	struct redact_node *redact_nodes = NULL;
+	avl_tree_t start_tree, end_tree;
+	struct redact_record *record;
+	struct redact_record *current_record = NULL;
+	int err = 0;
+
+	/*
+	 * If we're redacting with respect to zero snapshots, then no data is
+	 * permitted to be sent. We enqueue a record that redacts all blocks,
+	 * and an eos marker.
+	 */
+	if (num_threads == 0) {
+		record = kmem_zalloc(sizeof (struct redact_record),
+		    KM_SLEEP);
+		/* We can't redact object 0, so don't try. */
+		record->start_object = 1;
+		record->start_blkid = 0;
+		record->end_object = record->end_blkid = UINT64_MAX;
+		bqueue_enqueue(q, record, sizeof (*record));
+		return (0);
+	}
+
+	/* num_threads > 0 is guaranteed by the early return above. */
+	redact_nodes = kmem_zalloc(num_threads *
+	    sizeof (*redact_nodes), KM_SLEEP);
+
+	avl_create(&start_tree, redact_node_compare_start,
+	    sizeof (struct redact_node),
+	    offsetof(struct redact_node, avl_node_start));
+	avl_create(&end_tree, redact_node_compare_end,
+	    sizeof (struct redact_node),
+	    offsetof(struct redact_node, avl_node_end));
+
+	/* Seed both trees with the first record from each worker's queue. */
+	for (int i = 0; i < num_threads; i++) {
+		struct redact_node *node = &redact_nodes[i];
+		struct redact_thread_arg *targ = &thread_args[i];
+		node->record = bqueue_dequeue(&targ->q);
+		node->rt_arg = targ;
+		node->thread_num = i;
+		avl_add(&start_tree, node);
+		avl_add(&end_tree, node);
+	}
+
+	/*
+	 * Once the first record in the end tree has returned EOS, every record
+	 * must be an EOS record, so we should stop.
+	 */
+	while (err == 0 && !((struct redact_node *)avl_first(&end_tree))->
+	    record->eos_marker) {
+		if (*cancel) {
+			err = EINTR;
+			break;
+		}
+		struct redact_node *last_start = avl_last(&start_tree);
+		struct redact_node *first_end = avl_first(&end_tree);
+
+		/*
+		 * If the last start record is before the first end record,
+		 * then we have blocks that are redacted by all threads.
+		 * Therefore, we should redact them. Copy the record, and send
+		 * it to the main thread.
+		 */
+		if (redact_record_before(last_start->record,
+		    first_end->record)) {
+			record = kmem_zalloc(sizeof (struct redact_record),
+			    KM_SLEEP);
+			*record = *first_end->record;
+			record->start_object = last_start->record->start_object;
+			record->start_blkid = last_start->record->start_blkid;
+			record_merge_enqueue(q, &current_record,
+			    record);
+		}
+		err = update_avl_trees(&start_tree, &end_tree, first_end);
+	}
+
+	/*
+	 * We're done; if we were cancelled, we need to cancel our workers and
+	 * clear out their queues. Either way, we need to remove every thread's
+	 * redact_node struct from the avl trees.
+	 */
+	for (int i = 0; i < num_threads; i++) {
+		if (err != 0) {
+			thread_args[i].cancel = B_TRUE;
+			while (!redact_nodes[i].record->eos_marker) {
+				(void) update_avl_trees(&start_tree, &end_tree,
+				    &redact_nodes[i]);
+			}
+		}
+		avl_remove(&start_tree, &redact_nodes[i]);
+		avl_remove(&end_tree, &redact_nodes[i]);
+		kmem_free(redact_nodes[i].record,
+		    sizeof (struct redact_record));
+	}
+
+	avl_destroy(&start_tree);
+	avl_destroy(&end_tree);
+	kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes));
+	/*
+	 * Flush any partially-coalesced record; account the payload size
+	 * (sizeof (*current_record)), not the pointer size, so the queue's
+	 * fill tracking matches record_merge_enqueue() above.
+	 */
+	if (current_record != NULL)
+		bqueue_enqueue(q, current_record, sizeof (*current_record));
+	return (err);
+}
+
+struct redact_merge_thread_arg {
+	bqueue_t q;
+	spa_t *spa;
+	int numsnaps;
+	struct redact_thread_arg *thr_args;
+	boolean_t cancel;
+	int error_code;
+};
+
+static void
+redact_merge_thread(void *arg)
+{
+	struct redact_merge_thread_arg *rmta = arg;
+	rmta->error_code = perform_thread_merge(&rmta->q,
+	    rmta->numsnaps, rmta->thr_args, &rmta->cancel);
+	struct redact_record *rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
+	rec->eos_marker = B_TRUE;
+	bqueue_enqueue_flush(&rmta->q, rec, 1);
+	thread_exit();
+}
+
+/*
+ * Find the next object in or after the redaction range passed in, and hold
+ * its dnode with the provided tag. Also update *object to contain the new
+ * object number.
+ */ +static int +hold_next_object(objset_t *os, struct redact_record *rec, void *tag, + uint64_t *object, dnode_t **dn) +{ + int err = 0; + if (*dn != NULL) + dnode_rele(*dn, FTAG); + *dn = NULL; + if (*object < rec->start_object) { + *object = rec->start_object - 1; + } + err = dmu_object_next(os, object, B_FALSE, 0); + if (err != 0) + return (err); + + err = dnode_hold(os, *object, tag, dn); + while (err == 0 && (*object < rec->start_object || + DMU_OT_IS_METADATA((*dn)->dn_type))) { + dnode_rele(*dn, tag); + *dn = NULL; + err = dmu_object_next(os, object, B_FALSE, 0); + if (err != 0) + break; + err = dnode_hold(os, *object, tag, dn); + } + return (err); +} + +static int +perform_redaction(objset_t *os, redaction_list_t *rl, + struct redact_merge_thread_arg *rmta) +{ + int err = 0; + bqueue_t *q = &rmta->q; + struct redact_record *rec = NULL; + struct merge_data md = { {0} }; + + list_create(&md.md_redact_block_pending, + sizeof (struct redact_block_list_node), + offsetof(struct redact_block_list_node, node)); + md.md_redaction_list = rl; + + for (int i = 0; i < TXG_SIZE; i++) { + list_create(&md.md_blocks[i], + sizeof (struct redact_block_list_node), + offsetof(struct redact_block_list_node, node)); + } + dnode_t *dn = NULL; + uint64_t prev_obj = 0; + for (rec = bqueue_dequeue(q); !rec->eos_marker && err == 0; + rec = get_next_redact_record(q, rec)) { + ASSERT3U(rec->start_object, !=, 0); + uint64_t object; + if (prev_obj != rec->start_object) { + object = rec->start_object - 1; + err = hold_next_object(os, rec, FTAG, &object, &dn); + } else { + object = prev_obj; + } + while (err == 0 && object <= rec->end_object) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = EINTR; + break; + } + /* + * Part of the current object is contained somewhere in + * the range covered by rec. 
+ */ + uint64_t startblkid; + uint64_t endblkid; + uint64_t maxblkid = dn->dn_phys->dn_maxblkid; + + if (rec->start_object < object) + startblkid = 0; + else if (rec->start_blkid > maxblkid) + break; + else + startblkid = rec->start_blkid; + + if (rec->end_object > object || rec->end_blkid > + maxblkid) { + endblkid = maxblkid; + } else { + endblkid = rec->end_blkid; + } + update_redaction_list(&md, os, object, startblkid, + endblkid, dn->dn_datablksz); + + if (object == rec->end_object) + break; + err = hold_next_object(os, rec, FTAG, &object, &dn); + } + if (err == ESRCH) + err = 0; + if (dn != NULL) + prev_obj = object; + } + if (err == 0 && dn != NULL) + dnode_rele(dn, FTAG); + + if (err == ESRCH) + err = 0; + rmta->cancel = B_TRUE; + while (!rec->eos_marker) + rec = get_next_redact_record(q, rec); + kmem_free(rec, sizeof (*rec)); + + /* + * There may be a block that's being coalesced, sync that out before we + * return. + */ + if (err == 0 && md.md_coalesce_block.rbp_size_count != 0) { + struct redact_block_list_node *rbln = + kmem_alloc(sizeof (struct redact_block_list_node), + KM_SLEEP); + rbln->block = md.md_coalesce_block; + list_insert_tail(&md.md_redact_block_pending, rbln); + } + commit_rl_updates(os, &md, UINT64_MAX, UINT64_MAX); + + /* + * Wait for all the redaction info to sync out before we return, so that + * anyone who attempts to resume this redaction will have all the data + * they need. 
+ */ + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + if (md.md_latest_synctask_txg != 0) + txg_wait_synced(dp, md.md_latest_synctask_txg); + for (int i = 0; i < TXG_SIZE; i++) + list_destroy(&md.md_blocks[i]); + return (err); +} + +static boolean_t +redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) +{ + for (int i = 0; i < num_snaps; i++) { + if (snaps[i] == guid) + return (B_TRUE); + } + return (B_FALSE); +} + +int +dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, + const char *redactbook) +{ + int err = 0; + dsl_pool_t *dp = NULL; + dsl_dataset_t *ds = NULL; + objset_t *os; + int numsnaps = 0; + dsl_dataset_t **redactsnaparr = NULL; + struct redact_thread_arg *args = NULL; + redaction_list_t *new_rl = NULL; + + if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0) + return (err); + + if ((err = dsl_dataset_hold(dp, snapname, FTAG, &ds)) != 0) + goto out; + dsl_dataset_long_hold(ds, FTAG); + if (!ds->ds_is_snapshot || dmu_objset_from_ds(ds, &os) != 0) { + err = EINVAL; + goto out; + } + if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)) { + err = EALREADY; + goto out; + } + nvpair_t *pair; + + if (fnvlist_num_pairs(redactnvl) > 0 && err == 0) { + redactsnaparr = kmem_zalloc(fnvlist_num_pairs(redactnvl) * + sizeof (dsl_dataset_t *), KM_SLEEP); + } + for (pair = nvlist_next_nvpair(redactnvl, NULL); err == 0 && + pair != NULL; pair = nvlist_next_nvpair(redactnvl, pair)) { + const char *name = nvpair_name(pair); + err = dsl_dataset_hold(dp, name, FTAG, + redactsnaparr + numsnaps); + if (err != 0) + break; + dsl_dataset_long_hold(redactsnaparr[numsnaps], FTAG); + if (!dsl_dataset_is_before(redactsnaparr[numsnaps], ds, 0)) { + err = EINVAL; + numsnaps++; + break; + } + if (dsl_dataset_feature_is_active(redactsnaparr[numsnaps], + SPA_FEATURE_REDACTED_DATASETS)) { + err = EALREADY; + numsnaps++; + break; + + } + numsnaps++; + } + if (err != 0) + goto out; + + ASSERT3U(fnvlist_num_pairs(redactnvl), ==, numsnaps); + + 
boolean_t resuming = B_FALSE; + char newredactbook[ZFS_MAX_DATASET_NAME_LEN]; + zfs_bookmark_phys_t bookmark; + + (void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN); + char *c = strchr(newredactbook, '@'); + ASSERT3P(c, !=, NULL); + int n = snprintf(c, ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook), + "#%s", redactbook); + if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) { + dsl_pool_rele(dp, FTAG); + return (SET_ERROR(ENAMETOOLONG)); + } + err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); + if (err == 0) { + resuming = B_TRUE; + if (bookmark.zbm_redaction_obj == 0) { + err = EEXIST; + goto out; + } + err = dsl_redaction_list_hold_obj(dp, + bookmark.zbm_redaction_obj, FTAG, &new_rl); + if (err != 0) { + err = EIO; + goto out; + } + dsl_redaction_list_long_hold(dp, new_rl, FTAG); + if (new_rl->rl_phys->rlp_num_snaps != numsnaps) { + err = ESRCH; + goto out; + } + for (int i = 0; i < numsnaps; i++) { + if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps, + new_rl->rl_phys->rlp_num_snaps, + dsl_dataset_phys(redactsnaparr[i])->ds_guid)) { + err = ESRCH; + goto out; + } + } + if (numsnaps > 0) + args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); + if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX && + new_rl->rl_phys->rlp_last_object == UINT64_MAX) { + err = EEXIST; + goto out; + } + dsl_pool_rele(dp, FTAG); + dp = NULL; + } else { + uint64_t *guids = NULL; + if (numsnaps > 0) { + guids = kmem_zalloc(numsnaps * sizeof (uint64_t), + KM_SLEEP); + args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); + } + for (int i = 0; i < numsnaps; i++) + guids[i] = dsl_dataset_phys(redactsnaparr[i])->ds_guid; + + dsl_pool_rele(dp, FTAG); + dp = NULL; + err = dsl_bookmark_create_redacted(newredactbook, snapname, + numsnaps, guids, FTAG, &new_rl); + kmem_free(guids, numsnaps * sizeof (uint64_t)); + if (err != 0) { + goto out; + } + } + + for (int i = 0; i < numsnaps; i++) { + args[i].ds = redactsnaparr[i]; + (void) bqueue_init(&args[i].q, 
zfs_redact_queue_ff, + zfs_redact_queue_length, + offsetof(struct redact_record, ln)); + if (resuming) { + args[i].resume.zb_blkid = + new_rl->rl_phys->rlp_last_blkid; + args[i].resume.zb_object = + new_rl->rl_phys->rlp_last_object; + } + args[i].txg = dsl_dataset_phys(ds)->ds_creation_txg; + (void) thread_create(NULL, 0, redact_traverse_thread, &args[i], + 0, curproc, TS_RUN, minclsyspri); + } + struct redact_merge_thread_arg rmta = { { {0} } }; + (void) bqueue_init(&rmta.q, zfs_redact_queue_ff, + zfs_redact_queue_length, offsetof(struct redact_record, ln)); + rmta.numsnaps = numsnaps; + rmta.spa = os->os_spa; + rmta.thr_args = args; + (void) thread_create(NULL, 0, redact_merge_thread, &rmta, 0, curproc, + TS_RUN, minclsyspri); + err = perform_redaction(os, new_rl, &rmta); +out: + if (args != NULL) { + kmem_free(args, numsnaps * sizeof (*args)); + } + if (new_rl != NULL) { + dsl_redaction_list_long_rele(new_rl, FTAG); + dsl_redaction_list_rele(new_rl, FTAG); + } + for (int i = 0; i < numsnaps; i++) { + dsl_dataset_long_rele(redactsnaparr[i], FTAG); + dsl_dataset_rele(redactsnaparr[i], FTAG); + } + + if (redactsnaparr != NULL) { + kmem_free(redactsnaparr, fnvlist_num_pairs(redactnvl) * + sizeof (dsl_dataset_t *)); + } + if (dp != NULL) + dsl_pool_rele(dp, FTAG); + if (ds != NULL) { + dsl_dataset_long_rele(ds, FTAG); + dsl_dataset_rele(ds, FTAG); + } + return (SET_ERROR(err)); + +} diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 1f4d3a1048b8..bcdaba42f267 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. 
@@ -58,106 +58,206 @@ #include #include #include +#include +#ifdef _KERNEL +#include +#endif /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; +/* + * This tunable controls the amount of data (measured in bytes) that will be + * prefetched by zfs send. If the main thread is blocking on reads that haven't + * completed, this variable might need to be increased. If instead the main + * thread is issuing new reads because the prefetches have fallen out of the + * cache, this may need to be decreased. + */ int zfs_send_queue_length = SPA_MAXBLOCKSIZE; -int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; -/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ -int zfs_send_set_freerecords_bit = B_TRUE; - -static char *dmu_recv_tag = "dmu_recv_tag"; -const char *recv_clone_name = "%recv"; +/* + * This tunable controls the length of the queues that zfs send worker threads + * use to communicate. If the send_main_thread is blocking on these queues, + * this variable may need to be increased. If there is a significant slowdown + * at the start of a send as these threads consume all the available IO + * resources, this variable may need to be decreased. + */ +int zfs_send_no_prefetch_queue_length = 1024 * 1024; +/* + * These tunables control the fill fraction of the queues by zfs send. The fill + * fraction controls the frequency with which threads have to be cv_signaled. + * If a lot of cpu time is being spent on cv_signal, then these should be tuned + * down. If the queues empty before the signalled thread can catch up, then + * these should be tuned up. + */ +int zfs_send_queue_ff = 20; +int zfs_send_no_prefetch_queue_ff = 20; /* * Use this to override the recordsize calculation for fast zfs send estimates. 
*/ unsigned long zfs_override_estimate_recordsize = 0; -#define BP_SPAN(datablkszsec, indblkshift, level) \ - (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (indblkshift - SPA_BLKPTRSHIFT))) +/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ +int zfs_send_set_freerecords_bit = B_TRUE; + +static inline boolean_t +overflow_multiply(uint64_t a, uint64_t b, uint64_t *c) +{ + uint64_t temp = a * b; + if (b != 0 && temp / b != a) + return (B_FALSE); + *c = temp; + return (B_TRUE); +} -static void byteswap_record(dmu_replay_record_t *drr); +/* + * Return B_TRUE and modifies *out to the span if the span is less than 2^64, + * returns B_FALSE otherwise. + */ +static inline boolean_t +bp_span(uint32_t datablksz, uint8_t indblkshift, uint64_t level, uint64_t *out) +{ + uint64_t spanb = bp_span_in_blocks(indblkshift, level); + return (overflow_multiply(spanb, datablksz, out)); +} struct send_thread_arg { bqueue_t q; dsl_dataset_t *ds; /* Dataset to traverse */ + redaction_list_t *redaction_list; + struct send_redact_record *current_record; uint64_t fromtxg; /* Traverse from this txg */ int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; zbookmark_phys_t resume; + objlist_t *deleted_objs; + uint64_t *num_blocks_visited; }; -struct send_block_record { - boolean_t eos_marker; /* Marks the end of the stream */ - blkptr_t bp; - zbookmark_phys_t zb; - uint8_t indblkshift; - uint16_t datablkszsec; - bqueue_node_t ln; +struct redact_list_thread_arg { + boolean_t cancel; + bqueue_t q; + zbookmark_phys_t resume; + redaction_list_t *rl; + boolean_t mark_redact; + int error_code; + uint64_t *num_blocks_visited; }; -typedef struct dump_bytes_io { - dmu_sendarg_t *dbi_dsp; - void *dbi_buf; - int dbi_len; -} dump_bytes_io_t; +/* + * A wrapper around struct redact_block so it can be stored in a list_t. 
+ */ +struct redact_block_list_node { + redact_block_phys_t block; + list_node_t node; +}; -static void -dump_bytes_cb(void *arg) -{ - dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; - dmu_sendarg_t *dsp = dbi->dbi_dsp; - dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); - ssize_t resid; /* have to get resid to get detailed errno */ +struct redact_bookmark_info { + redact_block_phys_t rbi_furthest[TXG_SIZE]; + /* Lists of struct redact_block_list_node. */ + list_t rbi_blocks[TXG_SIZE]; + boolean_t rbi_synctasc_txg[TXG_SIZE]; + uint64_t rbi_latest_synctask_txg; + redaction_list_t *rbi_redaction_list; +}; +struct send_merge_thread_arg { + bqueue_t q; + objset_t *os; + struct redact_list_thread_arg *from_arg; + struct send_thread_arg *to_arg; + struct redact_list_thread_arg *redact_arg; + int error; + boolean_t cancel; + struct redact_bookmark_info rbi; /* - * The code does not rely on len being a multiple of 8. We keep - * this assertion because of the corresponding assertion in - * receive_read(). Keeping this assertion ensures that we do not - * inadvertently break backwards compatibility (causing the assertion - * in receive_read() to trigger on old software). Newer feature flags - * (such as raw send) may break this assertion since they were - * introduced after the requirement was made obsolete. + * If we're resuming a redacted send, then the object/offset from the + * resume token may be different from the object/offset that we have + * updated the bookmark to. resume_redact_zb will store the earlier of + * the two object/offset pairs, and bookmark_before will be B_TRUE if + * resume_redact_zb has the object/offset for resuming the redaction + * bookmark, and B_FALSE if resume_redact_zb is storing the + * object/offset from the resume token. 
*/ + zbookmark_phys_t resume_redact_zb; + boolean_t bookmark_before; +}; - ASSERT(dbi->dbi_len % 8 == 0 || - (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); - - dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, - (caddr_t)dbi->dbi_buf, dbi->dbi_len, - 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); +struct send_range { + boolean_t eos_marker; /* Marks the end of the stream */ + uint64_t object; + uint64_t start_blkid; + uint64_t end_blkid; + bqueue_node_t ln; + enum type {DATA, HOLE, OBJECT, OBJECT_RANGE, REDACT, + PREVIOUSLY_REDACTED} type; + union { + struct srd { + dmu_object_type_t obj_type; + uint32_t datablksz; + blkptr_t bp; + } data; + struct srh { + uint32_t datablksz; + } hole; + struct sro { + /* + * This is a pointer because embedding it in the + * struct causes these structures to be massively larger + * for all range types; this makes the code much less + * memory efficient. + */ + dnode_phys_t *dnp; + blkptr_t bp; + } object; + struct srr { + uint32_t datablksz; + } redact; + struct sror { + blkptr_t bp; + } object_range; + } sru; +}; - mutex_enter(&ds->ds_sendstream_lock); - *dsp->dsa_off += dbi->dbi_len; - mutex_exit(&ds->ds_sendstream_lock); -} +/* + * The list of data whose inclusion in a send stream can be pending from + * one call to backup_cb to another. Multiple calls to dump_free(), + * dump_freeobjects(), and dump_redact() can be aggregated into a single + * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record. 
+ */ +typedef enum { + PENDING_NONE, + PENDING_FREE, + PENDING_FREEOBJECTS, + PENDING_REDACT +} dmu_pendop_t; + +typedef struct dmu_send_cookie { + dmu_replay_record_t *dsc_drr; + dmu_send_outparams_t *dsc_dso; + offset_t *dsc_off; + objset_t *dsc_os; + zio_cksum_t dsc_zc; + uint64_t dsc_toguid; + int dsc_err; + dmu_pendop_t dsc_pending_op; + uint64_t dsc_featureflags; + uint64_t dsc_last_data_object; + uint64_t dsc_last_data_offset; + uint64_t dsc_resume_object; + uint64_t dsc_resume_offset; + boolean_t dsc_sent_begin; + boolean_t dsc_sent_end; +} dmu_send_cookie_t; -static int -dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) +static void +range_free(struct send_range *range) { - dump_bytes_io_t dbi; - - dbi.dbi_dsp = dsp; - dbi.dbi_buf = buf; - dbi.dbi_len = len; - -#if defined(HAVE_LARGE_STACKS) - dump_bytes_cb(&dbi); -#else - /* - * The vn_rdwr() call is performed in a taskq to ensure that there is - * always enough stack space to write safely to the target filesystem. - * The ZIO_TYPE_FREE threads are used because there can be a lot of - * them and they are used in vdev_file.c for a similar purpose. - */ - spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE, - ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); -#endif /* HAVE_LARGE_STACKS */ - - return (dsp->dsa_err); + if (range->type == OBJECT) { + kmem_free(range->sru.object.dnp, + sizeof (*range->sru.object.dnp)); + } + kmem_free(range, sizeof (*range)); } /* @@ -166,32 +266,60 @@ dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) * up to the start of the checksum itself. 
*/ static int -dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) +dump_record(dmu_send_cookie_t *dscp, void *payload, int payload_len) { + dmu_send_outparams_t *dso = dscp->dsc_dso; ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - (void) fletcher_4_incremental_native(dsp->dsa_drr, + (void) fletcher_4_incremental_native(dscp->dsc_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - &dsp->dsa_zc); - if (dsp->dsa_drr->drr_type == DRR_BEGIN) { - dsp->dsa_sent_begin = B_TRUE; + &dscp->dsc_zc); + if (dscp->dsc_drr->drr_type == DRR_BEGIN) { + dscp->dsc_sent_begin = B_TRUE; } else { - ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. + ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp->dsc_drr->drr_u. drr_checksum.drr_checksum)); - dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; + dscp->dsc_drr->drr_u.drr_checksum.drr_checksum = dscp->dsc_zc; } - if (dsp->dsa_drr->drr_type == DRR_END) { - dsp->dsa_sent_end = B_TRUE; + if (dscp->dsc_drr->drr_type == DRR_END) { + dscp->dsc_sent_end = B_TRUE; } - (void) fletcher_4_incremental_native(&dsp->dsa_drr-> + (void) fletcher_4_incremental_native(&dscp->dsc_drr-> drr_u.drr_checksum.drr_checksum, - sizeof (zio_cksum_t), &dsp->dsa_zc); - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) + *dscp->dsc_off += sizeof (dmu_replay_record_t); + dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, dscp->dsc_drr, + sizeof (dmu_replay_record_t), dso->dso_arg); + if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { - (void) fletcher_4_incremental_native(payload, payload_len, - &dsp->dsa_zc); - if (dump_bytes(dsp, payload, payload_len) != 0) + *dscp->dsc_off += payload_len; + /* + * payload is null when dso->dso_dryrun == B_TRUE (i.e.
when we're + * doing a send size calculation) + */ + if (payload != NULL) { + (void) fletcher_4_incremental_native( + payload, payload_len, &dscp->dsc_zc); + } + + /* + * The code does not rely on this (len being a multiple of 8). + * We keep this assertion because of the corresponding assertion + * in receive_read(). Keeping this assertion ensures that we do + * not inadvertently break backwards compatibility (causing the + * assertion in receive_read() to trigger on old software). + * + * Raw sends cannot be received on old software, and so can + * bypass this assertion. + */ + + ASSERT((payload_len % 8 == 0) || + (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)); + + dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, payload, + payload_len, dso->dso_arg); + if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); } return (0); @@ -206,10 +334,10 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) * and freeobject records that were generated on the source. */ static int -dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, +dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, uint64_t length) { - struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); + struct drr_free *drrf = &(dscp->dsc_drr->drr_u.drr_free); /* * When we receive a free record, dbuf_free_range() assumes @@ -224,87 +352,131 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, * another way to assert that the one-record constraint is still * satisfied. */ - ASSERT(object > dsp->dsa_last_data_object || - (object == dsp->dsa_last_data_object && - offset > dsp->dsa_last_data_offset)); + ASSERT(object > dscp->dsc_last_data_object || + (object == dscp->dsc_last_data_object && + offset > dscp->dsc_last_data_offset)); /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_FREE records can only be aggregated with * other DRR_FREE records. 
DRR_FREEOBJECTS records can only be - * aggregated with other DRR_FREEOBJECTS records. + * aggregated with other DRR_FREEOBJECTS records). */ - if (dsp->dsa_pending_op != PENDING_NONE && - dsp->dsa_pending_op != PENDING_FREE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE && + dscp->dsc_pending_op != PENDING_FREE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } - if (dsp->dsa_pending_op == PENDING_FREE) { - /* - * There should never be a PENDING_FREE if length is - * DMU_OBJECT_END (because dump_dnode is the only place where - * this function is called with a DMU_OBJECT_END, and only after - * flushing any pending record). - */ - ASSERT(length != DMU_OBJECT_END); + if (dscp->dsc_pending_op == PENDING_FREE) { /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { - if (offset + length < offset) - drrf->drr_length = DMU_OBJECT_END; + if (offset + length < offset || length == UINT64_MAX) + drrf->drr_length = UINT64_MAX; else drrf->drr_length += length; return (0); } else { /* not a continuation. 
Push out pending record */ - if (dump_record(dsp, NULL, 0) != 0) + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_FREE; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; if (offset + length < offset) drrf->drr_length = DMU_OBJECT_END; else drrf->drr_length = length; - drrf->drr_toguid = dsp->dsa_toguid; + drrf->drr_toguid = dscp->dsc_toguid; if (length == DMU_OBJECT_END) { - if (dump_record(dsp, NULL, 0) != 0) + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); } else { - dsp->dsa_pending_op = PENDING_FREE; + dscp->dsc_pending_op = PENDING_FREE; + } + + return (0); +} + +/* + * Fill in the drr_redact struct, or perform aggregation if the previous record + * is also a redaction record, and the two are adjacent. + */ +static int +dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, + uint64_t length) +{ + struct drr_redact *drrr = &dscp->dsc_drr->drr_u.drr_redact; + + /* + * If there is a pending op, but it's not PENDING_REDACT, push it out, + * since free block aggregation can only be done for blocks of the + * same type (i.e., DRR_REDACT records can only be aggregated with + * other DRR_REDACT records). + */ + if (dscp->dsc_pending_op != PENDING_NONE && + dscp->dsc_pending_op != PENDING_REDACT) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + + if (dscp->dsc_pending_op == PENDING_REDACT) { + /* + * Check to see whether this redacted block can be aggregated + * with pending one. + */ + if (drrr->drr_object == object && drrr->drr_offset + + drrr->drr_length == offset) { + drrr->drr_length += length; + return (0); + } else { + /* not a continuation. 
Push out pending record */ + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } } + /* create a REDACT record and make it pending */ + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_REDACT; + drrr->drr_object = object; + drrr->drr_offset = offset; + drrr->drr_length = length; + drrr->drr_toguid = dscp->dsc_toguid; + dscp->dsc_pending_op = PENDING_REDACT; return (0); } static int -dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, +dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data) { uint64_t payload_size; - boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW); - struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); + boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); + struct drr_write *drrw = &(dscp->dsc_drr->drr_u.drr_write); /* * We send data in increasing object, offset order. * See comment in dump_free() for details. */ - ASSERT(object > dsp->dsa_last_data_object || - (object == dsp->dsa_last_data_object && - offset > dsp->dsa_last_data_offset)); - dsp->dsa_last_data_object = object; - dsp->dsa_last_data_offset = offset + lsize - 1; + ASSERT(object > dscp->dsc_last_data_object || + (object == dscp->dsc_last_data_object && + offset > dscp->dsc_last_data_offset)); + dscp->dsc_last_data_object = object; + dscp->dsc_last_data_offset = offset + lsize - 1; /* * If there is any kind of pending aggregation (currently either @@ -312,22 +484,24 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, * the stream, since aggregation can't be done across operations * of different types. 
*/ - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } /* write a WRITE record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_WRITE; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; - drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_toguid = dscp->dsc_toguid; drrw->drr_logical_size = lsize; /* only set the compression fields if the buf is compressed or raw */ if (raw || lsize != psize) { + ASSERT(raw || dscp->dsc_featureflags & + DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3S(psize, >, 0); @@ -347,7 +521,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, zio_crypt_decode_mac_bp(bp, drrw->drr_mac); } else { /* this is a compressed block */ - ASSERT(dsp->dsa_featureflags & + ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); @@ -383,33 +557,33 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, drrw->drr_key.ddk_cksum = bp->blk_cksum; } - if (dump_record(dsp, data, payload_size) != 0) + if (dump_record(dscp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int -dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, +dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp) { char buf[BPE_PAYLOAD_SIZE]; struct drr_write_embedded *drrw = - &(dsp->dsa_drr->drr_u.drr_write_embedded); + &(dscp->dsc_drr->drr_u.drr_write_embedded); - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != 
PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } ASSERT(BP_IS_EMBEDDED(bp)); - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; drrw->drr_length = blksz; - drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_toguid = dscp->dsc_toguid; drrw->drr_compression = BP_GET_COMPRESS(bp); drrw->drr_etype = BPE_GET_ETYPE(bp); drrw->drr_lsize = BPE_GET_LSIZE(bp); @@ -417,33 +591,34 @@ dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, decode_embedded_bp_compressed(bp, buf); - if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) + if (dump_record(dscp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) return (SET_ERROR(EINTR)); return (0); } static int -dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data) +dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, + void *data) { - struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); + struct drr_spill *drrs = &(dscp->dsc_drr->drr_u.drr_spill); uint64_t blksz = BP_GET_LSIZE(bp); uint64_t payload_size = blksz; - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } /* write a SPILL record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_SPILL; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; - drrs->drr_toguid = dsp->dsa_toguid; + drrs->drr_toguid = dscp->dsc_toguid; /* handle raw send fields */ - if 
(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { + if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_PROTECTED(bp)); if (BP_SHOULD_BYTESWAP(bp)) @@ -455,17 +630,17 @@ dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data) payload_size = drrs->drr_compressed_size; } - if (dump_record(dsp, data, payload_size) != 0) + if (dump_record(dscp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int -dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) +dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs) { - struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); + struct drr_freeobjects *drrfo = &(dscp->dsc_drr->drr_u.drr_freeobjects); uint64_t maxobj = DNODES_PER_BLOCK * - (DMU_META_DNODE(dsp->dsa_os)->dn_maxblkid + 1); + (DMU_META_DNODE(dscp->dsc_os)->dn_maxblkid + 1); /* * ZoL < 0.7 does not handle large FREEOBJECTS records correctly, @@ -486,15 +661,18 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) * push it out, since free block aggregation can only be done for * blocks of the same type (i.e., DRR_FREE records can only be * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records - * can only be aggregated with other DRR_FREEOBJECTS records. + * can only be aggregated with other DRR_FREEOBJECTS records). 
*/ - if (dsp->dsa_pending_op != PENDING_NONE && - dsp->dsa_pending_op != PENDING_FREEOBJECTS) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE && + dscp->dsc_pending_op != PENDING_FREEOBJECTS) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } - if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { + if (numobjs == 0) + numobjs = UINT64_MAX - firstobj; + + if (dscp->dsc_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one @@ -504,32 +682,32 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) return (0); } else { /* can't be aggregated. Push out pending record */ - if (dump_record(dsp, NULL, 0) != 0) + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; - drrfo->drr_toguid = dsp->dsa_toguid; + drrfo->drr_toguid = dscp->dsc_toguid; - dsp->dsa_pending_op = PENDING_FREEOBJECTS; + dscp->dsc_pending_op = PENDING_FREEOBJECTS; return (0); } static int -dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, +dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, dnode_phys_t *dnp) { - struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); + struct drr_object *drro = &(dscp->dsc_drr->drr_u.drr_object); int bonuslen; - if (object < dsp->dsa_resume_object) { + if (object < dscp->dsc_resume_object) { /* * Note: when resuming, we will visit all the dnodes in * the block of dnodes that we are resuming from. 
In @@ -537,23 +715,23 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, * the one we are resuming from. We should be at most one * block's worth of dnodes behind the resume point. */ - ASSERT3U(dsp->dsa_resume_object - object, <, + ASSERT3U(dscp->dsc_resume_object - object, <, 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); return (0); } if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) - return (dump_freeobjects(dsp, object, 1)); + return (dump_freeobjects(dscp, object, 1)); - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } /* write an OBJECT record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_OBJECT; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; @@ -562,15 +740,15 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, drro->drr_dn_slots = dnp->dn_extra_slots + 1; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; - drro->drr_toguid = dsp->dsa_toguid; + drro->drr_toguid = dscp->dsc_toguid; - if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8); - if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) { + if ((dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) { ASSERT(BP_IS_ENCRYPTED(bp)); if (BP_SHOULD_BYTESWAP(bp)) @@ -593,55 +771,55 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, } } - if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0) + if (dump_record(dscp, DN_BONUS(dnp), 
bonuslen) != 0) return (SET_ERROR(EINTR)); /* Free anything past the end of the file. */ - if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * + if (dump_free(dscp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) return (SET_ERROR(EINTR)); - if (dsp->dsa_err != 0) + if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); return (0); } static int -dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj, - uint64_t numslots) +dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp, + uint64_t firstobj, uint64_t numslots) { struct drr_object_range *drror = - &(dsp->dsa_drr->drr_u.drr_object_range); + &(dscp->dsc_drr->drr_u.drr_object_range); /* we only use this record type for raw sends */ ASSERT(BP_IS_PROTECTED(bp)); - ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW); + ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE); ASSERT0(BP_GET_LEVEL(bp)); - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE; drror->drr_firstobj = firstobj; drror->drr_numslots = numslots; - drror->drr_toguid = dsp->dsa_toguid; + drror->drr_toguid = dscp->dsc_toguid; if (BP_SHOULD_BYTESWAP(bp)) drror->drr_flags |= DRR_RAW_BYTESWAP; zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv); zio_crypt_decode_mac_bp(bp, drror->drr_mac); - if (dump_record(dsp, NULL, 0) != 0) + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); return (0); } static boolean_t -backup_do_embed(dmu_sendarg_t 
*dsp, const blkptr_t *bp) +send_do_embed(dmu_send_cookie_t *dscp, const blkptr_t *bp) { if (!BP_IS_EMBEDDED(bp)) return (B_FALSE); @@ -650,7 +828,7 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) * Compression function must be legacy, or explicitly enabled. */ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && - !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) + !(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LZ4))) return (B_FALSE); /* @@ -658,7 +836,7 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) */ switch (BPE_GET_ETYPE(bp)) { case BP_EMBEDDED_TYPE_DATA: - if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (B_TRUE); break; default: @@ -667,192 +845,86 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) return (B_FALSE); } -/* - * This is the callback function to traverse_dataset that acts as the worker - * thread for dmu_send_impl. - */ -/*ARGSUSED*/ -static int -send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) -{ - struct send_thread_arg *sta = arg; - struct send_block_record *record; - uint64_t record_size; - int err = 0; - - ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || - zb->zb_object >= sta->resume.zb_object); - ASSERT3P(sta->ds, !=, NULL); - - if (sta->cancel) - return (SET_ERROR(EINTR)); - - if (bp == NULL) { - ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); - return (0); - } else if (zb->zb_level < 0) { - return (0); - } - - record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); - record->eos_marker = B_FALSE; - record->bp = *bp; - record->zb = *zb; - record->indblkshift = dnp->dn_indblkshift; - record->datablkszsec = dnp->dn_datablkszsec; - record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - bqueue_enqueue(&sta->q, record, record_size); - - return (err); -} - -/* - * This function kicks off the traverse_dataset. 
It also handles setting the - * error code of the thread in case something goes wrong, and pushes the End of - * Stream record when the traverse_dataset call has finished. If there is no - * dataset to traverse, the thread immediately pushes End of Stream marker. - */ -static void -send_traverse_thread(void *arg) -{ - struct send_thread_arg *st_arg = arg; - int err; - struct send_block_record *data; - fstrans_cookie_t cookie = spl_fstrans_mark(); - - if (st_arg->ds != NULL) { - err = traverse_dataset_resume(st_arg->ds, - st_arg->fromtxg, &st_arg->resume, - st_arg->flags, send_cb, st_arg); - - if (err != EINTR) - st_arg->error_code = err; - } - data = kmem_zalloc(sizeof (*data), KM_SLEEP); - data->eos_marker = B_TRUE; - bqueue_enqueue(&st_arg->q, data, 1); - spl_fstrans_unmark(cookie); - thread_exit(); -} - /* * This function actually handles figuring out what kind of record needs to be * dumped, reading the data (which has hopefully been prefetched), and calling * the appropriate helper function. */ static int -do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) +do_dump(dmu_send_cookie_t *dscp, struct send_range *range) { - dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); - const blkptr_t *bp = &data->bp; - const zbookmark_phys_t *zb = &data->zb; - uint8_t indblkshift = data->indblkshift; - uint16_t dblkszsec = data->datablkszsec; - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; - - ASSERT3U(zb->zb_level, >=, 0); - - ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || - zb->zb_object >= dsa->dsa_resume_object); - - /* - * All bps of an encrypted os should have the encryption bit set. - * If this is not true it indicates tampering and we report an error. 
- */ - if (dsa->dsa_os->os_encrypted && - !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { - spa_log_error(spa, zb); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", ds->ds_object); - return (SET_ERROR(EIO)); - } - - if (zb->zb_object != DMU_META_DNODE_OBJECT && - DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { - return (0); - } else if (BP_IS_HOLE(bp) && - zb->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); - uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); - } else if (BP_IS_HOLE(bp)) { - uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); - uint64_t offset = zb->zb_blkid * span; - /* Don't dump free records for offsets > DMU_OBJECT_END */ - if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid) - err = dump_free(dsa, zb->zb_object, offset, span); - } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { - return (0); - } else if (type == DMU_OT_DNODE) { - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; - - if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { - ASSERT(BP_IS_ENCRYPTED(bp)); - ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); - zioflags |= ZIO_FLAG_RAW; + switch (range->type) { + case OBJECT: + err = dump_dnode(dscp, &range->sru.object.bp, range->object, + range->sru.object.dnp); + return (err); + case OBJECT_RANGE: { + ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); + if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) { + return (0); } + uint64_t epb = BP_GET_LSIZE(&range->sru.object_range.bp) >> + DNODE_SHIFT; + uint64_t firstobj = range->start_blkid * epb; + err = dump_object_range(dscp, &range->sru.object_range.bp, + firstobj, epb); + break; + } + case REDACT: { + struct srr *srrp = &range->sru.redact; + err = dump_redact(dscp, range->object, range->start_blkid * + srrp->datablksz, 
(range->end_blkid - range->start_blkid) * + srrp->datablksz); + return (err); + } + case DATA: { + struct srd *srdp = &range->sru.data; + blkptr_t *bp = &srdp->bp; + spa_t *spa = + dmu_objset_spa(dscp->dsc_os); + + ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); + ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); + if (BP_GET_TYPE(bp) == DMU_OT_SA) { + arc_flags_t aflags = ARC_FLAG_WAIT; + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + + if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { + ASSERT(BP_IS_PROTECTED(bp)); + zioflags |= ZIO_FLAG_RAW; + } - ASSERT0(zb->zb_level); - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - dnode_phys_t *blk = abuf->b_data; - uint64_t dnobj = zb->zb_blkid * epb; - - /* - * Raw sends require sending encryption parameters for the - * block of dnodes. Regular sends do not need to send this - * info. - */ - if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { - ASSERT(arc_is_encrypted(abuf)); - err = dump_object_range(dsa, bp, dnobj, epb); - } + arc_buf_t *abuf; + zbookmark_phys_t zb; + ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); + zb.zb_objset = dmu_objset_id(dscp->dsc_os); + zb.zb_object = range->object; + zb.zb_level = 0; + zb.zb_blkid = range->start_blkid; + + if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa, + bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + zioflags, &aflags, &zb) != 0) + return (SET_ERROR(EIO)); - if (err == 0) { - for (int i = 0; i < epb; - i += blk[i].dn_extra_slots + 1) { - err = dump_dnode(dsa, bp, dnobj + i, blk + i); - if (err != 0) - break; - } + err = dump_spill(dscp, bp, zb.zb_object, abuf->b_data); + arc_buf_destroy(abuf, &abuf); + return (err); } - arc_buf_destroy(abuf, &abuf); - } else if (type == DMU_OT_SA) { - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; - - if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { - 
ASSERT(BP_IS_PROTECTED(bp)); - zioflags |= ZIO_FLAG_RAW; + if (send_do_embed(dscp, bp)) { + err = dump_write_embedded(dscp, range->object, + range->start_blkid * srdp->datablksz, + srdp->datablksz, bp); + return (err); } - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data); - arc_buf_destroy(abuf, &abuf); - } else if (backup_do_embed(dsa, bp)) { - /* it's an embedded level-0 block of a regular object */ - int blksz = dblkszsec << SPA_MINBLOCKSHIFT; - ASSERT0(zb->zb_level); - err = dump_write_embedded(dsa, zb->zb_object, - zb->zb_blkid * blksz, blksz, bp); - } else { + ASSERT(range->object > dscp->dsc_resume_object || + (range->object == dscp->dsc_resume_object && + range->start_blkid * srdp->datablksz >= + dscp->dsc_resume_offset)); /* it's a level-0 block of a regular object */ arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - int blksz = dblkszsec << SPA_MINBLOCKSHIFT; + arc_buf_t *abuf = NULL; uint64_t offset; /* @@ -860,15 +932,16 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) * don't allow us to send large blocks, we split the data from * the arc buf into chunks. */ - boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && - !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); + boolean_t split_large_blocks = + srdp->datablksz > SPA_OLD_MAXBLOCKSIZE && + !(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); /* * Raw sends require that we always get raw data as it exists * on disk, so we assert that we are not splitting blocks here. 
*/ boolean_t request_raw = - (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0; + (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0; /* * We should only request compressed data from the ARC if all @@ -881,3481 +954,1922 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) * system it can be byteswapped more easily) */ boolean_t request_compressed = - (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && + (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); IMPLY(request_raw, !split_large_blocks); IMPLY(request_raw, BP_IS_PROTECTED(bp)); - ASSERT0(zb->zb_level); - ASSERT(zb->zb_object > dsa->dsa_resume_object || - (zb->zb_object == dsa->dsa_resume_object && - zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); - - ASSERT3U(blksz, ==, BP_GET_LSIZE(bp)); - - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; - if (request_raw) - zioflags |= ZIO_FLAG_RAW; - else if (request_compressed) - zioflags |= ZIO_FLAG_RAW_COMPRESS; - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { - if (zfs_send_corrupt_data) { + if (!dscp->dsc_dso->dso_dryrun) { + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + + ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); + + if (request_raw) + zioflags |= ZIO_FLAG_RAW; + else if (request_compressed) + zioflags |= ZIO_FLAG_RAW_COMPRESS; + zbookmark_phys_t zb; + zb.zb_objset = dmu_objset_id(dscp->dsc_os); + zb.zb_object = range->object; + zb.zb_level = 0; + zb.zb_blkid = range->start_blkid; + + err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, &zb); + } + + if (err != 0) { + if (zfs_send_corrupt_data && + !dscp->dsc_dso->dso_dryrun) { /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, - blksz); + srdp->datablksz); uint64_t *ptr; for (ptr = abuf->b_data; - (char *)ptr < 
(char *)abuf->b_data + blksz; - ptr++) + (char *)ptr < (char *)abuf->b_data + + srdp->datablksz; ptr++) *ptr = 0x2f5baddb10cULL; } else { return (SET_ERROR(EIO)); } } - offset = zb->zb_blkid * blksz; + offset = range->start_blkid * srdp->datablksz; if (split_large_blocks) { ASSERT0(arc_is_encrypted(abuf)); ASSERT3U(arc_get_compression(abuf), ==, ZIO_COMPRESS_OFF); char *buf = abuf->b_data; - while (blksz > 0 && err == 0) { - int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); - err = dump_write(dsa, type, zb->zb_object, - offset, n, n, NULL, buf); + while (srdp->datablksz > 0 && err == 0) { + int n = MIN(srdp->datablksz, + SPA_OLD_MAXBLOCKSIZE); + err = dump_write(dscp, srdp->obj_type, + range->object, offset, n, n, NULL, buf); offset += n; buf += n; - blksz -= n; + srdp->datablksz -= n; } } else { - err = dump_write(dsa, type, zb->zb_object, offset, - blksz, arc_buf_size(abuf), bp, abuf->b_data); + int psize; + if (abuf != NULL) { + psize = arc_buf_size(abuf); + if (arc_get_compression(abuf) != + ZIO_COMPRESS_OFF) { + ASSERT3S(psize, ==, BP_GET_PSIZE(bp)); + } + } else if (!request_compressed) { + psize = srdp->datablksz; + } else { + psize = BP_GET_PSIZE(bp); + } + err = dump_write(dscp, srdp->obj_type, range->object, + offset, srdp->datablksz, psize, bp, + (abuf == NULL ? NULL : abuf->b_data)); } - arc_buf_destroy(abuf, &abuf); + if (abuf != NULL) + arc_buf_destroy(abuf, &abuf); + return (err); } + case HOLE: { + struct srh *srhp = &range->sru.hole; + if (range->object == DMU_META_DNODE_OBJECT) { + uint32_t span = srhp->datablksz >> DNODE_SHIFT; + uint64_t first_obj = range->start_blkid * span; + uint64_t numobj = range->end_blkid * span - first_obj; + return (dump_freeobjects(dscp, first_obj, numobj)); + } + uint64_t offset = 0; + + /* + * If this multiply overflows, we don't need to send this block. + * Even if it has a birth time, it can never not be a hole, so + * we don't need to send records for it. 
+ */ + if (!overflow_multiply(range->start_blkid, srhp->datablksz, + &offset)) { + return (0); + } + uint64_t len = 0; - ASSERT(err == 0 || err == EINTR); + if (!overflow_multiply(range->end_blkid, srhp->datablksz, &len)) + len = UINT64_MAX; + len = len - offset; + return (dump_free(dscp, range->object, offset, len)); + } + default: + panic("Invalid range type in do_dump: %d", range->type); + } return (err); } -/* - * Pop the new data off the queue, and free the old data. - */ -static struct send_block_record * -get_next_record(bqueue_t *bq, struct send_block_record *data) +struct send_range * +range_alloc(enum type type, uint64_t object, uint64_t start_blkid, + uint64_t end_blkid, boolean_t eos) { - struct send_block_record *tmp = bqueue_dequeue(bq); - kmem_free(data, sizeof (*data)); - return (tmp); + struct send_range *range = kmem_alloc(sizeof (*range), KM_SLEEP); + range->type = type; + range->object = object; + range->start_blkid = start_blkid; + range->end_blkid = end_blkid; + range->eos_marker = eos; + return (range); } /* - * Actually do the bulk of the work in a zfs send. - * - * Note: Releases dp using the specified tag. + * This is the callback function to traverse_dataset that acts as a worker + * thread for dmu_send_impl. 
*/ +/*ARGSUSED*/ static int -dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, - boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff, - vnode_t *vp, offset_t *off) +send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { - objset_t *os; - dmu_replay_record_t *drr; - dmu_sendarg_t *dsp; - int err; - uint64_t fromtxg = 0; - uint64_t featureflags = 0; - struct send_thread_arg to_arg; - void *payload = NULL; - size_t payload_len = 0; - struct send_block_record *to_data; + struct send_thread_arg *sta = arg; + struct send_range *record; - err = dmu_objset_from_ds(to_ds, &os); - if (err != 0) { - dsl_pool_rele(dp, tag); - return (err); - } + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= sta->resume.zb_object); + ASSERT3P(sta->ds, !=, NULL); /* - * If this is a non-raw send of an encrypted ds, we can ensure that - * the objset_phys_t is authenticated. This is safe because this is - * either a snapshot or we have owned the dataset, ensuring that - * it can't be modified. + * All bps of an encrypted os should have the encryption bit set. + * If this is not true it indicates tampering and we report an error. 
*/ - if (!rawok && os->os_encrypted && - arc_is_unauthenticated(os->os_phys_buf)) { - zbookmark_phys_t zb; + objset_t *os; + VERIFY0(dmu_objset_from_ds(sta->ds, &os)); + if (os->os_encrypted && + !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { + spa_log_error(spa, zb); + zfs_panic_recover("unencrypted block in encrypted " + "object set %llu", sta->ds->ds_object); + return (SET_ERROR(EIO)); + } - SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT, - ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - err = arc_untransform(os->os_phys_buf, os->os_spa, - &zb, B_FALSE); - if (err != 0) { - dsl_pool_rele(dp, tag); - return (err); - } + if (sta->cancel) + return (SET_ERROR(EINTR)); + if (zb->zb_object != DMU_META_DNODE_OBJECT && + DMU_OBJECT_IS_SPECIAL(zb->zb_object)) + return (0); + atomic_inc_64(sta->num_blocks_visited); - ASSERT0(arc_is_unauthenticated(os->os_phys_buf)); + if (zb->zb_level == ZB_DNODE_LEVEL) { + if (zb->zb_object == DMU_META_DNODE_OBJECT) + return (0); + record = range_alloc(OBJECT, zb->zb_object, 0, 0, B_FALSE); + record->sru.object.bp = *bp; + record->sru.object.dnp = kmem_alloc(sizeof (*dnp), KM_SLEEP); + *record->sru.object.dnp = *dnp; + bqueue_enqueue(&sta->q, record, sizeof (*record)); + return (0); + } + if (zb->zb_level == 0 && zb->zb_object == DMU_META_DNODE_OBJECT) { + if (BP_IS_HOLE(bp)) + return (0); + record = range_alloc(OBJECT_RANGE, 0, zb->zb_blkid, + zb->zb_blkid + 1, B_FALSE); + record->sru.object_range.bp = *bp; + bqueue_enqueue(&sta->q, record, sizeof (*record)); + return (0); } + if (zb->zb_level < 0 || (zb->zb_level > 0 && !BP_IS_HOLE(bp))) + return (0); + if (zb->zb_object == DMU_META_DNODE_OBJECT && !BP_IS_HOLE(bp)) + return (0); - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, - DMU_SUBSTREAM); - - bzero(&to_arg, sizeof (to_arg)); - -#ifdef _KERNEL - if (dmu_objset_type(os) == DMU_OST_ZFS) { - 
uint64_t version; - if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - dsl_pool_rele(dp, tag); - return (SET_ERROR(EINVAL)); - } - if (version >= ZPL_VERSION_SA) { - featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; - } - } -#endif - - /* raw sends imply large_block_ok */ - if ((large_block_ok || rawok) && - to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) - featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; - if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) - featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; - - /* encrypted datasets will not have embedded blocks */ - if ((embedok || rawok) && !os->os_encrypted && - spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { - featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; - } - - /* raw send implies compressok */ - if (compressok || rawok) - featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; - if (rawok && os->os_encrypted) - featureflags |= DMU_BACKUP_FEATURE_RAW; - - if ((featureflags & - (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED | - DMU_BACKUP_FEATURE_RAW)) != 0 && - spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { - featureflags |= DMU_BACKUP_FEATURE_LZ4; - } - - if (resumeobj != 0 || resumeoff != 0) { - featureflags |= DMU_BACKUP_FEATURE_RESUMING; - } - - DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, - featureflags); - - drr->drr_u.drr_begin.drr_creation_time = - dsl_dataset_phys(to_ds)->ds_creation_time; - drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); - if (is_clone) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; - if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - if (zfs_send_set_freerecords_bit) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; - - if (ancestor_zb != NULL) { - drr->drr_u.drr_begin.drr_fromguid = - ancestor_zb->zbm_guid; - fromtxg = 
ancestor_zb->zbm_creation_txg; - } - dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); - if (!to_ds->ds_is_snapshot) { - (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", - sizeof (drr->drr_u.drr_begin.drr_toname)); - } - - dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); - - dsp->dsa_drr = drr; - dsp->dsa_vp = vp; - dsp->dsa_outfd = outfd; - dsp->dsa_proc = curproc; - dsp->dsa_os = os; - dsp->dsa_off = off; - dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; - dsp->dsa_pending_op = PENDING_NONE; - dsp->dsa_featureflags = featureflags; - dsp->dsa_resume_object = resumeobj; - dsp->dsa_resume_offset = resumeoff; - - mutex_enter(&to_ds->ds_sendstream_lock); - list_insert_head(&to_ds->ds_sendstreams, dsp); - mutex_exit(&to_ds->ds_sendstream_lock); - - dsl_dataset_long_hold(to_ds, FTAG); - dsl_pool_rele(dp, tag); - - /* handle features that require a DRR_BEGIN payload */ - if (featureflags & - (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) { - nvlist_t *keynvl = NULL; - nvlist_t *nvl = fnvlist_alloc(); - - if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { - dmu_object_info_t to_doi; - err = dmu_object_info(os, resumeobj, &to_doi); - if (err != 0) { - fnvlist_free(nvl); - goto out; - } - - SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, - resumeobj, 0, - resumeoff / to_doi.doi_data_block_size); - - fnvlist_add_uint64(nvl, "resume_object", resumeobj); - fnvlist_add_uint64(nvl, "resume_offset", resumeoff); - } - - if (featureflags & DMU_BACKUP_FEATURE_RAW) { - ASSERT(os->os_encrypted); - - err = dsl_crypto_populate_key_nvlist(to_ds, &keynvl); - if (err != 0) { - fnvlist_free(nvl); - goto out; - } - - fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl); - } - - payload = fnvlist_pack(nvl, &payload_len); - drr->drr_payloadlen = payload_len; - fnvlist_free(keynvl); - fnvlist_free(nvl); - } - - err = dump_record(dsp, payload, payload_len); - fnvlist_pack_free(payload, payload_len); - if (err != 0) { - err = dsp->dsa_err; - goto out; - } - - err = 
bqueue_init(&to_arg.q, - MAX(zfs_send_queue_length, 2 * zfs_max_recordsize), - offsetof(struct send_block_record, ln)); - to_arg.error_code = 0; - to_arg.cancel = B_FALSE; - to_arg.ds = to_ds; - to_arg.fromtxg = fromtxg; - to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; - if (rawok) - to_arg.flags |= TRAVERSE_NO_DECRYPT; - (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc, - TS_RUN, minclsyspri); - - to_data = bqueue_dequeue(&to_arg.q); - - while (!to_data->eos_marker && err == 0) { - err = do_dump(dsp, to_data); - to_data = get_next_record(&to_arg.q, to_data); - if (issig(JUSTLOOKING) && issig(FORREAL)) - err = EINTR; - } - - if (err != 0) { - to_arg.cancel = B_TRUE; - while (!to_data->eos_marker) { - to_data = get_next_record(&to_arg.q, to_data); - } - } - kmem_free(to_data, sizeof (*to_data)); - - bqueue_destroy(&to_arg.q); - - if (err == 0 && to_arg.error_code != 0) - err = to_arg.error_code; - - if (err != 0) - goto out; - - if (dsp->dsa_pending_op != PENDING_NONE) - if (dump_record(dsp, NULL, 0) != 0) - err = SET_ERROR(EINTR); - - if (err != 0) { - if (err == EINTR && dsp->dsa_err != 0) - err = dsp->dsa_err; - goto out; - } - - bzero(drr, sizeof (dmu_replay_record_t)); - drr->drr_type = DRR_END; - drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; - drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; - - if (dump_record(dsp, NULL, 0) != 0) - err = dsp->dsa_err; -out: - mutex_enter(&to_ds->ds_sendstream_lock); - list_remove(&to_ds->ds_sendstreams, dsp); - mutex_exit(&to_ds->ds_sendstream_lock); - - VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); - - kmem_free(drr, sizeof (dmu_replay_record_t)); - kmem_free(dsp, sizeof (dmu_sendarg_t)); - - dsl_dataset_long_rele(to_ds, FTAG); - - return (err); -} - -int -dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - boolean_t rawok, int outfd, vnode_t *vp, offset_t *off) -{ - dsl_pool_t *dp; - 
dsl_dataset_t *ds; - dsl_dataset_t *fromds = NULL; - ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; - int err; - - err = dsl_pool_hold(pool, FTAG, &dp); - if (err != 0) - return (err); - - err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds); - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - if (fromsnap != 0) { - zfs_bookmark_phys_t zb; - boolean_t is_clone; - - err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); - if (err != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - dsl_pool_rele(dp, FTAG); - return (err); - } - if (!dsl_dataset_is_before(ds, fromds, 0)) - err = SET_ERROR(EXDEV); - zb.zbm_creation_time = - dsl_dataset_phys(fromds)->ds_creation_time; - zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; - zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; - is_clone = (fromds->ds_dir != ds->ds_dir); - dsl_dataset_rele(fromds, FTAG); - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, rawok, outfd, - 0, 0, vp, off); - } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, rawok, outfd, - 0, 0, vp, off); - } - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (err); -} - -int -dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, - int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, - offset_t *off) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; - boolean_t owned = B_FALSE; - - if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) - return (SET_ERROR(EINVAL)); - - err = dsl_pool_hold(tosnap, FTAG, &dp); - if (err != 0) - return (err); - - if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { - /* - * We are sending a filesystem or volume. Ensure - * that it doesn't change by owning the dataset. 
- */ - err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds); - owned = B_TRUE; - } else { - err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds); - } - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - if (fromsnap != NULL) { - zfs_bookmark_phys_t zb; - boolean_t is_clone = B_FALSE; - int fsnamelen = strchr(tosnap, '@') - tosnap; - - /* - * If the fromsnap is in a different filesystem, then - * mark the send stream as a clone. - */ - if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || - (fromsnap[fsnamelen] != '@' && - fromsnap[fsnamelen] != '#')) { - is_clone = B_TRUE; - } - - if (strchr(fromsnap, '@')) { - dsl_dataset_t *fromds; - err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); - if (err == 0) { - if (!dsl_dataset_is_before(ds, fromds, 0)) - err = SET_ERROR(EXDEV); - zb.zbm_creation_time = - dsl_dataset_phys(fromds)->ds_creation_time; - zb.zbm_creation_txg = - dsl_dataset_phys(fromds)->ds_creation_txg; - zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; - is_clone = (ds->ds_dir != fromds->ds_dir); - dsl_dataset_rele(fromds, FTAG); - } - } else { - err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); - } - if (err != 0) { - if (owned) - dsl_dataset_disown(ds, dsflags, FTAG); - else - dsl_dataset_rele_flags(ds, dsflags, FTAG); - - dsl_pool_rele(dp, FTAG); - return (err); - } - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, rawok, - outfd, resumeobj, resumeoff, vp, off); - } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, rawok, - outfd, resumeobj, resumeoff, vp, off); - } - if (owned) - dsl_dataset_disown(ds, dsflags, FTAG); - else - dsl_dataset_rele_flags(ds, dsflags, FTAG); - - return (err); -} - -static int -dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, - uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) -{ - int err = 0; - uint64_t size; - /* - * Assume that space (both on-disk and in-stream) is 
dominated by - * data. We will adjust for indirect blocks and the copies property, - * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). - */ - - uint64_t recordsize; - uint64_t record_count; - objset_t *os; - VERIFY0(dmu_objset_from_ds(ds, &os)); - - /* Assume all (uncompressed) blocks are recordsize. */ - if (zfs_override_estimate_recordsize != 0) { - recordsize = zfs_override_estimate_recordsize; - } else if (os->os_phys->os_type == DMU_OST_ZVOL) { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); - } else { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); - } - if (err != 0) - return (err); - record_count = uncompressed / recordsize; - - /* - * If we're estimating a send size for a compressed stream, use the - * compressed data size to estimate the stream size. Otherwise, use the - * uncompressed data size. - */ - size = stream_compressed ? compressed : uncompressed; - - /* - * Subtract out approximate space used by indirect blocks. - * Assume most space is used by data blocks (non-indirect, non-dnode). - * Assume no ditto blocks or internal fragmentation. - * - * Therefore, space used by indirect blocks is sizeof(blkptr_t) per - * block. - */ - size -= record_count * sizeof (blkptr_t); - - /* Add in the space for the record associated with each block. 
*/ - size += record_count * sizeof (dmu_replay_record_t); - - *sizep = size; - - return (0); -} - -int -dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, - boolean_t stream_compressed, uint64_t *sizep) -{ - int err; - uint64_t uncomp, comp; - - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - - /* tosnap must be a snapshot */ - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* fromsnap, if provided, must be a snapshot */ - if (fromds != NULL && !fromds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* - * fromsnap must be an earlier snapshot from the same fs as tosnap, - * or the origin's fs. - */ - if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) - return (SET_ERROR(EXDEV)); - - /* Get compressed and uncompressed size estimates of changed data. */ - if (fromds == NULL) { - uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; - comp = dsl_dataset_phys(ds)->ds_compressed_bytes; - } else { - uint64_t used; - err = dsl_dataset_space_written(fromds, ds, - &used, &comp, &uncomp); - if (err != 0) - return (err); - } - - err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, - stream_compressed, sizep); - /* - * Add the size of the BEGIN and END records to the estimate. - */ - *sizep += 2 * sizeof (dmu_replay_record_t); - return (err); -} - -struct calculate_send_arg { - uint64_t uncompressed; - uint64_t compressed; -}; - -/* - * Simple callback used to traverse the blocks of a snapshot and sum their - * uncompressed and compressed sizes. - */ -/* ARGSUSED */ -static int -dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct calculate_send_arg *space = arg; - if (bp != NULL && !BP_IS_HOLE(bp)) { - space->uncompressed += BP_GET_UCSIZE(bp); - space->compressed += BP_GET_PSIZE(bp); - } - return (0); -} - -/* - * Given a desination snapshot and a TXG, calculate the approximate size of a - * send stream sent from that TXG. 
from_txg may be zero, indicating that the - * whole snapshot will be sent. - */ -int -dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, - boolean_t stream_compressed, uint64_t *sizep) -{ - int err; - struct calculate_send_arg size = { 0 }; - - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - - /* tosnap must be a snapshot */ - if (!dsl_dataset_is_snapshot(ds)) - return (SET_ERROR(EINVAL)); - - /* verify that from_txg is before the provided snapshot was taken */ - if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { - return (SET_ERROR(EXDEV)); - } - /* - * traverse the blocks of the snapshot with birth times after - * from_txg, summing their uncompressed size - */ - err = traverse_dataset(ds, from_txg, - TRAVERSE_POST | TRAVERSE_NO_DECRYPT, - dmu_calculate_send_traversal, &size); - - if (err) - return (err); - - err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, - size.compressed, stream_compressed, sizep); - return (err); -} - -typedef struct dmu_recv_begin_arg { - const char *drba_origin; - dmu_recv_cookie_t *drba_cookie; - cred_t *drba_cred; - dsl_crypto_params_t *drba_dcp; - uint64_t drba_snapobj; -} dmu_recv_begin_arg_t; - -static int -recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, - uint64_t fromguid, uint64_t featureflags) -{ - uint64_t val; - int error; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; - boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; - boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0; - - /* temporary clone name must not exist */ - error = zap_lookup(dp->dp_meta_objset, - dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, - 8, 1, &val); - if (error != ENOENT) - return (error == 0 ? 
EBUSY : error); - - /* new snapshot name must not exist */ - error = zap_lookup(dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, - drba->drba_cookie->drc_tosnap, 8, 1, &val); - if (error != ENOENT) - return (error == 0 ? EEXIST : error); - - /* - * Check snapshot limit before receiving. We'll recheck again at the - * end, but might as well abort before receiving if we're already over - * the limit. - * - * Note that we do not check the file system limit with - * dsl_dir_fscount_check because the temporary %clones don't count - * against that limit. - */ - error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, - NULL, drba->drba_cred); - if (error != 0) - return (error); - - if (fromguid != 0) { - dsl_dataset_t *snap; - uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - - /* Can't perform a raw receive on top of a non-raw receive */ - if (!encrypted && raw) - return (SET_ERROR(EINVAL)); - - /* Encryption is incompatible with embedded data */ - if (encrypted && embed) - return (SET_ERROR(EINVAL)); - - /* Find snapshot in this dir that matches fromguid. */ - while (obj != 0) { - error = dsl_dataset_hold_obj(dp, obj, FTAG, - &snap); - if (error != 0) - return (SET_ERROR(ENODEV)); - if (snap->ds_dir != ds->ds_dir) { - dsl_dataset_rele(snap, FTAG); - return (SET_ERROR(ENODEV)); - } - if (dsl_dataset_phys(snap)->ds_guid == fromguid) - break; - obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); - } - if (obj == 0) - return (SET_ERROR(ENODEV)); - - if (drba->drba_cookie->drc_force) { - drba->drba_snapobj = obj; - } else { - /* - * If we are not forcing, there must be no - * changes since fromsnap. 
- */ - if (dsl_dataset_modified_since_snap(ds, snap)) { - dsl_dataset_rele(snap, FTAG); - return (SET_ERROR(ETXTBSY)); - } - drba->drba_snapobj = ds->ds_prev->ds_object; - } - - dsl_dataset_rele(snap, FTAG); - } else { - /* if full, then must be forced */ - if (!drba->drba_cookie->drc_force) - return (SET_ERROR(EEXIST)); - - /* - * We don't support using zfs recv -F to blow away - * encrypted filesystems. This would require the - * dsl dir to point to the old encryption key and - * the new one at the same time during the receive. - */ - if ((!encrypted && raw) || encrypted) - return (SET_ERROR(EINVAL)); - - /* - * Perform the same encryption checks we would if - * we were creating a new dataset from scratch. - */ - if (!raw) { - boolean_t will_encrypt; - - error = dmu_objset_create_crypt_check( - ds->ds_dir->dd_parent, drba->drba_dcp, - &will_encrypt); - if (error != 0) - return (error); - - if (will_encrypt && embed) - return (SET_ERROR(EINVAL)); - } - - drba->drba_snapobj = 0; - } - - return (0); - -} - -static int -dmu_recv_begin_check(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - uint64_t fromguid = drrb->drr_fromguid; - int flags = drrb->drr_flags; - ds_hold_flags_t dsflags = 0; - int error; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - dsl_dataset_t *ds; - const char *tofs = drba->drba_cookie->drc_tofs; - - /* already checked */ - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); - - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM || - drrb->drr_type >= DMU_OST_NUMTYPES || - ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) - return (SET_ERROR(EINVAL)); - - /* Verify pool version supports SA if SA_SPILL feature set */ - if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) - return 
(SET_ERROR(ENOTSUP)); - - if (drba->drba_cookie->drc_resumable && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plain WRITE record, so the pool must have the - * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED - * records. Same with WRITE_EMBEDDED records that use LZ4 compression. - */ - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate large blocks - * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. - */ - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) - return (SET_ERROR(ENOTSUP)); - - if (featureflags & DMU_BACKUP_FEATURE_RAW) { - /* raw receives require the encryption feature */ - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) - return (SET_ERROR(ENOTSUP)); - - /* embedded data is incompatible with encryption and raw recv */ - if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) - return (SET_ERROR(EINVAL)); - } else { - dsflags |= DS_HOLD_FLAG_DECRYPT; - } - - error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); - if (error == 0) { - /* target fs already exists; recv into temp clone */ - - /* Can't recv a clone into an existing fs */ - if (flags & DRR_FLAG_CLONE || drba->drba_origin) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EINVAL)); - } - - 
error = recv_begin_check_existing_impl(drba, ds, fromguid, - featureflags); - dsl_dataset_rele_flags(ds, dsflags, FTAG); - } else if (error == ENOENT) { - /* target fs does not exist; must be a full backup or clone */ - char buf[ZFS_MAX_DATASET_NAME_LEN]; - - /* - * If it's a non-clone incremental, we are missing the - * target fs, so fail the recv. - */ - if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || - drba->drba_origin)) - return (SET_ERROR(ENOENT)); - - /* - * If we're receiving a full send as a clone, and it doesn't - * contain all the necessary free records and freeobject - * records, reject it. - */ - if (fromguid == 0 && drba->drba_origin && - !(flags & DRR_FLAG_FREERECORDS)) - return (SET_ERROR(EINVAL)); - - /* Open the parent of tofs */ - ASSERT3U(strlen(tofs), <, sizeof (buf)); - (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); - error = dsl_dataset_hold_flags(dp, buf, dsflags, FTAG, &ds); - if (error != 0) - return (error); - - if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && - drba->drba_origin == NULL) { - boolean_t will_encrypt; - - /* - * Check that we aren't breaking any encryption rules - * and that we have all the parameters we need to - * create an encrypted dataset if necessary. If we are - * making an encrypted dataset the stream can't have - * embedded data. - */ - error = dmu_objset_create_crypt_check(ds->ds_dir, - drba->drba_dcp, &will_encrypt); - if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (error); - } - - if (will_encrypt && - (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EINVAL)); - } - } - - /* - * Check filesystem and snapshot limits before receiving. We'll - * recheck snapshot limits again at the end (we create the - * filesystems and increment those counts during begin_sync). 
- */ - error = dsl_fs_ss_limit_check(ds->ds_dir, 1, - ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); - if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (error); - } - - error = dsl_fs_ss_limit_check(ds->ds_dir, 1, - ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); - if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (error); - } - - if (drba->drba_origin != NULL) { - dsl_dataset_t *origin; - - error = dsl_dataset_hold_flags(dp, drba->drba_origin, - dsflags, FTAG, &origin); - if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (error); - } - if (!origin->ds_is_snapshot) { - dsl_dataset_rele_flags(origin, dsflags, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EINVAL)); - } - if (dsl_dataset_phys(origin)->ds_guid != fromguid && - fromguid != 0) { - dsl_dataset_rele_flags(origin, dsflags, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(ENODEV)); - } - if (origin->ds_dir->dd_crypto_obj != 0 && - (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { - dsl_dataset_rele_flags(origin, dsflags, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EINVAL)); - } - dsl_dataset_rele_flags(origin, - dsflags, FTAG); - } - dsl_dataset_rele_flags(ds, dsflags, FTAG); - error = 0; - } - return (error); -} - -static void -dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - const char *tofs = drba->drba_cookie->drc_tofs; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - dsl_dataset_t *ds, *newds; - objset_t *os; - uint64_t dsobj; - ds_hold_flags_t dsflags = 0; - int error; - uint64_t crflags = 0; - dsl_crypto_params_t dummy_dcp = { 0 }; - dsl_crypto_params_t *dcp = drba->drba_dcp; - - if (drrb->drr_flags & DRR_FLAG_CI_DATA) - crflags |= DS_FLAG_CI_DATASET; - - if 
((featureflags & DMU_BACKUP_FEATURE_RAW) == 0) - dsflags |= DS_HOLD_FLAG_DECRYPT; - - /* - * Raw, non-incremental recvs always use a dummy dcp with - * the raw cmd set. Raw incremental recvs do not use a dcp - * since the encryption parameters are already set in stone. - */ - if (dcp == NULL && drba->drba_snapobj == 0 && - drba->drba_origin == NULL) { - ASSERT3P(dcp, ==, NULL); - dcp = &dummy_dcp; - - if (featureflags & DMU_BACKUP_FEATURE_RAW) - dcp->cp_cmd = DCP_CMD_RAW_RECV; - } - - error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); - if (error == 0) { - /* create temporary clone */ - dsl_dataset_t *snap = NULL; - - if (drba->drba_snapobj != 0) { - VERIFY0(dsl_dataset_hold_obj(dp, - drba->drba_snapobj, FTAG, &snap)); - ASSERT3P(dcp, ==, NULL); - } - - dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, - snap, crflags, drba->drba_cred, dcp, tx); - if (drba->drba_snapobj != 0) - dsl_dataset_rele(snap, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); - } else { - dsl_dir_t *dd; - const char *tail; - dsl_dataset_t *origin = NULL; - - VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); - - if (drba->drba_origin != NULL) { - VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, - FTAG, &origin)); - ASSERT3P(dcp, ==, NULL); - } - - /* Create new dataset. 
*/ - dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, - origin, crflags, drba->drba_cred, dcp, tx); - if (origin != NULL) - dsl_dataset_rele(origin, FTAG); - dsl_dir_rele(dd, FTAG); - drba->drba_cookie->drc_newfs = B_TRUE; - } - - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &newds)); - VERIFY0(dmu_objset_from_ds(newds, &os)); - - if (drba->drba_cookie->drc_resumable) { - dsl_dataset_zapify(newds, tx); - if (drrb->drr_fromguid != 0) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, - 8, 1, &drrb->drr_fromguid, tx)); - } - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, - 8, 1, &drrb->drr_toguid, tx)); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, - 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); - uint64_t one = 1; - uint64_t zero = 0; - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, - 8, 1, &one, tx)); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, - 8, 1, &zero, tx)); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, - 8, 1, &zero, tx)); - if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, - 8, 1, &one, tx)); - } - if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, - 8, 1, &one, tx)); - } - if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, - 8, 1, &one, tx)); - } - if (featureflags & DMU_BACKUP_FEATURE_RAW) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK, - 8, 1, &one, tx)); - } - } - - /* - * Usually the os->os_encrypted value is tied to the presence of a - * DSL Crypto Key object in the dd. However, that will not be received - * until dmu_recv_stream(), so we set the value manually for now. 
- */ - if (featureflags & DMU_BACKUP_FEATURE_RAW) { - os->os_encrypted = B_TRUE; - drba->drba_cookie->drc_raw = B_TRUE; - } - - dmu_buf_will_dirty(newds->ds_dbuf, tx); - dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; - - /* - * If we actually created a non-clone, we need to create the objset - * in our new dataset. If this is a raw send we postpone this until - * dmu_recv_stream() so that we can allocate the metadnode with the - * properties from the DRR_BEGIN payload. - */ - rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); - if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && - (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) { - (void) dmu_objset_create_impl(dp->dp_spa, - newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); - } - rrw_exit(&newds->ds_bp_rwlock, FTAG); - - drba->drba_cookie->drc_ds = newds; - - spa_history_log_internal_ds(newds, "receive", tx, ""); -} - -static int -dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - int error; - ds_hold_flags_t dsflags = 0; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - dsl_dataset_t *ds; - const char *tofs = drba->drba_cookie->drc_tofs; - - /* already checked */ - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); - - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM || - drrb->drr_type >= DMU_OST_NUMTYPES) - return (SET_ERROR(EINVAL)); - - /* Verify pool version supports SA if SA_SPILL feature set */ - if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plain WRITE record, so the pool must have the - * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED - * records. 
Same with WRITE_EMBEDDED records that use LZ4 compression. - */ - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate large blocks - * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. - */ - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) - return (SET_ERROR(ENOTSUP)); - - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - (void) snprintf(recvname, sizeof (recvname), "%s/%s", - tofs, recv_clone_name); - - if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0) - dsflags |= DS_HOLD_FLAG_DECRYPT; - - if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { - /* %recv does not exist; continue in tofs */ - error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); - if (error != 0) - return (error); - } - - /* check that ds is marked inconsistent */ - if (!DS_IS_INCONSISTENT(ds)) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* check that there is resuming data, and that the toguid matches */ - if (!dsl_dataset_is_zapified(ds)) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EINVAL)); - } - uint64_t val; - error = zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); - if (error != 0 || drrb->drr_toguid != val) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* - * Check if the receive is 
still running. If so, it will be owned. - * Note that nothing else can own the dataset (e.g. after the receive - * fails) because it will be marked inconsistent. - */ - if (dsl_dataset_has_owner(ds)) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EBUSY)); - } - - /* There should not be any snapshots of this fs yet. */ - if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* - * Note: resume point will be checked when we process the first WRITE - * record. - */ - - /* check that the origin matches */ - val = 0; - (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); - if (drrb->drr_fromguid != val) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (SET_ERROR(EINVAL)); - } - - dsl_dataset_rele_flags(ds, dsflags, FTAG); - return (0); -} - -static void -dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - const char *tofs = drba->drba_cookie->drc_tofs; - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - dsl_dataset_t *ds; - objset_t *os; - ds_hold_flags_t dsflags = 0; - uint64_t dsobj; - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - - (void) snprintf(recvname, sizeof (recvname), "%s/%s", - tofs, recv_clone_name); - - if (featureflags & DMU_BACKUP_FEATURE_RAW) { - drba->drba_cookie->drc_raw = B_TRUE; - } else { - dsflags |= DS_HOLD_FLAG_DECRYPT; - } - - if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { - /* %recv does not exist; continue in tofs */ - VERIFY0(dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds)); - drba->drba_cookie->drc_newfs = B_TRUE; - } - - /* clear the inconsistent flag so that we can own it */ - ASSERT(DS_IS_INCONSISTENT(ds)); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - 
dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; - dsobj = ds->ds_object; - dsl_dataset_rele_flags(ds, dsflags, FTAG); - - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &ds)); - VERIFY0(dmu_objset_from_ds(ds, &os)); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; - - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || - drba->drba_cookie->drc_raw); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - drba->drba_cookie->drc_ds = ds; - - spa_history_log_internal_ds(ds, "resume receive", tx, ""); -} - -/* - * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() - * succeeds; otherwise we will leak the holds on the datasets. - */ -int -dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, - boolean_t force, boolean_t resumable, nvlist_t *localprops, - nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc) -{ - dmu_recv_begin_arg_t drba = { 0 }; - - bzero(drc, sizeof (dmu_recv_cookie_t)); - drc->drc_drr_begin = drr_begin; - drc->drc_drrb = &drr_begin->drr_u.drr_begin; - drc->drc_tosnap = tosnap; - drc->drc_tofs = tofs; - drc->drc_force = force; - drc->drc_resumable = resumable; - drc->drc_cred = CRED(); - drc->drc_clone = (origin != NULL); - - if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { - drc->drc_byteswap = B_TRUE; - (void) fletcher_4_incremental_byteswap(drr_begin, - sizeof (dmu_replay_record_t), &drc->drc_cksum); - byteswap_record(drr_begin); - } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { - (void) fletcher_4_incremental_native(drr_begin, - sizeof (dmu_replay_record_t), &drc->drc_cksum); - } else { - return (SET_ERROR(EINVAL)); - } - - drba.drba_origin = origin; - drba.drba_cookie = drc; - drba.drba_cred = CRED(); - - if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_RESUMING) { - return (dsl_sync_task(tofs, - dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, 
- &drba, 5, ZFS_SPACE_CHECK_NORMAL)); - } else { - int err; - - /* - * For non-raw, non-incremental, non-resuming receives the - * user can specify encryption parameters on the command line - * with "zfs recv -o". For these receives we create a dcp and - * pass it to the sync task. Creating the dcp will implicitly - * remove the encryption params from the localprops nvlist, - * which avoids errors when trying to set these normally - * read-only properties. Any other kind of receive that - * attempts to set these properties will fail as a result. - */ - if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_RAW) == 0 && - origin == NULL && drc->drc_drrb->drr_fromguid == 0) { - err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, - localprops, hidden_args, &drba.drba_dcp); - if (err != 0) - return (err); - } - - err = dsl_sync_task(tofs, - dmu_recv_begin_check, dmu_recv_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL); - dsl_crypto_params_free(drba.drba_dcp, !!err); - - return (err); - } -} - -struct receive_record_arg { - dmu_replay_record_t header; - void *payload; /* Pointer to a buffer containing the payload */ - /* - * If the record is a write, pointer to the arc_buf_t containing the - * payload. - */ - arc_buf_t *arc_buf; - int payload_size; - uint64_t bytes_read; /* bytes read from stream when record created */ - boolean_t eos_marker; /* Marks the end of the stream */ - bqueue_node_t node; -}; - -struct receive_writer_arg { - objset_t *os; - boolean_t byteswap; - bqueue_t q; - - /* - * These three args are used to signal to the main thread that we're - * done. - */ - kmutex_t mutex; - kcondvar_t cv; - boolean_t done; - - int err; - /* A map from guid to dataset to help handle dedup'd streams. 
*/ - avl_tree_t *guid_to_ds_map; - boolean_t resumable; - boolean_t raw; - uint64_t last_object; - uint64_t last_offset; - uint64_t max_object; /* highest object ID referenced in stream */ - uint64_t bytes_read; /* bytes read when current record created */ - - /* Encryption parameters for the last received DRR_OBJECT_RANGE */ - boolean_t or_crypt_params_present; - uint64_t or_firstobj; - uint64_t or_numslots; - uint8_t or_salt[ZIO_DATA_SALT_LEN]; - uint8_t or_iv[ZIO_DATA_IV_LEN]; - uint8_t or_mac[ZIO_DATA_MAC_LEN]; - boolean_t or_byteorder; -}; - -struct objlist { - list_t list; /* List of struct receive_objnode. */ - /* - * Last object looked up. Used to assert that objects are being looked - * up in ascending order. - */ - uint64_t last_lookup; -}; - -struct receive_objnode { - list_node_t node; - uint64_t object; -}; - -struct receive_arg { - objset_t *os; - vnode_t *vp; /* The vnode to read the stream from */ - uint64_t voff; /* The current offset in the stream */ - uint64_t bytes_read; - /* - * A record that has had its payload read in, but hasn't yet been handed - * off to the worker thread. - */ - struct receive_record_arg *rrd; - /* A record that has had its header read in, but not its payload. */ - struct receive_record_arg *next_rrd; - zio_cksum_t cksum; - zio_cksum_t prev_cksum; - int err; - boolean_t byteswap; - boolean_t raw; - uint64_t featureflags; - /* Sorted list of objects not to issue prefetches for. 
*/ - struct objlist ignore_objlist; -}; - -typedef struct guid_map_entry { - uint64_t guid; - boolean_t raw; - dsl_dataset_t *gme_ds; - avl_node_t avlnode; -} guid_map_entry_t; - -static int -guid_compare(const void *arg1, const void *arg2) -{ - const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; - const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; - - return (AVL_CMP(gmep1->guid, gmep2->guid)); -} - -static void -free_guid_map_onexit(void *arg) -{ - avl_tree_t *ca = arg; - void *cookie = NULL; - guid_map_entry_t *gmep; - - while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { - ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT; - - if (gmep->raw) { - gmep->gme_ds->ds_objset->os_raw_receive = B_FALSE; - dsflags &= ~DS_HOLD_FLAG_DECRYPT; - } - - dsl_dataset_disown(gmep->gme_ds, dsflags, gmep); - kmem_free(gmep, sizeof (guid_map_entry_t)); - } - avl_destroy(ca); - kmem_free(ca, sizeof (avl_tree_t)); -} - -static int -receive_read(struct receive_arg *ra, int len, void *buf) -{ - int done = 0; - - /* - * The code doesn't rely on this (lengths being multiples of 8). See - * comment in dump_bytes. - */ - ASSERT(len % 8 == 0 || - (ra->featureflags & DMU_BACKUP_FEATURE_RAW) != 0); - - while (done < len) { - ssize_t resid; - - ra->err = vn_rdwr(UIO_READ, ra->vp, - (char *)buf + done, len - done, - ra->voff, UIO_SYSSPACE, FAPPEND, - RLIM64_INFINITY, CRED(), &resid); - - if (resid == len - done) { - /* - * Note: ECKSUM indicates that the receive - * was interrupted and can potentially be resumed. 
- */ - ra->err = SET_ERROR(ECKSUM); - } - ra->voff += len - done - resid; - done = len - resid; - if (ra->err != 0) - return (ra->err); - } - - ra->bytes_read += len; - - ASSERT3U(done, ==, len); - return (0); -} - -noinline static void -byteswap_record(dmu_replay_record_t *drr) -{ -#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) -#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) - drr->drr_type = BSWAP_32(drr->drr_type); - drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); - - switch (drr->drr_type) { - case DRR_BEGIN: - DO64(drr_begin.drr_magic); - DO64(drr_begin.drr_versioninfo); - DO64(drr_begin.drr_creation_time); - DO32(drr_begin.drr_type); - DO32(drr_begin.drr_flags); - DO64(drr_begin.drr_toguid); - DO64(drr_begin.drr_fromguid); - break; - case DRR_OBJECT: - DO64(drr_object.drr_object); - DO32(drr_object.drr_type); - DO32(drr_object.drr_bonustype); - DO32(drr_object.drr_blksz); - DO32(drr_object.drr_bonuslen); - DO32(drr_object.drr_raw_bonuslen); - DO64(drr_object.drr_toguid); - DO64(drr_object.drr_maxblkid); - break; - case DRR_FREEOBJECTS: - DO64(drr_freeobjects.drr_firstobj); - DO64(drr_freeobjects.drr_numobjs); - DO64(drr_freeobjects.drr_toguid); - break; - case DRR_WRITE: - DO64(drr_write.drr_object); - DO32(drr_write.drr_type); - DO64(drr_write.drr_offset); - DO64(drr_write.drr_logical_size); - DO64(drr_write.drr_toguid); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); - DO64(drr_write.drr_key.ddk_prop); - DO64(drr_write.drr_compressed_size); - break; - case DRR_WRITE_BYREF: - DO64(drr_write_byref.drr_object); - DO64(drr_write_byref.drr_offset); - DO64(drr_write_byref.drr_length); - DO64(drr_write_byref.drr_toguid); - DO64(drr_write_byref.drr_refguid); - DO64(drr_write_byref.drr_refobject); - DO64(drr_write_byref.drr_refoffset); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. 
- drr_key.ddk_cksum); - DO64(drr_write_byref.drr_key.ddk_prop); - break; - case DRR_WRITE_EMBEDDED: - DO64(drr_write_embedded.drr_object); - DO64(drr_write_embedded.drr_offset); - DO64(drr_write_embedded.drr_length); - DO64(drr_write_embedded.drr_toguid); - DO32(drr_write_embedded.drr_lsize); - DO32(drr_write_embedded.drr_psize); - break; - case DRR_FREE: - DO64(drr_free.drr_object); - DO64(drr_free.drr_offset); - DO64(drr_free.drr_length); - DO64(drr_free.drr_toguid); - break; - case DRR_SPILL: - DO64(drr_spill.drr_object); - DO64(drr_spill.drr_length); - DO64(drr_spill.drr_toguid); - DO64(drr_spill.drr_compressed_size); - DO32(drr_spill.drr_type); - break; - case DRR_OBJECT_RANGE: - DO64(drr_object_range.drr_firstobj); - DO64(drr_object_range.drr_numslots); - DO64(drr_object_range.drr_toguid); - break; - case DRR_END: - DO64(drr_end.drr_toguid); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); - break; - default: - break; - } - - if (drr->drr_type != DRR_BEGIN) { - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); - } - -#undef DO64 -#undef DO32 -} - -static inline uint8_t -deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) -{ - if (bonus_type == DMU_OT_SA) { - return (1); - } else { - return (1 + - ((DN_OLD_MAX_BONUSLEN - - MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); - } -} - -static void -save_resume_state(struct receive_writer_arg *rwa, - uint64_t object, uint64_t offset, dmu_tx_t *tx) -{ - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; - - if (!rwa->resumable) - return; - - /* - * We use ds_resume_bytes[] != 0 to indicate that we need to - * update this on disk, so it must not be 0. - */ - ASSERT(rwa->bytes_read != 0); - - /* - * We only resume from write records, which have a valid - * (non-meta-dnode) object number. - */ - ASSERT(object != 0); - - /* - * For resuming to work correctly, we must receive records in order, - * sorted by object,offset. 
This is checked by the callers, but - * assert it here for good measure. - */ - ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); - ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || - offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); - ASSERT3U(rwa->bytes_read, >=, - rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); - - rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; - rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; - rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; -} - -noinline static int -receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, - void *data) -{ - dmu_object_info_t doi; - dmu_tx_t *tx; - uint64_t object; - int err; - uint8_t dn_slots = drro->drr_dn_slots != 0 ? - drro->drr_dn_slots : DNODE_MIN_SLOTS; - - if (drro->drr_type == DMU_OT_NONE || - !DMU_OT_IS_VALID(drro->drr_type) || - !DMU_OT_IS_VALID(drro->drr_bonustype) || - drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || - drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || - P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || - drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || - drro->drr_bonuslen > - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || - dn_slots > - (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { - return (SET_ERROR(EINVAL)); - } - - if (rwa->raw) { - /* - * We should have received a DRR_OBJECT_RANGE record - * containing this block and stored it in rwa. 
- */ - if (drro->drr_object < rwa->or_firstobj || - drro->drr_object >= rwa->or_firstobj + rwa->or_numslots || - drro->drr_raw_bonuslen < drro->drr_bonuslen || - drro->drr_indblkshift > SPA_MAXBLOCKSHIFT || - drro->drr_nlevels > DN_MAX_LEVELS || - drro->drr_nblkptr > DN_MAX_NBLKPTR || - DN_SLOTS_TO_BONUSLEN(dn_slots) < - drro->drr_raw_bonuslen) - return (SET_ERROR(EINVAL)); - } else { - if (drro->drr_flags != 0 || drro->drr_raw_bonuslen != 0 || - drro->drr_indblkshift != 0 || drro->drr_nlevels != 0 || - drro->drr_nblkptr != 0) - return (SET_ERROR(EINVAL)); - } - - err = dmu_object_info(rwa->os, drro->drr_object, &doi); - if (err != 0 && err != ENOENT && err != EEXIST) - return (SET_ERROR(EINVAL)); - - if (drro->drr_object > rwa->max_object) - rwa->max_object = drro->drr_object; - - /* - * If we are losing blkptrs or changing the block size this must - * be a new file instance. We must clear out the previous file - * contents before we can change this type of metadata in the dnode. - * Raw receives will also check that the indirect structure of the - * dnode hasn't changed. - */ - if (err == 0) { - uint32_t indblksz = drro->drr_indblkshift ? - 1ULL << drro->drr_indblkshift : 0; - int nblkptr = deduce_nblkptr(drro->drr_bonustype, - drro->drr_bonuslen); - - object = drro->drr_object; - - /* nblkptr will be bounded by the bonus size and type */ - if (rwa->raw && nblkptr != drro->drr_nblkptr) - return (SET_ERROR(EINVAL)); - - if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT || - (rwa->raw && - (indblksz != doi.doi_metadata_block_size || - drro->drr_nlevels < doi.doi_indirection))) { - err = dmu_free_long_range(rwa->os, - drro->drr_object, 0, DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - } - - /* - * The dmu does not currently support decreasing nlevels - * on an object. For non-raw sends, this does not matter - * and the new object can just use the previous one's nlevels. 
- * For raw sends, however, the structure of the received dnode - * (including nlevels) must match that of the send side. - * Therefore, instead of using dmu_object_reclaim(), we must - * free the object completely and call dmu_object_claim_dnsize() - * instead. - */ - if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { - err = dmu_free_long_object(rwa->os, drro->drr_object); - if (err != 0) - return (SET_ERROR(EINVAL)); - - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = DMU_NEW_OBJECT; - } - } else if (err == EEXIST) { - /* - * The object requested is currently an interior slot of a - * multi-slot dnode. This will be resolved when the next txg - * is synced out, since the send stream will have told us - * to free this slot when we freed the associated dnode - * earlier in the stream. - */ - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = drro->drr_object; - } else { - /* object is free and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; - } - - /* - * If this is a multi-slot dnode there is a chance that this - * object will expand into a slot that is already used by - * another object from the previous snapshot. We must free - * these objects before we attempt to allocate the new dnode. 
- */ - if (dn_slots > 1) { - boolean_t need_sync = B_FALSE; - - for (uint64_t slot = drro->drr_object + 1; - slot < drro->drr_object + dn_slots; - slot++) { - dmu_object_info_t slot_doi; - - err = dmu_object_info(rwa->os, slot, &slot_doi); - if (err == ENOENT || err == EEXIST) - continue; - else if (err != 0) - return (err); - - err = dmu_free_long_object(rwa->os, slot); - - if (err != 0) - return (err); - - need_sync = B_TRUE; - } - - if (need_sync) - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - } - - tx = dmu_tx_create(rwa->os); - dmu_tx_hold_bonus(tx, object); - dmu_tx_hold_write(tx, object, 0, 0); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - if (object == DMU_NEW_OBJECT) { - /* currently free, want to be allocated */ - err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, - dn_slots << DNODE_SHIFT, tx); - } else if (drro->drr_type != doi.doi_type || - drro->drr_blksz != doi.doi_data_block_size || - drro->drr_bonustype != doi.doi_bonus_type || - drro->drr_bonuslen != doi.doi_bonus_size) { - /* currently allocated, but with different properties */ - err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, - dn_slots << DNODE_SHIFT, tx); - } - if (err != 0) { - dmu_tx_commit(tx); - return (SET_ERROR(EINVAL)); - } - - if (rwa->or_crypt_params_present) { - /* - * Set the crypt params for the buffer associated with this - * range of dnodes. This causes the blkptr_t to have the - * same crypt params (byteorder, salt, iv, mac) as on the - * sending side. - * - * Since we are committing this tx now, it is possible for - * the dnode block to end up on-disk with the incorrect MAC, - * if subsequent objects in this block are received in a - * different txg. 
However, since the dataset is marked as - * inconsistent, no code paths will do a non-raw read (or - * decrypt the block / verify the MAC). The receive code and - * scrub code can safely do raw reads and verify the - * checksum. They don't need to verify the MAC. - */ - dmu_buf_t *db = NULL; - uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE; - - err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os), - offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT); - if (err != 0) { - dmu_tx_commit(tx); - return (SET_ERROR(EINVAL)); - } - - dmu_buf_set_crypt_params(db, rwa->or_byteorder, - rwa->or_salt, rwa->or_iv, rwa->or_mac, tx); - - dmu_buf_rele(db, FTAG); - - rwa->or_crypt_params_present = B_FALSE; - } - - dmu_object_set_checksum(rwa->os, drro->drr_object, - drro->drr_checksumtype, tx); - dmu_object_set_compress(rwa->os, drro->drr_object, - drro->drr_compress, tx); + uint64_t span = bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level); + uint64_t start; - /* handle more restrictive dnode structuring for raw recvs */ - if (rwa->raw) { - /* - * Set the indirect block shift and nlevels. This will not fail - * because we ensured all of the blocks were free earlier if - * this is a new object. - */ - VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, - drro->drr_blksz, drro->drr_indblkshift, tx)); - VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, - drro->drr_nlevels, tx)); - VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object, - drro->drr_maxblkid, tx)); + /* + * If this multiply overflows, we don't need to send this block. + * Even if it has a birth time, it can never not be a hole, so + * we don't need to send records for it. 
+ */ + if (!overflow_multiply(span, zb->zb_blkid, &start) || (!(zb->zb_blkid == + DMU_SPILL_BLKID || DMU_OT_IS_METADATA(dnp->dn_type)) && + span * zb->zb_blkid > dnp->dn_maxblkid)) { + ASSERT(BP_IS_HOLE(bp)); + return (0); } - if (data != NULL) { - dmu_buf_t *db; - uint32_t flags = DMU_READ_NO_PREFETCH; - - if (rwa->raw) - flags |= DMU_READ_NO_DECRYPT; + if (zb->zb_blkid == DMU_SPILL_BLKID) + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); - VERIFY0(dmu_bonus_hold_impl(rwa->os, drro->drr_object, - FTAG, flags, &db)); - dmu_buf_will_dirty(db, tx); + record = range_alloc(DATA, zb->zb_object, start, (start + span < start ? + 0 : start + span), B_FALSE); - ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro)); - - /* - * Raw bonus buffers have their byteorder determined by the - * DRR_OBJECT_RANGE record. - */ - if (rwa->byteswap && !rwa->raw) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drro->drr_bonustype); - dmu_ot_byteswap[byteswap].ob_func(db->db_data, - DRR_OBJECT_PAYLOAD_SIZE(drro)); - } - dmu_buf_rele(db, FTAG); + uint64_t datablksz = (zb->zb_blkid == DMU_SPILL_BLKID ? + BP_GET_LSIZE(bp) : dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + if (BP_IS_HOLE(bp)) { + record->type = HOLE; + record->sru.hole.datablksz = datablksz; + } else if (BP_IS_REDACTED(bp)) { + record->type = REDACT; + record->sru.redact.datablksz = datablksz; + } else { + record->type = DATA; + record->sru.data.datablksz = datablksz; + record->sru.data.obj_type = dnp->dn_type; + record->sru.data.bp = *bp; } - dmu_tx_commit(tx); - + bqueue_enqueue(&sta->q, record, sizeof (*record)); return (0); } -/* ARGSUSED */ -noinline static int -receive_freeobjects(struct receive_writer_arg *rwa, - struct drr_freeobjects *drrfo) -{ - uint64_t obj; - int next_err = 0; - - if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) - return (SET_ERROR(EINVAL)); - - for (obj = drrfo->drr_firstobj == 0 ? 
1 : drrfo->drr_firstobj; - obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; - next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { - dmu_object_info_t doi; - int err; - - err = dmu_object_info(rwa->os, obj, &doi); - if (err == ENOENT) - continue; - else if (err != 0) - return (err); - - err = dmu_free_long_object(rwa->os, obj); - - if (err != 0) - return (err); +struct redact_list_cb_arg { + uint64_t *num_blocks_visited; + bqueue_t *q; + boolean_t *cancel; + boolean_t mark_redact; +}; - if (obj > rwa->max_object) - rwa->max_object = obj; +static int +redact_list_cb(redact_block_phys_t *rb, void *arg) +{ + struct redact_list_cb_arg *rlcap = arg; + + atomic_inc_64(rlcap->num_blocks_visited); + if (*rlcap->cancel) + return (-1); + + struct send_range *data = range_alloc(REDACT, rb->rbp_object, + rb->rbp_blkid, rb->rbp_blkid + redact_block_get_count(rb), B_FALSE); + ASSERT3U(data->end_blkid, >, rb->rbp_blkid); + if (rlcap->mark_redact) { + data->type = REDACT; + data->sru.redact.datablksz = redact_block_get_size(rb); + } else { + data->type = PREVIOUSLY_REDACTED; } - if (next_err != ESRCH) - return (next_err); + bqueue_enqueue(rlcap->q, data, sizeof (*data)); + return (0); } -noinline static int -receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, - arc_buf_t *abuf) +/* + * This function kicks off the traverse_dataset. It also handles setting the + * error code of the thread in case something goes wrong, and pushes the End of + * Stream record when the traverse_dataset call has finished. If there is no + * dataset to traverse, then we traverse the redaction list provided and enqueue + * records for that. If neither is provided, the thread immediately pushes an + * End of Stream marker. 
+ */ +static void +send_traverse_thread(void *arg) { - int err; - dmu_tx_t *tx; - dnode_t *dn; - - if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || - !DMU_OT_IS_VALID(drrw->drr_type)) - return (SET_ERROR(EINVAL)); - - /* - * For resuming to work, records must be in increasing order - * by (object, offset). - */ - if (drrw->drr_object < rwa->last_object || - (drrw->drr_object == rwa->last_object && - drrw->drr_offset < rwa->last_offset)) { - return (SET_ERROR(EINVAL)); - } - rwa->last_object = drrw->drr_object; - rwa->last_offset = drrw->drr_offset; - - if (rwa->last_object > rwa->max_object) - rwa->max_object = rwa->last_object; - - if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_logical_size); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrw->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_WRITE_PAYLOAD_SIZE(drrw)); - } - - VERIFY0(dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn)); - dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx); - dnode_rele(dn, FTAG); - - /* - * Note: If the receive fails, we want the resume stream to start - * with the same record that we last successfully received (as opposed - * to the next record), so that we can verify that we are - * resuming from the correct location. 
- */ - save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); - dmu_tx_commit(tx); + struct send_thread_arg *st_arg = arg; + int err = 0; + struct send_range *data; + fstrans_cookie_t cookie = spl_fstrans_mark(); - return (0); + if (st_arg->ds != NULL) { + ASSERT3P(st_arg->redaction_list, ==, NULL); + err = traverse_dataset_resume(st_arg->ds, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); + } else if (st_arg->redaction_list != NULL) { + struct redact_list_cb_arg rlcba = {0}; + rlcba.cancel = &st_arg->cancel; + rlcba.num_blocks_visited = st_arg->num_blocks_visited; + rlcba.q = &st_arg->q; + rlcba.mark_redact = B_FALSE; + err = dsl_redaction_list_traverse(st_arg->redaction_list, + &st_arg->resume, redact_list_cb, &rlcba); + } + + if (err != EINTR) + st_arg->error_code = err; + data = range_alloc(DATA, 0, 0, 0, B_TRUE); + bqueue_enqueue_flush(&st_arg->q, data, sizeof (*data)); + spl_fstrans_unmark(cookie); + thread_exit(); } /* - * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed - * streams to refer to a copy of the data that is already on the - * system because it came in earlier in the stream. This function - * finds the earlier copy of the data, and uses that copy instead of - * data from the stream to fulfill this write. + * Utility function that causes End of Stream records to compare after of all + * others, so that other threads' comparison logic can stay simple. 
*/ static int -receive_write_byref(struct receive_writer_arg *rwa, - struct drr_write_byref *drrwbr) +send_range_after(const struct send_range *from, const struct send_range *to) { - dmu_tx_t *tx; - int err; - guid_map_entry_t gmesrch; - guid_map_entry_t *gmep; - avl_index_t where; - objset_t *ref_os = NULL; - int flags = DMU_READ_PREFETCH; - dmu_buf_t *dbp; - - if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) - return (SET_ERROR(EINVAL)); - - /* - * If the GUID of the referenced dataset is different from the - * GUID of the target dataset, find the referenced dataset. - */ - if (drrwbr->drr_toguid != drrwbr->drr_refguid) { - gmesrch.guid = drrwbr->drr_refguid; - if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, - &where)) == NULL) { - return (SET_ERROR(EINVAL)); - } - if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) - return (SET_ERROR(EINVAL)); - } else { - ref_os = rwa->os; - } - - if (drrwbr->drr_object > rwa->max_object) - rwa->max_object = drrwbr->drr_object; - - if (rwa->raw) - flags |= DMU_READ_NO_DECRYPT; - - /* may return either a regular db or an encrypted one */ - err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, - drrwbr->drr_refoffset, FTAG, &dbp, flags); - if (err != 0) - return (err); - - tx = dmu_tx_create(rwa->os); - - dmu_tx_hold_write(tx, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - if (rwa->raw) { - dmu_copy_from_buf(rwa->os, drrwbr->drr_object, - drrwbr->drr_offset, dbp, tx); - } else { - dmu_write(rwa->os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); - } - dmu_buf_rele(dbp, FTAG); - - /* See comment in restore_write. 
*/ - save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); - dmu_tx_commit(tx); + if (from->eos_marker == B_TRUE) + return (1); + if (to->eos_marker == B_TRUE) + return (-1); + + uint64_t from_obj = from->object; + uint64_t from_end_obj = from->object + 1; + uint64_t to_obj = to->object; + uint64_t to_end_obj = to->object + 1; + if (from_obj == 0) { + ASSERT(from->type == HOLE || from->type == OBJECT_RANGE); + from_obj = from->start_blkid << DNODES_PER_BLOCK_SHIFT; + from_end_obj = from->end_blkid << DNODES_PER_BLOCK_SHIFT; + } + if (to_obj == 0) { + ASSERT(to->type == HOLE || to->type == OBJECT_RANGE); + to_obj = to->start_blkid << DNODES_PER_BLOCK_SHIFT; + to_end_obj = to->end_blkid << DNODES_PER_BLOCK_SHIFT; + } + + if (from_end_obj <= to_obj) + return (-1); + if (from_obj >= to_end_obj) + return (1); + if (from->type == OBJECT_RANGE && to->type != OBJECT_RANGE) + return (-1); + if (from->type != OBJECT_RANGE && to->type == OBJECT_RANGE) + return (1); + if (from->type == OBJECT && to->type != OBJECT) + return (-1); + if (from->type != OBJECT && to->type == OBJECT) + return (1); + if (from->end_blkid <= to->start_blkid) + return (-1); + if (from->start_blkid >= to->end_blkid) + return (1); return (0); } -static int -receive_write_embedded(struct receive_writer_arg *rwa, - struct drr_write_embedded *drrwe, void *data) +/* + * Pop the new data off the queue, check that the records we receive are in + * the right order, but do not free the old data. This is used so that the + * records can be sent on to the main thread without copying the data. 
+ */ +static struct send_range * +get_next_range_nofree(bqueue_t *bq, struct send_range *prev) { - dmu_tx_t *tx; - int err; - - if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) - return (SET_ERROR(EINVAL)); - - if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) - return (SET_ERROR(EINVAL)); - - if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) - return (SET_ERROR(EINVAL)); - if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) - return (SET_ERROR(EINVAL)); - if (rwa->raw) - return (SET_ERROR(EINVAL)); - - if (drrwe->drr_object > rwa->max_object) - rwa->max_object = drrwe->drr_object; + struct send_range *next = bqueue_dequeue(bq); + ASSERTV(send_range_after(prev, next) == -1); + return (next); +} - tx = dmu_tx_create(rwa->os); +/* + * Pop the new data off the queue, check that the records we receive are in + * the right order, and free the old data. + */ +static struct send_range * +get_next_range(bqueue_t *bq, struct send_range *prev) +{ + struct send_range *next = get_next_range_nofree(bq, prev); + range_free(prev); + return (next); +} - dmu_tx_hold_write(tx, drrwe->drr_object, - drrwe->drr_offset, drrwe->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); +static void +redact_list_thread(void *arg) +{ + struct redact_list_thread_arg *rlt_arg = arg; + struct send_range *record; + fstrans_cookie_t cookie = spl_fstrans_mark(); + if (rlt_arg->rl != NULL) { + struct redact_list_cb_arg rlcba = {0}; + rlcba.cancel = &rlt_arg->cancel; + rlcba.q = &rlt_arg->q; + rlcba.num_blocks_visited = rlt_arg->num_blocks_visited; + rlcba.mark_redact = rlt_arg->mark_redact; + int err = dsl_redaction_list_traverse(rlt_arg->rl, + &rlt_arg->resume, redact_list_cb, &rlcba); + if (err != EINTR) + rlt_arg->error_code = err; } - - dmu_write_embedded(rwa->os, drrwe->drr_object, - drrwe->drr_offset, data, drrwe->drr_etype, - drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, - rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); - - /* See 
comment in restore_write. */ - save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); - dmu_tx_commit(tx); - return (0); + record = range_alloc(DATA, 0, 0, 0, B_TRUE); + bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record)); + spl_fstrans_unmark(cookie); } +/* + * Compare the start point of the two provided ranges. End of stream ranges + * compare last, objects compare before any data or hole inside that object and + * multi-object holes that start at the same object. + */ static int -receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, - arc_buf_t *abuf) +send_range_start_compare(struct send_range *r1, struct send_range *r2) { - dmu_tx_t *tx; - dmu_buf_t *db, *db_spill; - int err; - uint32_t flags = 0; - - if (drrs->drr_length < SPA_MINBLOCKSIZE || - drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) - return (SET_ERROR(EINVAL)); - - if (rwa->raw) { - if (!DMU_OT_IS_VALID(drrs->drr_type) || - drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS || - drrs->drr_compressed_size == 0) - return (SET_ERROR(EINVAL)); - - flags |= DMU_READ_NO_DECRYPT; - } - - if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - if (drrs->drr_object > rwa->max_object) - rwa->max_object = drrs->drr_object; - - VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); - if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG, - &db_spill)) != 0) { - dmu_buf_rele(db, FTAG); - return (err); - } - - tx = dmu_tx_create(rwa->os); - - dmu_tx_hold_spill(tx, db->db_object); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_buf_rele(db, FTAG); - dmu_buf_rele(db_spill, FTAG); - dmu_tx_abort(tx); - return (err); + uint64_t r1_objequiv = r1->object; + uint64_t r1_l0equiv = r1->start_blkid; + uint64_t r2_objequiv = r2->object; + uint64_t r2_l0equiv = r2->start_blkid; + if (r1->eos_marker) + return (1); + if (r2->eos_marker) + return (-1); + if (r1->object == 0) { + r1_objequiv = r1->start_blkid 
* DNODES_PER_BLOCK; + r1_l0equiv = 0; } - - if (db_spill->db_size < drrs->drr_length) - VERIFY(0 == dbuf_spill_set_blksz(db_spill, - drrs->drr_length, tx)); - - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrs->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_SPILL_PAYLOAD_SIZE(drrs)); + if (r2->object == 0) { + r2_objequiv = r2->start_blkid * DNODES_PER_BLOCK; + r2_l0equiv = 0; } - dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); - - dmu_buf_rele(db, FTAG); - dmu_buf_rele(db_spill, FTAG); - - dmu_tx_commit(tx); + if (r1_objequiv < r2_objequiv) + return (-1); + if (r1_objequiv > r2_objequiv) + return (1); + if (r1->type == OBJECT_RANGE && r2->type != OBJECT_RANGE) + return (-1); + if (r1->type != OBJECT_RANGE && r2->type == OBJECT_RANGE) + return (1); + if (r1->type == OBJECT && r2->type != OBJECT) + return (-1); + if (r1->type != OBJECT && r2->type == OBJECT) + return (1); + if (r1_l0equiv < r2_l0equiv) + return (-1); + if (r1_l0equiv > r2_l0equiv) + return (1); return (0); } -/* ARGSUSED */ -noinline static int -receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) -{ - int err; - - if (drrf->drr_length != DMU_OBJECT_END && - drrf->drr_offset + drrf->drr_length < drrf->drr_offset) - return (SET_ERROR(EINVAL)); - - if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - if (drrf->drr_object > rwa->max_object) - rwa->max_object = drrf->drr_object; - - err = dmu_free_long_range(rwa->os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length); - - return (err); -} +enum q_idx { + REDACT_IDX = 0, + TO_IDX, + FROM_IDX, + NUM_THREADS +}; -static int -receive_object_range(struct receive_writer_arg *rwa, - struct drr_object_range *drror) +/* + * This function returns the next range the send_merge_thread should operate on. 
+ * The inputs are two arrays; the first one stores the range at the front of the + * queues stored in the second one. The ranges are sorted in descending + * priority order; the metadata from earlier ranges overrules metadata from + * later ranges. out_mask is used to return which threads the ranges came from; + * bit i is set if ranges[i] started at the same place as the returned range. + * + * This code is not hardcoded to compare a specific number of threads; it could + * be used with any number, just by changing the q_idx enum. + * + * The "next range" is the one with the earliest start; if two starts are equal, + * the highest-priority range is the next to operate on. If a higher-priority + * range starts in the middle of the first range, then the first range will be + * truncated to end where the higher-priority range starts, and we will operate + * on that one next time. In this way, we make sure that each block covered by + * some range gets covered by a returned range, and each block covered is + * returned using the metadata of the highest-priority range it appears in. + * + * For example, if the three ranges at the front of the queues were [2,4), + * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata + * from the third range, [2,4) with the metadata from the first range, and then + * [4,5) with the metadata from the second. + */ +static struct send_range * +find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask) { + int idx = 0; // index of the range with the earliest start + int i; + uint64_t bmask = 0; + for (i = 1; i < NUM_THREADS; i++) { + if (send_range_start_compare(ranges[i], ranges[idx]) < 0) + idx = i; + } + if (ranges[idx]->eos_marker) { + struct send_range *ret = range_alloc(DATA, 0, 0, 0, B_TRUE); + *out_mask = 0; + return (ret); + } /* - * By default, we assume this block is in our native format - * (ZFS_HOST_BYTEORDER). 
We then take into account whether - * the send stream is byteswapped (rwa->byteswap). Finally, - * we need to byteswap again if this particular block was - * in non-native format on the send side. + * Find all the ranges that start at that same point. */ - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^ - !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags); - + for (i = 0; i < NUM_THREADS; i++) { + if (send_range_start_compare(ranges[i], ranges[idx]) == 0) + bmask |= 1 << i; + } + *out_mask = bmask; /* - * Since dnode block sizes are constant, we should not need to worry - * about making sure that the dnode block size is the same on the - * sending and receiving sides for the time being. For non-raw sends, - * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE - * record at all). Raw sends require this record type because the - * encryption parameters are used to protect an entire block of bonus - * buffers. If the size of dnode blocks ever becomes variable, - * handling will need to be added to ensure that dnode block sizes - * match on the sending and receiving side. + * OBJECT_RANGE records only come from the TO thread, and should always + * be treated as overlapping with nothing and sent on immediately. They + * are only used in raw sends, and are never redacted. */ - if (drror->drr_numslots != DNODES_PER_BLOCK || - P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 || - !rwa->raw) - return (SET_ERROR(EINVAL)); - - if (drror->drr_firstobj > rwa->max_object) - rwa->max_object = drror->drr_firstobj; + if (ranges[idx]->type == OBJECT_RANGE) { + ASSERT3U(idx, ==, TO_IDX); + ASSERT3U(*out_mask, ==, 1 << TO_IDX); + struct send_range *ret = ranges[idx]; + ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]); + return (ret); + } + /* + * Find the first start or end point after the start of the first range. 
+ */ + uint64_t first_change = ranges[idx]->end_blkid; + for (i = 0; i < NUM_THREADS; i++) { + if (i == idx || ranges[i]->eos_marker || + ranges[i]->object > ranges[idx]->object || + ranges[i]->object == DMU_META_DNODE_OBJECT) + continue; + ASSERT3U(ranges[i]->object, ==, ranges[idx]->object); + if (first_change > ranges[i]->start_blkid && + (bmask & (1 << i)) == 0) + first_change = ranges[i]->start_blkid; + else if (first_change > ranges[i]->end_blkid) + first_change = ranges[i]->end_blkid; + } + /* + * Update all ranges to no longer overlap with the range we're + * returning. All such ranges must start at the same place as the range + * being returned, and end at or after first_change. Thus we update + * their start to first_change. If that makes them size 0, then free + * them and pull a new range from that thread. + */ + for (i = 0; i < NUM_THREADS; i++) { + if (i == idx || (bmask & (1 << i)) == 0) + continue; + ASSERT3U(first_change, >, ranges[i]->start_blkid); + ranges[i]->start_blkid = first_change; + ASSERT3U(ranges[i]->start_blkid, <=, ranges[i]->end_blkid); + if (ranges[i]->start_blkid == ranges[i]->end_blkid) + ranges[i] = get_next_range(qs[i], ranges[i]); + } + /* + * Short-circuit the simple case; if the range doesn't overlap with + * anything else, or it only overlaps with things that start at the same + * place and are longer, send it on. + */ + if (first_change == ranges[idx]->end_blkid) { + struct send_range *ret = ranges[idx]; + ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]); + return (ret); + } /* - * The DRR_OBJECT_RANGE handling must be deferred to receive_object() - * so that the block of dnodes is not written out when it's empty, - * and converted to a HOLE BP. + * Otherwise, return a truncated copy of ranges[idx] and move the start + * of ranges[idx] back to first_change. 
*/ - rwa->or_crypt_params_present = B_TRUE; - rwa->or_firstobj = drror->drr_firstobj; - rwa->or_numslots = drror->drr_numslots; - bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN); - bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN); - bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN); - rwa->or_byteorder = byteorder; + struct send_range *ret = kmem_alloc(sizeof (*ret), KM_SLEEP); + *ret = *ranges[idx]; + ret->end_blkid = first_change; + ranges[idx]->start_blkid = first_change; + return (ret); +} - return (0); +#define FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX)) + +/* + * Merge the results from the from thread and the to thread, and then hand the + * records off to send_prefetch_thread to prefetch them. If this is not a + * send from a redaction bookmark, the from thread will push an end of stream + * record and stop, and we'll just send everything that was changed in the + * to_ds since the ancestor's creation txg. If it is, then since + * traverse_dataset has a canonical order, we can compare each change as + * they're pulled off the queues. That will give us a stream that is + * appropriately sorted, and covers all records. In addition, we pull the + * data from the redact_list_thread and use that to determine which blocks + * should be redacted. 
+ */ +static void +send_merge_thread(void *arg) +{ + struct send_merge_thread_arg *smt_arg = arg; + struct send_range *front_ranges[NUM_THREADS]; + bqueue_t *queues[NUM_THREADS]; + int err = 0; + fstrans_cookie_t cookie = spl_fstrans_mark(); + + if (smt_arg->redact_arg == NULL) { + front_ranges[REDACT_IDX] = + kmem_zalloc(sizeof (struct send_range), KM_SLEEP); + front_ranges[REDACT_IDX]->eos_marker = B_TRUE; + front_ranges[REDACT_IDX]->type = REDACT; + queues[REDACT_IDX] = NULL; + } else { + front_ranges[REDACT_IDX] = + bqueue_dequeue(&smt_arg->redact_arg->q); + queues[REDACT_IDX] = &smt_arg->redact_arg->q; + } + front_ranges[TO_IDX] = bqueue_dequeue(&smt_arg->to_arg->q); + queues[TO_IDX] = &smt_arg->to_arg->q; + front_ranges[FROM_IDX] = bqueue_dequeue(&smt_arg->from_arg->q); + queues[FROM_IDX] = &smt_arg->from_arg->q; + uint64_t mask = 0; + struct send_range *range; + for (range = find_next_range(front_ranges, queues, &mask); + !range->eos_marker && err == 0 && !smt_arg->cancel; + range = find_next_range(front_ranges, queues, &mask)) { + /* + * If the range in question was in both the from redact bookmark + * and the bookmark we're using to redact, then don't send it. + * It's already redacted on the receiving system, so a redaction + * record would be redundant. 
+ */ + if ((mask & FROM_AND_REDACT_BITS) == FROM_AND_REDACT_BITS) { + ASSERT3U(range->type, ==, REDACT); + range_free(range); + continue; + } + bqueue_enqueue(&smt_arg->q, range, sizeof (*range)); + + if (smt_arg->to_arg->error_code != 0) { + err = smt_arg->to_arg->error_code; + } else if (smt_arg->from_arg->error_code != 0) { + err = smt_arg->from_arg->error_code; + } else if (smt_arg->redact_arg != NULL && + smt_arg->redact_arg->error_code != 0) { + err = smt_arg->redact_arg->error_code; + } + } + if (smt_arg->cancel && err == 0) + err = SET_ERROR(EINTR); + smt_arg->error = err; + if (smt_arg->error != 0) { + smt_arg->to_arg->cancel = B_TRUE; + smt_arg->from_arg->cancel = B_TRUE; + if (smt_arg->redact_arg != NULL) + smt_arg->redact_arg->cancel = B_TRUE; + } + for (int i = 0; i < NUM_THREADS; i++) { + while (!front_ranges[i]->eos_marker) { + front_ranges[i] = get_next_range(queues[i], + front_ranges[i]); + } + range_free(front_ranges[i]); + } + if (range == NULL) + range = kmem_zalloc(sizeof (*range), KM_SLEEP); + range->eos_marker = B_TRUE; + bqueue_enqueue_flush(&smt_arg->q, range, 1); + spl_fstrans_unmark(cookie); + thread_exit(); } -/* used to destroy the drc_ds on error */ +struct send_prefetch_thread_arg { + struct send_merge_thread_arg *smta; + bqueue_t q; + boolean_t cancel; + boolean_t issue_prefetches; + int error; +}; + +/* + * Create a new record with the given values. If the record is of a type that + * can be coalesced, and if it can be coalesced with the previous record, then + * coalesce those and don't push anything out. If either of those are not true, + * we push out the pending record and create a new one out of the current + * record. + */ static void -dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) +enqueue_range(struct send_prefetch_thread_arg *spta, bqueue_t *q, dnode_t *dn, + uint64_t blkid, blkptr_t *bp, uint32_t datablksz, struct send_range **pendp) { - dsl_dataset_t *ds = drc->drc_ds; - ds_hold_flags_t dsflags = (drc->drc_raw) ? 
0 : DS_HOLD_FLAG_DECRYPT; + struct send_range *pending = *pendp; + enum type pending_type = (pending == NULL ? PREVIOUSLY_REDACTED : + pending->type); + enum type new_type = (BP_IS_HOLE(bp) ? HOLE : + (BP_IS_REDACTED(bp) ? REDACT : DATA)); + + if (pending_type == new_type) { + pending->end_blkid = blkid; + return; + } + if (pending_type != PREVIOUSLY_REDACTED) { + bqueue_enqueue(q, pending, sizeof (*pending)); + pending = NULL; + } + ASSERT3P(pending, ==, NULL); + pending = range_alloc(new_type, dn->dn_object, blkid, blkid + 1, + B_FALSE); - /* - * Wait for the txg sync before cleaning up the receive. For - * resumable receives, this ensures that our resume state has - * been written out to disk. For raw receives, this ensures - * that the user accounting code will not attempt to do anything - * after we stopped receiving the dataset. - */ - txg_wait_synced(ds->ds_dir->dd_pool, 0); - ds->ds_objset->os_raw_receive = B_FALSE; + if (blkid == DMU_SPILL_BLKID) + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - if (drc->drc_resumable && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { - rrw_exit(&ds->ds_bp_rwlock, FTAG); - dsl_dataset_disown(ds, dsflags, dmu_recv_tag); - } else { - char name[ZFS_MAX_DATASET_NAME_LEN]; - rrw_exit(&ds->ds_bp_rwlock, FTAG); - dsl_dataset_name(ds, name); - dsl_dataset_disown(ds, dsflags, dmu_recv_tag); - (void) dsl_destroy_head(name); + switch (new_type) { + case HOLE: + pending->sru.hole.datablksz = datablksz; + break; + case DATA: + pending->sru.data.datablksz = datablksz; + pending->sru.data.obj_type = dn->dn_type; + pending->sru.data.bp = *bp; + if (spta->issue_prefetches) { + zbookmark_phys_t zb = {0}; + zb.zb_objset = dmu_objset_id(dn->dn_objset); + zb.zb_object = dn->dn_object; + zb.zb_level = 0; + zb.zb_blkid = blkid; + arc_flags_t aflags = ARC_FLAG_NOWAIT | + ARC_FLAG_PREFETCH; + (void) arc_read(NULL, dn->dn_objset->os_spa, bp, NULL, + NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | + 
ZIO_FLAG_SPECULATIVE, &aflags, &zb); + } + bqueue_enqueue(q, pending, datablksz); + pending = NULL; + break; + case REDACT: + pending->sru.redact.datablksz = datablksz; + break; + default: + break; } + *pendp = pending; } +/* + * This thread is responsible for two things: First, it retrieves the correct + * blkptr in the to ds if we need to send the data because of something from + * the from thread. As a result of this, we're the first ones to discover that + * some indirect blocks can be discarded because they're not holes. Second, + * it issues prefetches for the data we need to send. + */ static void -receive_cksum(struct receive_arg *ra, int len, void *buf) +send_prefetch_thread(void *arg) { - if (ra->byteswap) { - (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); - } else { - (void) fletcher_4_incremental_native(buf, len, &ra->cksum); + struct send_prefetch_thread_arg *spta = arg; + struct send_merge_thread_arg *smta = spta->smta; + bqueue_t *inq = &smta->q; + bqueue_t *outq = &spta->q; + objset_t *os = smta->os; + fstrans_cookie_t cookie = spl_fstrans_mark(); + struct send_range *range = bqueue_dequeue(inq); + int err = 0; + + /* + * If the record we're analyzing is from a redaction bookmark from the + * fromds, then we need to know whether or not it exists in the tods so + * we know whether to create records for it or not. If it does, we need + * the datablksz so we can generate an appropriate record for it. + * Finally, if it isn't redacted, we need the blkptr so that we can send + * a WRITE record containing the actual data. 
+ */ + uint64_t last_obj = UINT64_MAX; + uint64_t last_obj_exists = B_TRUE; + while (!range->eos_marker && !spta->cancel && smta->error == 0) { + switch (range->type) { + case DATA: { + zbookmark_phys_t zb; + zb.zb_objset = dmu_objset_id(os); + zb.zb_object = range->object; + zb.zb_level = 0; + zb.zb_blkid = range->start_blkid; + ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); + if (!BP_IS_REDACTED(&range->sru.data.bp) && + spta->issue_prefetches && + !BP_IS_EMBEDDED(&range->sru.data.bp)) { + arc_flags_t aflags = ARC_FLAG_NOWAIT | + ARC_FLAG_PREFETCH; + (void) arc_read(NULL, os->os_spa, + &range->sru.data.bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE, &aflags, &zb); + } + bqueue_enqueue(outq, range, range->sru.data.datablksz); + range = get_next_range_nofree(inq, range); + break; + } + case HOLE: + case OBJECT: + case OBJECT_RANGE: + case REDACT: // Redacted blocks must exist + bqueue_enqueue(outq, range, sizeof (*range)); + range = get_next_range_nofree(inq, range); + break; + case PREVIOUSLY_REDACTED: { + /* + * This entry came from the "from bookmark" when + * sending from a bookmark that has a redaction + * list. We need to check if this object/blkid + * exists in the target ("to") dataset, and if + * not then we drop this entry. We also need + * to fill in the block pointer so that we know + * what to prefetch. + * + * To accomplish the above, we first cache whether or + * not the last object we examined exists. If it + * doesn't, we can drop this record. If it does, we hold + * the dnode and use it to call dbuf_dnode_findbp. We do + * this instead of dbuf_bookmark_findbp because we will + * often operate on large ranges, and holding the dnode + * once is more efficient. + */ + boolean_t object_exists = B_TRUE; + /* + * If the data is redacted, we only care if it exists, + * so that we don't send records for objects that have + * been deleted. 
+ */ + dnode_t *dn; + if (range->object == last_obj && !last_obj_exists) { + /* + * If we're still examining the same object as + * previously, and it doesn't exist, we don't + * need to call dbuf_bookmark_findbp. + */ + object_exists = B_FALSE; + } else { + err = dnode_hold(os, range->object, FTAG, &dn); + if (err == ENOENT) { + object_exists = B_FALSE; + err = 0; + } + last_obj = range->object; + last_obj_exists = object_exists; + } + + if (err != 0) { + break; + } else if (!object_exists) { + /* + * The block was modified, but doesn't + * exist in the to dataset; if it was + * deleted in the to dataset, then we'll + * visit the hole bp for it at some point. + */ + range = get_next_range(inq, range); + continue; + } + struct send_range *pending = NULL; + uint64_t file_max = + (dn->dn_maxblkid < range->end_blkid ? + dn->dn_maxblkid : range->end_blkid); + /* + * The object exists, so we need to try to find the + * blkptr for each block in the range we're processing. + */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + for (uint64_t blkid = range->start_blkid; + blkid < file_max; blkid++) { + uint16_t datablkszsec; + blkptr_t bp; + err = dbuf_dnode_findbp(dn, 0, blkid, &bp, + &datablkszsec, NULL); + if (err != 0) + break; + enqueue_range(spta, outq, dn, blkid, &bp, + datablkszsec << SPA_MINBLOCKSHIFT, + &pending); + } + if (pending != NULL) { + bqueue_enqueue(outq, pending, + sizeof (*pending)); + } + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + range = get_next_range(inq, range); + } + } + } + if (spta->cancel || err != 0) { + smta->cancel = B_TRUE; + spta->error = err; + } else if (smta->error != 0) { + spta->error = smta->error; } + while (!range->eos_marker) + range = get_next_range(inq, range); + + bqueue_enqueue_flush(outq, range, 1); + spl_fstrans_unmark(cookie); + thread_exit(); } -/* - * Read the payload into a buffer of size len, and update the current record's - * payload field. 
- * Allocate ra->next_rrd and read the next record's header into - * ra->next_rrd->header. - * Verify checksum of payload and next record. - */ +#define NUM_SNAPS_NOT_REDACTED UINT64_MAX + +struct dmu_send_params { + /* Pool args */ + void *tag; // Tag that dp was held with, will be used to release dp. + dsl_pool_t *dp; + /* To snapshot args */ + const char *tosnap; + dsl_dataset_t *to_ds; + /* From snapshot args */ + zfs_bookmark_phys_t ancestor_zb; + uint64_t *fromredactsnaps; + /* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */ + uint64_t numfromredactsnaps; + /* Stream params */ + boolean_t is_clone; + boolean_t embedok; + boolean_t large_block_ok; + boolean_t compressok; + uint64_t resumeobj; + uint64_t resumeoff; + zfs_bookmark_phys_t *redactbook; + /* Stream output params */ + dmu_send_outparams_t *dso; + + /* Stream progress params */ + offset_t *off; + int outfd; + boolean_t rawok; +}; + static int -receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) +setup_featureflags(struct dmu_send_params *dspp, objset_t *os, + uint64_t *featureflags) { - int err; - zio_cksum_t cksum_orig; - zio_cksum_t *cksump; + dsl_dataset_t *to_ds = dspp->to_ds; + dsl_pool_t *dp = dspp->dp; +#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { + uint64_t version; + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) + return (SET_ERROR(EINVAL)); - if (len != 0) { - ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); - err = receive_read(ra, len, buf); - if (err != 0) - return (err); - receive_cksum(ra, len, buf); + if (version >= ZPL_VERSION_SA) + *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } +#endif - /* note: rrd is NULL when reading the begin record's payload */ - if (ra->rrd != NULL) { - ra->rrd->payload = buf; - ra->rrd->payload_size = len; - ra->rrd->bytes_read = ra->bytes_read; - } + /* raw sends imply large_block_ok */ + if ((dspp->rawok || dspp->large_block_ok) && + dsl_dataset_feature_is_active(to_ds, 
SPA_FEATURE_LARGE_BLOCKS)) { + *featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; } - ra->prev_cksum = ra->cksum; + /* encrypted datasets will not have embedded blocks */ + if ((dspp->embedok || dspp->rawok) && !os->os_encrypted && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { + *featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; + } - ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); - err = receive_read(ra, sizeof (ra->next_rrd->header), - &ra->next_rrd->header); - ra->next_rrd->bytes_read = ra->bytes_read; + /* raw send implies compressok */ + if (dspp->compressok || dspp->rawok) + *featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; + if (dspp->rawok && os->os_encrypted) + *featureflags |= DMU_BACKUP_FEATURE_RAW; - if (err != 0) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; - return (err); + if ((*featureflags & + (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED | + DMU_BACKUP_FEATURE_RAW)) != 0 && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { + *featureflags |= DMU_BACKUP_FEATURE_LZ4; } - if (ra->next_rrd->header.drr_type == DRR_BEGIN) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; - return (SET_ERROR(EINVAL)); + + if (dspp->resumeobj != 0 || dspp->resumeoff != 0) { + *featureflags |= DMU_BACKUP_FEATURE_RESUMING; } - /* - * Note: checksum is of everything up to but not including the - * checksum itself. 
- */ - ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - receive_cksum(ra, - offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - &ra->next_rrd->header); + if (dspp->redactbook != NULL) { + *featureflags |= DMU_BACKUP_FEATURE_REDACTED; + } + + if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) { + *featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; + } + return (0); +} + +static dmu_replay_record_t * +create_begin_record(struct dmu_send_params *dspp, objset_t *os, + uint64_t featureflags) +{ + dmu_replay_record_t *drr = kmem_zalloc(sizeof (dmu_replay_record_t), + KM_SLEEP); + drr->drr_type = DRR_BEGIN; - cksum_orig = ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; - cksump = &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; + struct drr_begin *drrb = &drr->drr_u.drr_begin; + dsl_dataset_t *to_ds = dspp->to_ds; - if (ra->byteswap) - byteswap_record(&ra->next_rrd->header); + drrb->drr_magic = DMU_BACKUP_MAGIC; + drrb->drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time; + drrb->drr_type = dmu_objset_type(os); + drrb->drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; + drrb->drr_fromguid = dspp->ancestor_zb.zbm_guid; - if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && - !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; - return (SET_ERROR(ECKSUM)); - } + DMU_SET_STREAM_HDRTYPE(drrb->drr_versioninfo, DMU_SUBSTREAM); + DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, featureflags); - receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); + if (dspp->is_clone) + drrb->drr_flags |= DRR_FLAG_CLONE; + if (dsl_dataset_phys(dspp->to_ds)->ds_flags & DS_FLAG_CI_DATASET) + drrb->drr_flags |= DRR_FLAG_CI_DATA; + if (zfs_send_set_freerecords_bit) + drrb->drr_flags |= DRR_FLAG_FREERECORDS; - return (0); + dsl_dataset_name(to_ds, drrb->drr_toname); + if (!to_ds->ds_is_snapshot) { + (void) 
strlcat(drrb->drr_toname, "@--head--", + sizeof (drrb->drr_toname)); + } + return (drr); } static void -objlist_create(struct objlist *list) +setup_to_thread(struct send_thread_arg *to_arg, dsl_dataset_t *to_ds, + dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok) { - list_create(&list->list, sizeof (struct receive_objnode), - offsetof(struct receive_objnode, node)); - list->last_lookup = 0; + VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + to_arg->error_code = 0; + to_arg->cancel = B_FALSE; + to_arg->ds = to_ds; + to_arg->fromtxg = fromtxg; + to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA; + if (rawok) + to_arg->flags |= TRAVERSE_NO_DECRYPT; + to_arg->redaction_list = NULL; + to_arg->num_blocks_visited = &dssp->dss_blocks; + (void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0, + curproc, TS_RUN, minclsyspri); } static void -objlist_destroy(struct objlist *list) +setup_from_thread(struct redact_list_thread_arg *from_arg, + redaction_list_t *from_rl, dmu_sendstatus_t *dssp) { - for (struct receive_objnode *n = list_remove_head(&list->list); - n != NULL; n = list_remove_head(&list->list)) { - kmem_free(n, sizeof (*n)); - } - list_destroy(&list->list); + VERIFY0(bqueue_init(&from_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + from_arg->error_code = 0; + from_arg->cancel = B_FALSE; + from_arg->rl = from_rl; + from_arg->mark_redact = B_FALSE; + from_arg->num_blocks_visited = &dssp->dss_blocks; + /* + * If from_ds is null, send_traverse_thread just returns success and + * enqueues an eos marker. + */ + (void) thread_create(NULL, 0, redact_list_thread, from_arg, 0, + curproc, TS_RUN, minclsyspri); } -/* - * This function looks through the objlist to see if the specified object number - * is contained in the objlist. 
In the process, it will remove all object - * numbers in the list that are smaller than the specified object number. Thus, - * any lookup of an object number smaller than a previously looked up object - * number will always return false; therefore, all lookups should be done in - * ascending order. - */ -static boolean_t -objlist_exists(struct objlist *list, uint64_t object) +static void +setup_redact_list_thread(struct redact_list_thread_arg *rlt_arg, + struct dmu_send_params *dspp, redaction_list_t *rl, dmu_sendstatus_t *dssp) { - struct receive_objnode *node = list_head(&list->list); - ASSERT3U(object, >=, list->last_lookup); - list->last_lookup = object; - while (node != NULL && node->object < object) { - VERIFY3P(node, ==, list_remove_head(&list->list)); - kmem_free(node, sizeof (*node)); - node = list_head(&list->list); - } - return (node != NULL && node->object == object); + if (dspp->redactbook == NULL) + return; + + rlt_arg->cancel = B_FALSE; + VERIFY0(bqueue_init(&rlt_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + rlt_arg->error_code = 0; + rlt_arg->mark_redact = B_TRUE; + rlt_arg->rl = rl; + rlt_arg->num_blocks_visited = &dssp->dss_blocks; + + (void) thread_create(NULL, 0, redact_list_thread, rlt_arg, 0, + curproc, TS_RUN, minclsyspri); } -/* - * The objlist is a list of object numbers stored in ascending order. However, - * the insertion of new object numbers does not seek out the correct location to - * store a new object number; instead, it appends it to the list for simplicity. - * Thus, any users must take care to only insert new object numbers in ascending - * order. 
- */ static void -objlist_insert(struct objlist *list, uint64_t object) +setup_merge_thread(struct send_merge_thread_arg *smt_arg, + struct dmu_send_params *dspp, struct redact_list_thread_arg *from_arg, + struct send_thread_arg *to_arg, struct redact_list_thread_arg *rlt_arg, + objset_t *os) { - struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); - node->object = object; -#ifdef ZFS_DEBUG - { - struct receive_objnode *last_object = list_tail(&list->list); - uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); - ASSERT3U(node->object, >, last_objnum); - } -#endif - list_insert_tail(&list->list, node); + VERIFY0(bqueue_init(&smt_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + smt_arg->cancel = B_FALSE; + smt_arg->error = 0; + smt_arg->from_arg = from_arg; + smt_arg->to_arg = to_arg; + if (dspp->redactbook != NULL) + smt_arg->redact_arg = rlt_arg; + + smt_arg->os = os; + (void) thread_create(NULL, 0, send_merge_thread, smt_arg, 0, curproc, + TS_RUN, minclsyspri); } -/* - * Issue the prefetch reads for any necessary indirect blocks. - * - * We use the object ignore list to tell us whether or not to issue prefetches - * for a given object. We do this for both correctness (in case the blocksize - * of an object has changed) and performance (if the object doesn't exist, don't - * needlessly try to issue prefetches). We also trim the list as we go through - * the stream to prevent it from growing to an unbounded size. - * - * The object numbers within will always be in sorted order, and any write - * records we see will also be in sorted order, but they're not sorted with - * respect to each other (i.e. we can get several object records before - * receiving each object's write records). As a result, once we've reached a - * given object number, we can safely remove any reference to lower object - * numbers in the ignore list. 
In practice, we receive up to 32 object records - * before receiving write records, so the list can have up to 32 nodes in it. - */ -/* ARGSUSED */ static void -receive_read_prefetch(struct receive_arg *ra, - uint64_t object, uint64_t offset, uint64_t length) +setup_prefetch_thread(struct send_prefetch_thread_arg *spt_arg, + struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg) { - if (!objlist_exists(&ra->ignore_objlist, object)) { - dmu_prefetch(ra->os, object, 1, offset, length, - ZIO_PRIORITY_SYNC_READ); - } + VERIFY0(bqueue_init(&spt_arg->q, zfs_send_queue_ff, + MAX(zfs_send_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + spt_arg->smta = smt_arg; + spt_arg->issue_prefetches = !dspp->dso->dso_dryrun; + (void) thread_create(NULL, 0, send_prefetch_thread, spt_arg, 0, + curproc, TS_RUN, minclsyspri); } -/* - * Read records off the stream, issuing any necessary prefetches. - */ static int -receive_read_record(struct receive_arg *ra) +setup_resume_points(struct dmu_send_params *dspp, + struct send_thread_arg *to_arg, struct redact_list_thread_arg *from_arg, + struct redact_list_thread_arg *rlt_arg, + struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os, + redaction_list_t *redact_rl, nvlist_t *nvl) { - int err; - - switch (ra->rrd->header.drr_type) { - case DRR_OBJECT: - { - struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; - uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro); - void *buf = kmem_zalloc(size, KM_SLEEP); - dmu_object_info_t doi; + dsl_dataset_t *to_ds = dspp->to_ds; + int err = 0; - err = receive_read_payload_and_next_header(ra, size, buf); - if (err != 0) { - kmem_free(buf, size); + uint64_t obj = 0; + uint64_t blkid = 0; + if (resuming) { + obj = dspp->resumeobj; + dmu_object_info_t to_doi; + err = dmu_object_info(os, obj, &to_doi); + if (err != 0) return (err); - } - err = dmu_object_info(ra->os, drro->drr_object, &doi); - /* - * See receive_read_prefetch for an explanation why 
we're - * storing this object in the ignore_obj_list. - */ - if (err == ENOENT || err == EEXIST || - (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { - objlist_insert(&ra->ignore_objlist, drro->drr_object); - err = 0; - } - return (err); - } - case DRR_FREEOBJECTS: - { - err = receive_read_payload_and_next_header(ra, 0, NULL); - return (err); - } - case DRR_WRITE: - { - struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; - arc_buf_t *abuf; - boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); - - if (ra->raw) { - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ - !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ - ra->byteswap; - - abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os), - drrw->drr_object, byteorder, drrw->drr_salt, - drrw->drr_iv, drrw->drr_mac, drrw->drr_type, - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); - } else if (DRR_WRITE_COMPRESSED(drrw)) { - ASSERT3U(drrw->drr_compressed_size, >, 0); - ASSERT3U(drrw->drr_logical_size, >=, - drrw->drr_compressed_size); - ASSERT(!is_meta); - abuf = arc_loan_compressed_buf( - dmu_objset_spa(ra->os), - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); - } else { - abuf = arc_loan_buf(dmu_objset_spa(ra->os), - is_meta, drrw->drr_logical_size); - } - err = receive_read_payload_and_next_header(ra, - DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); - if (err != 0) { - dmu_return_arcbuf(abuf); - return (err); - } - ra->rrd->arc_buf = abuf; - receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, - drrw->drr_logical_size); - return (err); + blkid = dspp->resumeoff / to_doi.doi_data_block_size; } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwb = - &ra->rrd->header.drr_u.drr_write_byref; - err = receive_read_payload_and_next_header(ra, 0, NULL); - receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, - drrwb->drr_length); - return (err); + /* + * If we're resuming a redacted send, we can skip to the appropriate + * point in 
the redaction bookmark by binary searching through it. + */ + smt_arg->bookmark_before = B_FALSE; + if (redact_rl != NULL) { + SET_BOOKMARK(&rlt_arg->resume, to_ds->ds_object, obj, 0, blkid); } - case DRR_WRITE_EMBEDDED: - { - struct drr_write_embedded *drrwe = - &ra->rrd->header.drr_u.drr_write_embedded; - uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); - void *buf = kmem_zalloc(size, KM_SLEEP); - - err = receive_read_payload_and_next_header(ra, size, buf); - if (err != 0) { - kmem_free(buf, size); - return (err); - } - receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, - drrwe->drr_length); - return (err); - } - case DRR_FREE: - { + SET_BOOKMARK(&to_arg->resume, to_ds->ds_object, obj, 0, blkid); + if (nvlist_exists(nvl, BEGINNV_REDACT_FROM_SNAPS)) { + uint64_t objset = dspp->ancestor_zb.zbm_redaction_obj; /* - * It might be beneficial to prefetch indirect blocks here, but - * we don't really have the data to decide for sure. + * Note: If the resume point is in an object whose + * blocksize is different in the from vs to snapshots, + * we will have divided by the "wrong" blocksize. + * However, in this case fromsnap's send_cb() will + * detect that the blocksize has changed and therefore + * ignore this object. + * + * If we're resuming a send from a redaction bookmark, + * we still cannot accidentally suggest blocks behind + * the to_ds. In addition, we know that any blocks in + * the object in the to_ds will have to be sent, since + * the size changed. Therefore, we can't cause any harm + * this way either. 
*/ - err = receive_read_payload_and_next_header(ra, 0, NULL); - return (err); - } - case DRR_END: - { - struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; - if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) - return (SET_ERROR(ECKSUM)); - return (0); - } - case DRR_SPILL: - { - struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; - arc_buf_t *abuf; - int len = DRR_SPILL_PAYLOAD_SIZE(drrs); - - /* DRR_SPILL records are either raw or uncompressed */ - if (ra->raw) { - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ - !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ - ra->byteswap; - - abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os), - dmu_objset_id(ra->os), byteorder, drrs->drr_salt, - drrs->drr_iv, drrs->drr_mac, drrs->drr_type, - drrs->drr_compressed_size, drrs->drr_length, - drrs->drr_compressiontype); - } else { - abuf = arc_loan_buf(dmu_objset_spa(ra->os), - DMU_OT_IS_METADATA(drrs->drr_type), - drrs->drr_length); - } - - err = receive_read_payload_and_next_header(ra, len, - abuf->b_data); - if (err != 0) { - dmu_return_arcbuf(abuf); - return (err); - } - ra->rrd->arc_buf = abuf; - return (err); - } - case DRR_OBJECT_RANGE: - { - err = receive_read_payload_and_next_header(ra, 0, NULL); - return (err); + SET_BOOKMARK(&from_arg->resume, objset, obj, 0, blkid); } - default: - return (SET_ERROR(EINVAL)); + if (resuming) { + fnvlist_add_uint64(nvl, BEGINNV_RESUME_OBJECT, dspp->resumeobj); + fnvlist_add_uint64(nvl, BEGINNV_RESUME_OFFSET, dspp->resumeoff); } + return (0); } -static void -dprintf_drr(struct receive_record_arg *rrd, int err) +static dmu_sendstatus_t * +setup_send_progress(struct dmu_send_params *dspp) { -#ifdef ZFS_DEBUG - switch (rrd->header.drr_type) { - case DRR_OBJECT: - { - struct drr_object *drro = &rrd->header.drr_u.drr_object; - dprintf("drr_type = OBJECT obj = %llu type = %u " - "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u " - "compress = %u dn_slots = %u err = %d\n", - drro->drr_object, drro->drr_type, drro->drr_bonustype, 
- drro->drr_blksz, drro->drr_bonuslen, - drro->drr_checksumtype, drro->drr_compress, - drro->drr_dn_slots, err); - break; - } - case DRR_FREEOBJECTS: - { - struct drr_freeobjects *drrfo = - &rrd->header.drr_u.drr_freeobjects; - dprintf("drr_type = FREEOBJECTS firstobj = %llu " - "numobjs = %llu err = %d\n", - drrfo->drr_firstobj, drrfo->drr_numobjs, err); - break; - } - case DRR_WRITE: - { - struct drr_write *drrw = &rrd->header.drr_u.drr_write; - dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu " - "lsize = %llu cksumtype = %u cksumflags = %u " - "compress = %u psize = %llu err = %d\n", - drrw->drr_object, drrw->drr_type, drrw->drr_offset, - drrw->drr_logical_size, drrw->drr_checksumtype, - drrw->drr_flags, drrw->drr_compressiontype, - drrw->drr_compressed_size, err); - break; - } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwbr = - &rrd->header.drr_u.drr_write_byref; - dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu " - "length = %llu toguid = %llx refguid = %llx " - "refobject = %llu refoffset = %llu cksumtype = %u " - "cksumflags = %u err = %d\n", - drrwbr->drr_object, drrwbr->drr_offset, - drrwbr->drr_length, drrwbr->drr_toguid, - drrwbr->drr_refguid, drrwbr->drr_refobject, - drrwbr->drr_refoffset, drrwbr->drr_checksumtype, - drrwbr->drr_flags, err); - break; - } - case DRR_WRITE_EMBEDDED: - { - struct drr_write_embedded *drrwe = - &rrd->header.drr_u.drr_write_embedded; - dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu " - "length = %llu compress = %u etype = %u lsize = %u " - "psize = %u err = %d\n", - drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length, - drrwe->drr_compression, drrwe->drr_etype, - drrwe->drr_lsize, drrwe->drr_psize, err); - break; - } - case DRR_FREE: - { - struct drr_free *drrf = &rrd->header.drr_u.drr_free; - dprintf("drr_type = FREE obj = %llu offset = %llu " - "length = %lld err = %d\n", - drrf->drr_object, drrf->drr_offset, drrf->drr_length, - err); - break; - } - case DRR_SPILL: - { - 
struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; - dprintf("drr_type = SPILL obj = %llu length = %llu " - "err = %d\n", drrs->drr_object, drrs->drr_length, err); - break; - } - default: - return; - } -#endif + dmu_sendstatus_t *dssp = kmem_zalloc(sizeof (*dssp), KM_SLEEP); + dssp->dss_outfd = dspp->outfd; + dssp->dss_off = dspp->off; + dssp->dss_proc = curproc; + mutex_enter(&dspp->to_ds->ds_sendstream_lock); + list_insert_head(&dspp->to_ds->ds_sendstreams, dssp); + mutex_exit(&dspp->to_ds->ds_sendstream_lock); + return (dssp); } /* - * Commit the records to the pool. + * Actually do the bulk of the work in a zfs send. + * + * The idea is that we want to do a send from ancestor_zb to to_ds. We also + * want to not send any data that has been modified by all the datasets in + * redactsnaparr, and store the list of blocks that are redacted in this way in + * a bookmark named redactbook, created on the to_ds. We do this by creating + * several worker threads, whose function is described below. + * + * There are three cases. + * The first case is a redacted zfs send. In this case there are 5 threads. + * The first thread is the to_ds traversal thread: it calls dataset_traverse on + * the to_ds and finds all the blocks that have changed since ancestor_zb (if + * it's a full send, that's all blocks in the dataset). It then sends those + * blocks on to the send merge thread. The redact list thread takes the data + * from the redaction bookmark and sends those blocks on to the send merge + * thread. The send merge thread takes the data from the to_ds traversal + * thread, and combines it with the redaction records from the redact list + * thread. If a block appears in both the to_ds's data and the redaction data, + * the send merge thread will mark it as redacted and send it on to the prefetch + * thread. Otherwise, the send merge thread will send the block on to the + * prefetch thread unchanged. 
The prefetch thread will issue prefetch reads for + * any data that isn't redacted, and then send the data on to the main thread. + * The main thread behaves the same as in a normal send case, issuing demand + * reads for data blocks and sending out records over the network + * + * The graphic below diagrams the flow of data in the case of a redacted zfs + * send. Each box represents a thread, and each line represents the flow of + * data. + * + * Records from the | + * redaction bookmark | + * +--------------------+ | +---------------------------+ + * | | v | Send Merge Thread | + * | Redact List Thread +----------> Apply redaction marks to | + * | | | records as specified by | + * +--------------------+ | redaction ranges | + * +----^---------------+------+ + * | | Merged data + * | | + * | +------------v--------+ + * | | Prefetch Thread | + * +--------------------+ | | Issues prefetch | + * | to_ds Traversal | | | reads of data blocks| + * | Thread (finds +---------------+ +------------+--------+ + * | candidate blocks) | Blocks modified | Prefetched data + * +--------------------+ by to_ds since | + * ancestor_zb +------------v----+ + * | Main Thread | File Descriptor + * | Sends data over +->(to zfs receive) + * | wire | + * +-----------------+ + * + * The second case is an incremental send from a redaction bookmark. The to_ds + * traversal thread and the main thread behave the same as in the redacted + * send case. The new thread is the from bookmark traversal thread. It + * iterates over the redaction list in the redaction bookmark, and enqueues + * records for each block that was redacted in the original send. The send + * merge thread now has to merge the data from the two threads. For details + * about that process, see the header comment of send_merge_thread(). Any data + * it decides to send on will be prefetched by the prefetch thread. 
Note that + * you can perform a redacted send from a redaction bookmark; in that case, + * the data flow behaves very similarly to the flow in the redacted send case, + * except with the addition of the bookmark traversal thread iterating over the + * redaction bookmark. The send_merge_thread also has to take on the + * responsibility of merging the redact list thread's records, the bookmark + * traversal thread's records, and the to_ds records. + * + * +---------------------+ + * | | + * | Redact List Thread +--------------+ + * | | | + * +---------------------+ | + * Blocks in redaction list | Ranges modified by every secure snap + * of from bookmark | (or EOS if not readcted) + * | + * +---------------------+ | +----v----------------------+ + * | bookmark Traversal | v | Send Merge Thread | + * | Thread (finds +---------> Merges bookmark, rlt, and | + * | candidate blocks) | | to_ds send records | + * +---------------------+ +----^---------------+------+ + * | | Merged data + * | +------------v--------+ + * | | Prefetch Thread | + * +--------------------+ | | Issues prefetch | + * | to_ds Traversal | | | reads of data blocks| + * | Thread (finds +---------------+ +------------+--------+ + * | candidate blocks) | Blocks modified | Prefetched data + * +--------------------+ by to_ds since +------------v----+ + * ancestor_zb | Main Thread | File Descriptor + * | Sends data over +->(to zfs receive) + * | wire | + * +-----------------+ + * + * The final case is a simple zfs full or incremental send. The to_ds traversal + * thread behaves the same as always. The redact list thread is never started. + * The send merge thread takes all the blocks that the to_ds traveral thread + * sends it, prefetches the data, and sends the blocks on to the main thread. + * The main thread sends the data over the wire. + * + * To keep performance acceptable, we want to prefetch the data in the worker + * threads. 
While the to_ds thread could simply use the TRAVERSE_PREFETCH + * feature built into traverse_dataset, the combining and deletion of records + * due to redaction and sends from redaction bookmarks mean that we could + * issue many unnecessary prefetches. As a result, we only prefetch data + * after we've determined that the record is not going to be redacted. To + * prevent the prefetching from getting too far ahead of the main thread, the + * blocking queues that are used for communication are capped not by the + * number of entries in the queue, but by the sum of the size of the + * prefetches associated with them. The limit on the amount of data that the + * thread can prefetch beyond what the main thread has reached is controlled + * by the global variable zfs_send_queue_length. In addition, to prevent poor + * performance in the beginning of a send, we also limit the distance ahead + * that the traversal threads can be. That distance is controlled by the + * zfs_send_no_prefetch_queue_length tunable. + * + * Note: Releases dp using the specified tag. 
*/ static int -receive_process_record(struct receive_writer_arg *rwa, - struct receive_record_arg *rrd) +dmu_send_impl(struct dmu_send_params *dspp) { + objset_t *os; + dmu_replay_record_t *drr; + dmu_sendstatus_t *dssp; + dmu_send_cookie_t dsc = {0}; int err; + uint64_t fromtxg = dspp->ancestor_zb.zbm_creation_txg; + uint64_t featureflags = 0; + struct redact_list_thread_arg *from_arg; + struct send_thread_arg *to_arg; + struct redact_list_thread_arg *rlt_arg; + struct send_merge_thread_arg *smt_arg; + struct send_prefetch_thread_arg *spt_arg; + struct send_range *range; + redaction_list_t *from_rl = NULL; + redaction_list_t *redact_rl = NULL; + boolean_t resuming = (dspp->resumeobj != 0 || dspp->resumeoff != 0); + boolean_t book_resuming = resuming; + + dsl_dataset_t *to_ds = dspp->to_ds; + zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb; + dsl_pool_t *dp = dspp->dp; + void *tag = dspp->tag; - /* Processing in order, therefore bytes_read should be increasing. */ - ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); - rwa->bytes_read = rrd->bytes_read; - - switch (rrd->header.drr_type) { - case DRR_OBJECT: - { - struct drr_object *drro = &rrd->header.drr_u.drr_object; - err = receive_object(rwa, drro, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - break; - } - case DRR_FREEOBJECTS: - { - struct drr_freeobjects *drrfo = - &rrd->header.drr_u.drr_freeobjects; - err = receive_freeobjects(rwa, drrfo); - break; - } - case DRR_WRITE: - { - struct drr_write *drrw = &rrd->header.drr_u.drr_write; - err = receive_write(rwa, drrw, rrd->arc_buf); - /* if receive_write() is successful, it consumes the arc_buf */ - if (err != 0) - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; - rrd->payload = NULL; - break; - } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwbr = - &rrd->header.drr_u.drr_write_byref; - err = receive_write_byref(rwa, drrwbr); - break; - } - case DRR_WRITE_EMBEDDED: - { - struct drr_write_embedded *drrwe 
= - &rrd->header.drr_u.drr_write_embedded; - err = receive_write_embedded(rwa, drrwe, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - break; - } - case DRR_FREE: - { - struct drr_free *drrf = &rrd->header.drr_u.drr_free; - err = receive_free(rwa, drrf); - break; - } - case DRR_SPILL: - { - struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; - err = receive_spill(rwa, drrs, rrd->arc_buf); - /* if receive_spill() is successful, it consumes the arc_buf */ - if (err != 0) - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; - rrd->payload = NULL; - break; - } - case DRR_OBJECT_RANGE: - { - struct drr_object_range *drror = - &rrd->header.drr_u.drr_object_range; - return (receive_object_range(rwa, drror)); - } - default: - return (SET_ERROR(EINVAL)); + err = dmu_objset_from_ds(to_ds, &os); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (err); } + /* + * If this is a non-raw send of an encrypted ds, we can ensure that + * the objset_phys_t is authenticated. This is safe because this is + * either a snapshot or we have owned the dataset, ensuring that + * it can't be modified. + */ + if (!dspp->rawok && os->os_encrypted && + arc_is_unauthenticated(os->os_phys_buf)) { + zbookmark_phys_t zb; - if (err != 0) - dprintf_drr(rrd, err); - - return (err); -} - -/* - * dmu_recv_stream's worker thread; pull records off the queue, and then call - * receive_process_record When we're done, signal the main thread and exit. - */ -static void -receive_writer_thread(void *arg) -{ - struct receive_writer_arg *rwa = arg; - struct receive_record_arg *rrd; - fstrans_cookie_t cookie = spl_fstrans_mark(); - - for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; - rrd = bqueue_dequeue(&rwa->q)) { - /* - * If there's an error, the main thread will stop putting things - * on the queue, but we need to clear everything in it before we - * can exit. 
- */ - if (rwa->err == 0) { - rwa->err = receive_process_record(rwa, rrd); - } else if (rrd->arc_buf != NULL) { - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; - rrd->payload = NULL; - } else if (rrd->payload != NULL) { - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; + SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + err = arc_untransform(os->os_phys_buf, os->os_spa, + &zb, B_FALSE); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (err); } - kmem_free(rrd, sizeof (*rrd)); - } - kmem_free(rrd, sizeof (*rrd)); - mutex_enter(&rwa->mutex); - rwa->done = B_TRUE; - cv_signal(&rwa->cv); - mutex_exit(&rwa->mutex); - spl_fstrans_unmark(cookie); - thread_exit(); -} -static int -resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) -{ - uint64_t val; - objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; - uint64_t dsobj = dmu_objset_id(ra->os); - uint64_t resume_obj, resume_off; - - if (nvlist_lookup_uint64(begin_nvl, - "resume_object", &resume_obj) != 0 || - nvlist_lookup_uint64(begin_nvl, - "resume_offset", &resume_off) != 0) { - return (SET_ERROR(EINVAL)); + ASSERT0(arc_is_unauthenticated(os->os_phys_buf)); } - VERIFY0(zap_lookup(mos, dsobj, - DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); - if (resume_obj != val) - return (SET_ERROR(EINVAL)); - VERIFY0(zap_lookup(mos, dsobj, - DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); - if (resume_off != val) - return (SET_ERROR(EINVAL)); - - return (0); -} -/* - * Read in the stream's records, one by one, and apply them to the pool. There - * are two threads involved; the thread that calls this function will spin up a - * worker thread, read the records off the stream one by one, and issue - * prefetches for any necessary indirect blocks. It will then push the records - * onto an internal blocking queue. The worker thread will pull the records off - * the queue, and actually write the data into the DMU. 
This way, the worker - * thread doesn't have to wait for reads to complete, since everything it needs - * (the indirect blocks) will be prefetched. - * - * NB: callers *must* call dmu_recv_end() if this succeeds. - */ -int -dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - int cleanup_fd, uint64_t *action_handlep) -{ - int err = 0; - struct receive_arg *ra; - struct receive_writer_arg *rwa; - int featureflags; - uint32_t payloadlen; - void *payload; - nvlist_t *begin_nvl = NULL; - - ra = kmem_zalloc(sizeof (*ra), KM_SLEEP); - rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); - - ra->byteswap = drc->drc_byteswap; - ra->raw = drc->drc_raw; - ra->cksum = drc->drc_cksum; - ra->vp = vp; - ra->voff = *voffp; - - if (dsl_dataset_is_zapified(drc->drc_ds)) { - (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, - drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, - sizeof (ra->bytes_read), 1, &ra->bytes_read); + if ((err = setup_featureflags(dspp, os, &featureflags)) != 0) { + dsl_pool_rele(dp, tag); + return (err); } - objlist_create(&ra->ignore_objlist); - - /* these were verified in dmu_recv_begin */ - ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, - DMU_SUBSTREAM); - ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); + from_arg = kmem_zalloc(sizeof (*from_arg), KM_SLEEP); + to_arg = kmem_zalloc(sizeof (*to_arg), KM_SLEEP); + rlt_arg = kmem_zalloc(sizeof (*rlt_arg), KM_SLEEP); + smt_arg = kmem_zalloc(sizeof (*smt_arg), KM_SLEEP); + spt_arg = kmem_zalloc(sizeof (*spt_arg), KM_SLEEP); /* - * Open the objset we are modifying. + * If we're doing a redacted send, hold the bookmark's redaction list. 
*/ - VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra->os)); + if (dspp->redactbook != NULL) { + err = dsl_redaction_list_hold_obj(dp, + dspp->redactbook->zbm_redaction_obj, FTAG, + &redact_rl); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (SET_ERROR(EINVAL)); + } + dsl_redaction_list_long_hold(dp, redact_rl, FTAG); + } - ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); + /* + * If we're sending from a redaction bookmark, hold the redaction list + * so that we can consider sending the redacted blocks. + */ + if (ancestor_zb->zbm_redaction_obj != 0) { + err = dsl_redaction_list_hold_obj(dp, + ancestor_zb->zbm_redaction_obj, FTAG, &from_rl); + if (err != 0) { + if (redact_rl != NULL) { + dsl_redaction_list_long_rele(redact_rl, FTAG); + dsl_redaction_list_rele(redact_rl, FTAG); + } + dsl_pool_rele(dp, tag); + return (SET_ERROR(EINVAL)); + } + dsl_redaction_list_long_hold(dp, from_rl, FTAG); + } - featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); - ra->featureflags = featureflags; + dsl_dataset_long_hold(to_ds, FTAG); - ASSERT0(ra->os->os_encrypted && - (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)); + drr = create_begin_record(dspp, os, featureflags); + dssp = setup_send_progress(dspp); - /* if this stream is dedup'ed, set up the avl tree for guid mapping */ - if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { - minor_t minor; + dsc.dsc_drr = drr; + dsc.dsc_dso = dspp->dso; + dsc.dsc_os = os; + dsc.dsc_off = dspp->off; + dsc.dsc_toguid = dsl_dataset_phys(to_ds)->ds_guid; + dsc.dsc_pending_op = PENDING_NONE; + dsc.dsc_featureflags = featureflags; + dsc.dsc_resume_object = dspp->resumeobj; + dsc.dsc_resume_offset = dspp->resumeoff; - if (cleanup_fd == -1) { - err = SET_ERROR(EBADF); - goto out; - } - err = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (err != 0) { - cleanup_fd = -1; - goto out; - } + dsl_pool_rele(dp, tag); - if (*action_handlep == 0) { - rwa->guid_to_ds_map = - kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - 
avl_create(rwa->guid_to_ds_map, guid_compare, - sizeof (guid_map_entry_t), - offsetof(guid_map_entry_t, avlnode)); - err = zfs_onexit_add_cb(minor, - free_guid_map_onexit, rwa->guid_to_ds_map, - action_handlep); - if (err != 0) - goto out; - } else { - err = zfs_onexit_cb_data(minor, *action_handlep, - (void **)&rwa->guid_to_ds_map); - if (err != 0) - goto out; - } + void *payload = NULL; + size_t payload_len = 0; + nvlist_t *nvl = fnvlist_alloc(); - drc->drc_guid_to_ds_map = rwa->guid_to_ds_map; + /* + * If we're doing a redacted send, we include the snapshots we're + * redacted with respect to so that the target system knows what send + * streams can be correctly received on top of this dataset. If we're + * instead sending a redacted dataset, we include the snapshots that the + * dataset was created with respect to. + */ + if (dspp->redactbook != NULL) { + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, + redact_rl->rl_phys->rlp_snaps, + redact_rl->rl_phys->rlp_num_snaps); + } else if (dsl_dataset_feature_is_active(to_ds, + SPA_FEATURE_REDACTED_DATASETS)) { + uint64_t *tods_guids; + uint64_t length; + VERIFY(dsl_dataset_get_uint64_array_feature(to_ds, + SPA_FEATURE_REDACTED_DATASETS, &length, &tods_guids)); + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, tods_guids, + length); } - payloadlen = drc->drc_drr_begin->drr_payloadlen; - payload = NULL; - if (payloadlen != 0) - payload = kmem_alloc(payloadlen, KM_SLEEP); + /* + * If we're sending from a redaction bookmark, then we should retrieve + * the guids of that bookmark so we can send them over the wire. 
+ */ + if (from_rl != NULL) { + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS, + from_rl->rl_phys->rlp_snaps, + from_rl->rl_phys->rlp_num_snaps); + } - err = receive_read_payload_and_next_header(ra, payloadlen, payload); - if (err != 0) { - if (payloadlen != 0) - kmem_free(payload, payloadlen); - goto out; + /* + * If the snapshot we're sending from is redacted, include the redaction + * list in the stream. + */ + if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) { + ASSERT3P(from_rl, ==, NULL); + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS, + dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps); + if (dspp->numfromredactsnaps > 0) { + kmem_free(dspp->fromredactsnaps, + dspp->numfromredactsnaps * sizeof (uint64_t)); + dspp->fromredactsnaps = NULL; + } } - if (payloadlen != 0) { - err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); - kmem_free(payload, payloadlen); + + if (resuming || book_resuming) { + err = setup_resume_points(dspp, to_arg, from_arg, + rlt_arg, smt_arg, resuming, os, redact_rl, nvl); if (err != 0) goto out; } - /* handle DSL encryption key payload */ if (featureflags & DMU_BACKUP_FEATURE_RAW) { nvlist_t *keynvl = NULL; + ASSERT(os->os_encrypted); - ASSERT(ra->os->os_encrypted); - ASSERT(drc->drc_raw); - - err = nvlist_lookup_nvlist(begin_nvl, "crypt_keydata", &keynvl); - if (err != 0) + err = dsl_crypto_populate_key_nvlist(to_ds, &keynvl); + if (err != 0) { + fnvlist_free(nvl); goto out; + } - /* - * If this is a new dataset we set the key immediately. - * Otherwise we don't want to change the key until we - * are sure the rest of the receive succeeded so we stash - * the keynvl away until then. 
- */ - err = dsl_crypto_recv_raw(spa_name(ra->os->os_spa), - drc->drc_ds->ds_object, drc->drc_drrb->drr_type, - keynvl, drc->drc_newfs); - if (err != 0) - goto out; + fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl); + fnvlist_free(keynvl); + } - if (!drc->drc_newfs) - drc->drc_keynvl = fnvlist_dup(keynvl); + if (!nvlist_empty(nvl)) { + payload = fnvlist_pack(nvl, &payload_len); + drr->drr_payloadlen = payload_len; } - if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { - err = resume_check(ra, begin_nvl); - if (err != 0) - goto out; + fnvlist_free(nvl); + err = dump_record(&dsc, payload, payload_len); + fnvlist_pack_free(payload, payload_len); + if (err != 0) { + err = dsc.dsc_err; + goto out; + } + + setup_to_thread(to_arg, to_ds, dssp, fromtxg, dspp->rawok); + setup_from_thread(from_arg, from_rl, dssp); + setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp); + setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os); + setup_prefetch_thread(spt_arg, dspp, smt_arg); + + range = bqueue_dequeue(&spt_arg->q); + while (err == 0 && !range->eos_marker) { + err = do_dump(&dsc, range); + range = get_next_range(&spt_arg->q, range); + if (issig(JUSTLOOKING) && issig(FORREAL)) + err = EINTR; } - (void) bqueue_init(&rwa->q, - MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize), - offsetof(struct receive_record_arg, node)); - cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL); - mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); - rwa->os = ra->os; - rwa->byteswap = drc->drc_byteswap; - rwa->resumable = drc->drc_resumable; - rwa->raw = drc->drc_raw; - rwa->os->os_raw_receive = drc->drc_raw; - - (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, - TS_RUN, minclsyspri); /* - * We're reading rwa->err without locks, which is safe since we are the - * only reader, and the worker thread is the only writer. 
It's ok if we - * miss a write for an iteration or two of the loop, since the writer - * thread will keep freeing records we send it until we send it an eos - * marker. - * - * We can leave this loop in 3 ways: First, if rwa->err is - * non-zero. In that case, the writer thread will free the rrd we just - * pushed. Second, if we're interrupted; in that case, either it's the - * first loop and ra->rrd was never allocated, or it's later and ra->rrd - * has been handed off to the writer thread who will free it. Finally, - * if receive_read_record fails or we're at the end of the stream, then - * we free ra->rrd and exit. + * If we hit an error or are interrupted, cancel our worker threads and + * clear the queue of any pending records. The threads will pass the + * cancel up the tree of worker threads, and each one will clean up any + * pending records before exiting. */ - while (rwa->err == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); - break; + if (err != 0) { + spt_arg->cancel = B_TRUE; + while (!range->eos_marker) { + range = get_next_range(&spt_arg->q, range); } + } + range_free(range); - ASSERT3P(ra->rrd, ==, NULL); - ra->rrd = ra->next_rrd; - ra->next_rrd = NULL; - /* Allocates and loads header into ra->next_rrd */ - err = receive_read_record(ra); - - if (ra->rrd->header.drr_type == DRR_END || err != 0) { - kmem_free(ra->rrd, sizeof (*ra->rrd)); - ra->rrd = NULL; - break; - } + bqueue_destroy(&spt_arg->q); + bqueue_destroy(&smt_arg->q); + if (dspp->redactbook != NULL) + bqueue_destroy(&rlt_arg->q); + bqueue_destroy(&to_arg->q); + bqueue_destroy(&from_arg->q); - bqueue_enqueue(&rwa->q, ra->rrd, - sizeof (struct receive_record_arg) + ra->rrd->payload_size); - ra->rrd = NULL; - } - if (ra->next_rrd == NULL) - ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); - ra->next_rrd->eos_marker = B_TRUE; - bqueue_enqueue(&rwa->q, ra->next_rrd, 1); - - mutex_enter(&rwa->mutex); - while (!rwa->done) { - cv_wait(&rwa->cv, 
&rwa->mutex); - } - mutex_exit(&rwa->mutex); + if (err == 0 && spt_arg->error != 0) + err = spt_arg->error; - /* - * If we are receiving a full stream as a clone, all object IDs which - * are greater than the maximum ID referenced in the stream are - * by definition unused and must be freed. - */ - if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { - uint64_t obj = rwa->max_object + 1; - int free_err = 0; - int next_err = 0; - - while (next_err == 0) { - free_err = dmu_free_long_object(rwa->os, obj); - if (free_err != 0 && free_err != ENOENT) - break; + if (err != 0) + goto out; - next_err = dmu_object_next(rwa->os, &obj, FALSE, 0); - } + if (dsc.dsc_pending_op != PENDING_NONE) + if (dump_record(&dsc, NULL, 0) != 0) + err = SET_ERROR(EINTR); - if (err == 0) { - if (free_err != 0 && free_err != ENOENT) - err = free_err; - else if (next_err != ESRCH) - err = next_err; - } + if (err != 0) { + if (err == EINTR && dsc.dsc_err != 0) + err = dsc.dsc_err; + goto out; } - cv_destroy(&rwa->cv); - mutex_destroy(&rwa->mutex); - bqueue_destroy(&rwa->q); - if (err == 0) - err = rwa->err; + bzero(drr, sizeof (dmu_replay_record_t)); + drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc; + drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid; + if (dump_record(&dsc, NULL, 0) != 0) + err = dsc.dsc_err; out: - nvlist_free(begin_nvl); - if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) - zfs_onexit_fd_rele(cleanup_fd); + mutex_enter(&to_ds->ds_sendstream_lock); + list_remove(&to_ds->ds_sendstreams, dssp); + mutex_exit(&to_ds->ds_sendstream_lock); - if (err != 0) { - /* - * Clean up references. If receive is not resumable, - * destroy what we created, so we don't leave it in - * the inconsistent state. 
- */ - dmu_recv_cleanup_ds(drc); - nvlist_free(drc->drc_keynvl); + VERIFY(err != 0 || (dsc.dsc_sent_begin && dsc.dsc_sent_end)); + + kmem_free(drr, sizeof (dmu_replay_record_t)); + kmem_free(dssp, sizeof (dmu_sendstatus_t)); + kmem_free(from_arg, sizeof (*from_arg)); + kmem_free(to_arg, sizeof (*to_arg)); + kmem_free(rlt_arg, sizeof (*rlt_arg)); + kmem_free(smt_arg, sizeof (*smt_arg)); + kmem_free(spt_arg, sizeof (*spt_arg)); + + dsl_dataset_long_rele(to_ds, FTAG); + if (from_rl != NULL) { + dsl_redaction_list_long_rele(from_rl, FTAG); + dsl_redaction_list_rele(from_rl, FTAG); + } + if (redact_rl != NULL) { + dsl_redaction_list_long_rele(redact_rl, FTAG); + dsl_redaction_list_rele(redact_rl, FTAG); } - *voffp = ra->voff; - objlist_destroy(&ra->ignore_objlist); - kmem_free(ra, sizeof (*ra)); - kmem_free(rwa, sizeof (*rwa)); return (err); } -static int -dmu_recv_end_check(void *arg, dmu_tx_t *tx) +int +dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, + boolean_t rawok, int outfd, offset_t *off, dmu_send_outparams_t *dsop) { - dmu_recv_cookie_t *drc = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - int error; - - ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); + int err; + dsl_dataset_t *fromds; + ds_hold_flags_t dsflags = (rawok) ? 
0 : DS_HOLD_FLAG_DECRYPT; + struct dmu_send_params dspp = {0}; + dspp.embedok = embedok; + dspp.large_block_ok = large_block_ok; + dspp.compressok = compressok; + dspp.outfd = outfd; + dspp.off = off; + dspp.dso = dsop; + dspp.tag = FTAG; + dspp.rawok = rawok; + + err = dsl_pool_hold(pool, FTAG, &dspp.dp); + if (err != 0) + return (err); - if (!drc->drc_newfs) { - dsl_dataset_t *origin_head; + err = dsl_dataset_hold_obj_flags(dspp.dp, tosnap, dsflags, FTAG, + &dspp.to_ds); + if (err != 0) { + dsl_pool_rele(dspp.dp, FTAG); + return (err); + } - error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); - if (error != 0) - return (error); - if (drc->drc_force) { - /* - * We will destroy any snapshots in tofs (i.e. before - * origin_head) that are after the origin (which is - * the snap before drc_ds, because drc_ds can not - * have any snaps of its own). - */ - uint64_t obj; - - obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; - while (obj != - dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { - dsl_dataset_t *snap; - error = dsl_dataset_hold_obj(dp, obj, FTAG, - &snap); - if (error != 0) - break; - if (snap->ds_dir != origin_head->ds_dir) - error = SET_ERROR(EINVAL); - if (error == 0) { - error = dsl_destroy_snapshot_check_impl( - snap, B_FALSE); - } - obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); - if (error != 0) - break; - } - if (error != 0) { - dsl_dataset_rele(origin_head, FTAG); - return (error); - } - } - if (drc->drc_keynvl != NULL) { - error = dsl_crypto_recv_raw_key_check(drc->drc_ds, - drc->drc_keynvl, tx); - if (error != 0) { - dsl_dataset_rele(origin_head, FTAG); - return (error); - } + if (fromsnap != 0) { + err = dsl_dataset_hold_obj_flags(dspp.dp, fromsnap, dsflags, + FTAG, &fromds); + if (err != 0) { + dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); + dsl_pool_rele(dspp.dp, FTAG); + return (err); } - - error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, - origin_head, drc->drc_force, 
drc->drc_owner, tx); - if (error != 0) { - dsl_dataset_rele(origin_head, FTAG); - return (error); + dspp.ancestor_zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; + dspp.ancestor_zb.zbm_creation_txg = + dsl_dataset_phys(fromds)->ds_creation_txg; + dspp.ancestor_zb.zbm_creation_time = + dsl_dataset_phys(fromds)->ds_creation_time; + /* See dmu_send for the reasons behind this. */ + uint64_t *fromredact; + + if (!dsl_dataset_get_uint64_array_feature(fromds, + SPA_FEATURE_REDACTED_DATASETS, + &dspp.numfromredactsnaps, + &fromredact)) { + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + } else if (dspp.numfromredactsnaps > 0) { + uint64_t size = dspp.numfromredactsnaps * + sizeof (uint64_t); + dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP); + bcopy(fromredact, dspp.fromredactsnaps, size); + } + + if (!dsl_dataset_is_before(dspp.to_ds, fromds, 0)) { + err = SET_ERROR(EXDEV); + } else { + dspp.is_clone = (dspp.to_ds->ds_dir != + fromds->ds_dir); + dsl_dataset_rele(fromds, FTAG); + err = dmu_send_impl(&dspp); } - error = dsl_dataset_snapshot_check_impl(origin_head, - drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); - dsl_dataset_rele(origin_head, FTAG); - if (error != 0) - return (error); - - error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { - error = dsl_dataset_snapshot_check_impl(drc->drc_ds, - drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + err = dmu_send_impl(&dspp); } - return (error); + dsl_dataset_rele(dspp.to_ds, FTAG); + return (err); } -static void -dmu_recv_end_sync(void *arg, dmu_tx_t *tx) +int +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, + uint64_t resumeobj, uint64_t resumeoff, const char *redactbook, int outfd, + offset_t *off, dmu_send_outparams_t *dsop) { - dmu_recv_cookie_t *drc = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0; - - 
spa_history_log_internal_ds(drc->drc_ds, "finish receiving", - tx, "snap=%s", drc->drc_tosnap); - drc->drc_ds->ds_objset->os_raw_receive = B_FALSE; - - if (!drc->drc_newfs) { - dsl_dataset_t *origin_head; - - VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, - &origin_head)); - - if (drc->drc_force) { - /* - * Destroy any snapshots of drc_tofs (origin_head) - * after the origin (the snap before drc_ds). - */ - uint64_t obj; - - obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; - while (obj != - dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { - dsl_dataset_t *snap; - VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, - &snap)); - ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); - obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - dsl_destroy_snapshot_sync_impl(snap, - B_FALSE, tx); - dsl_dataset_rele(snap, FTAG); - } - } - if (drc->drc_keynvl != NULL) { - dsl_crypto_recv_raw_key_sync(drc->drc_ds, - drc->drc_keynvl, tx); - nvlist_free(drc->drc_keynvl); - drc->drc_keynvl = NULL; - } - - VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev); - - dsl_dataset_clone_swap_sync_impl(drc->drc_ds, - origin_head, tx); - dsl_dataset_snapshot_sync_impl(origin_head, - drc->drc_tosnap, tx); + int err = 0; + ds_hold_flags_t dsflags = (rawok) ? 
0 : DS_HOLD_FLAG_DECRYPT; + boolean_t owned = B_FALSE; + dsl_dataset_t *fromds = NULL; + zfs_bookmark_phys_t book = {0}; + struct dmu_send_params dspp = {0}; + dspp.tosnap = tosnap; + dspp.embedok = embedok; + dspp.large_block_ok = large_block_ok; + dspp.compressok = compressok; + dspp.outfd = outfd; + dspp.off = off; + dspp.dso = dsop; + dspp.tag = FTAG; + dspp.resumeobj = resumeobj; + dspp.resumeoff = resumeoff; + dspp.rawok = rawok; - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); - dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = - drc->drc_drrb->drr_creation_time; - dsl_dataset_phys(origin_head->ds_prev)->ds_guid = - drc->drc_drrb->drr_toguid; - dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= - ~DS_FLAG_INCONSISTENT; + if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) + return (SET_ERROR(EINVAL)); - dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - dsl_dataset_phys(origin_head)->ds_flags &= - ~DS_FLAG_INCONSISTENT; + err = dsl_pool_hold(tosnap, FTAG, &dspp.dp); + if (err != 0) + return (err); - drc->drc_newsnapobj = - dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + if (strchr(tosnap, '@') == NULL && spa_writeable(dspp.dp->dp_spa)) { + /* + * We are sending a filesystem or volume. Ensure + * that it doesn't change by owning the dataset. 
+ */ + err = dsl_dataset_own(dspp.dp, tosnap, dsflags, FTAG, + &dspp.to_ds); + owned = B_TRUE; + } else { + err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, + &dspp.to_ds); + } - dsl_dataset_rele(origin_head, FTAG); - dsl_destroy_head_sync_impl(drc->drc_ds, tx); + if (err != 0) { + dsl_pool_rele(dspp.dp, FTAG); + return (err); + } - if (drc->drc_owner != NULL) - VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); - } else { - dsl_dataset_t *ds = drc->drc_ds; - - dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); - - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - dsl_dataset_phys(ds->ds_prev)->ds_creation_time = - drc->drc_drrb->drr_creation_time; - dsl_dataset_phys(ds->ds_prev)->ds_guid = - drc->drc_drrb->drr_toguid; - dsl_dataset_phys(ds->ds_prev)->ds_flags &= - ~DS_FLAG_INCONSISTENT; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; - if (dsl_dataset_has_resume_receive_state(ds)) { - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_FROMGUID, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OBJECT, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OFFSET, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_BYTES, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TOGUID, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TONAME, tx); + if (redactbook != NULL) { + char path[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(path, tosnap, sizeof (path)); + char *at = strchr(path, '@'); + if (at == NULL) { + err = EINVAL; + } else { + (void) snprintf(at, sizeof (path) - (at - path), "#%s", + redactbook); + err = dsl_bookmark_lookup(dspp.dp, path, + NULL, &book); + dspp.redactbook = &book; } - drc->drc_newsnapobj = - dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; } - zvol_create_minors(dp->dp_spa, 
drc->drc_tofs, B_TRUE); - /* - * Release the hold from dmu_recv_begin. This must be done before - * we return to open context, so that when we free the dataset's dnode - * we can evict its bonus buffer. Since the dataset may be destroyed - * at this point (and therefore won't have a valid pointer to the spa) - * we release the key mapping manually here while we do have a valid - * pointer, if it exists. - */ - if (!drc->drc_raw && encrypted) { - (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa, - drc->drc_ds->ds_object, drc->drc_ds); + if (err != 0) { + dsl_pool_rele(dspp.dp, FTAG); + if (owned) + dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); + else + dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); + return (err); } - dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag); - drc->drc_ds = NULL; -} - -static int -add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj, - boolean_t raw) -{ - dsl_pool_t *dp; - dsl_dataset_t *snapds; - guid_map_entry_t *gmep; - objset_t *os; - ds_hold_flags_t dsflags = (raw) ? 0 : DS_HOLD_FLAG_DECRYPT; - int err; - ASSERT(guid_map != NULL); + if (fromsnap != NULL) { + zfs_bookmark_phys_t *zb = &dspp.ancestor_zb; + int fsnamelen; + if (strpbrk(tosnap, "@#") != NULL) + fsnamelen = strpbrk(tosnap, "@#") - tosnap; + else + fsnamelen = strlen(tosnap); - err = dsl_pool_hold(name, FTAG, &dp); - if (err != 0) - return (err); - gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); - err = dsl_dataset_own_obj(dp, snapobj, dsflags, gmep, &snapds); - if (err == 0) { /* - * If this is a deduplicated raw send stream, we need - * to make sure that we can still read raw blocks from - * earlier datasets in the stream, so we set the - * os_raw_receive flag now. + * If the fromsnap is in a different filesystem, then + * mark the send stream as a clone. 
*/ - if (raw) { - err = dmu_objset_from_ds(snapds, &os); + if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || + (fromsnap[fsnamelen] != '@' && + fromsnap[fsnamelen] != '#')) { + dspp.is_clone = B_TRUE; + } + + if (strchr(fromsnap, '@') != NULL) { + err = dsl_dataset_hold(dspp.dp, fromsnap, FTAG, + &fromds); + if (err != 0) { - dsl_dataset_disown(snapds, dsflags, FTAG); - dsl_pool_rele(dp, FTAG); - kmem_free(gmep, sizeof (*gmep)); - return (err); + ASSERT3P(fromds, ==, NULL); + } else { + /* + * We need to make a deep copy of the redact + * snapshots of the from snapshot, because the + * array will be freed when we evict from_ds. + */ + uint64_t *fromredact; + if (!dsl_dataset_get_uint64_array_feature( + fromds, SPA_FEATURE_REDACTED_DATASETS, + &dspp.numfromredactsnaps, + &fromredact)) { + dspp.numfromredactsnaps = + NUM_SNAPS_NOT_REDACTED; + } else if (dspp.numfromredactsnaps > 0) { + uint64_t size = + dspp.numfromredactsnaps * + sizeof (uint64_t); + dspp.fromredactsnaps = kmem_zalloc(size, + KM_SLEEP); + bcopy(fromredact, dspp.fromredactsnaps, + size); + } + if (!dsl_dataset_is_before(dspp.to_ds, fromds, + 0)) { + err = SET_ERROR(EXDEV); + } else { + ASSERT3U(dspp.is_clone, ==, + (dspp.to_ds->ds_dir != + fromds->ds_dir)); + zb->zbm_creation_txg = + dsl_dataset_phys(fromds)-> + ds_creation_txg; + zb->zbm_creation_time = + dsl_dataset_phys(fromds)-> + ds_creation_time; + zb->zbm_guid = + dsl_dataset_phys(fromds)->ds_guid; + zb->zbm_redaction_obj = 0; + } + dsl_dataset_rele(fromds, FTAG); } - os->os_raw_receive = B_TRUE; + } else { + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + err = dsl_bookmark_lookup(dspp.dp, fromsnap, dspp.to_ds, + zb); + if (err == EXDEV && zb->zbm_redaction_obj != 0 && + zb->zbm_guid == + dsl_dataset_phys(dspp.to_ds)->ds_guid) + err = 0; } - gmep->raw = raw; - gmep->guid = dsl_dataset_phys(snapds)->ds_guid; - gmep->gme_ds = snapds; - avl_add(guid_map, gmep); + if (err == 0) { + /* dmu_send_impl will call dsl_pool_rele for us. 
*/ + err = dmu_send_impl(&dspp); + } else { + dsl_pool_rele(dspp.dp, FTAG); + } } else { - kmem_free(gmep, sizeof (*gmep)); + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + err = dmu_send_impl(&dspp); } - - dsl_pool_rele(dp, FTAG); + if (owned) + dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); + else + dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); return (err); } -static int dmu_recv_end_modified_blocks = 3; - static int -dmu_recv_existing_end(dmu_recv_cookie_t *drc) +dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, + uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) { -#ifdef _KERNEL + int err = 0; + uint64_t size; /* - * We will be destroying the ds; make sure its origin is unmounted if - * necessary. + * Assume that space (both on-disk and in-stream) is dominated by + * data. We will adjust for indirect blocks and the copies property, + * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). */ - char name[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(drc->drc_ds, name); - zfs_destroy_unmount_origin(name); -#endif - return (dsl_sync_task(drc->drc_tofs, - dmu_recv_end_check, dmu_recv_end_sync, drc, - dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); -} + uint64_t recordsize; + uint64_t record_count; + objset_t *os; + VERIFY0(dmu_objset_from_ds(ds, &os)); -static int -dmu_recv_new_end(dmu_recv_cookie_t *drc) -{ - return (dsl_sync_task(drc->drc_tofs, - dmu_recv_end_check, dmu_recv_end_sync, drc, - dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); + /* Assume all (uncompressed) blocks are recordsize. 
*/ + if (zfs_override_estimate_recordsize != 0) { + recordsize = zfs_override_estimate_recordsize; + } else if (os->os_phys->os_type == DMU_OST_ZVOL) { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); + } else { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); + } + if (err != 0) + return (err); + record_count = uncompressed / recordsize; + + /* + * If we're estimating a send size for a compressed stream, use the + * compressed data size to estimate the stream size. Otherwise, use the + * uncompressed data size. + */ + size = stream_compressed ? compressed : uncompressed; + + /* + * Subtract out approximate space used by indirect blocks. + * Assume most space is used by data blocks (non-indirect, non-dnode). + * Assume no ditto blocks or internal fragmentation. + * + * Therefore, space used by indirect blocks is sizeof(blkptr_t) per + * block. + */ + size -= record_count * sizeof (blkptr_t); + + /* Add in the space for the record associated with each block. 
*/ + size += record_count * sizeof (dmu_replay_record_t); + + *sizep = size; + + return (0); } int -dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) +dmu_send_estimate_fast(dsl_dataset_t *ds, dsl_dataset_t *fromds, + zfs_bookmark_phys_t *frombook, boolean_t stream_compressed, uint64_t *sizep) { - int error; + int err; + uint64_t uncomp, comp; - drc->drc_owner = owner; + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + ASSERT(fromds == NULL || frombook == NULL); - if (drc->drc_newfs) - error = dmu_recv_new_end(drc); - else - error = dmu_recv_existing_end(drc); - - if (error != 0) { - dmu_recv_cleanup_ds(drc); - nvlist_free(drc->drc_keynvl); - } else if (drc->drc_guid_to_ds_map != NULL) { - (void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map, - drc->drc_newsnapobj, drc->drc_raw); + /* tosnap must be a snapshot */ + if (!ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + if (fromds != NULL) { + uint64_t used; + if (!fromds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + if (!dsl_dataset_is_before(ds, fromds, 0)) + return (SET_ERROR(EXDEV)); + + err = dsl_dataset_space_written(fromds, ds, &used, &comp, + &uncomp); + if (err != 0) + return (err); + } else if (frombook != NULL) { + uint64_t used; + err = dsl_dataset_space_written_bookmark(frombook, ds, &used, + &comp, &uncomp); + if (err != 0) + return (err); + } else { + uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; + comp = dsl_dataset_phys(ds)->ds_compressed_bytes; } - return (error); -} -/* - * Return TRUE if this objset is currently being received into. - */ -boolean_t -dmu_objset_is_receiving(objset_t *os) -{ - return (os->os_dsl_dataset != NULL && - os->os_dsl_dataset->ds_owner == dmu_recv_tag); + err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, + stream_compressed, sizep); + /* + * Add the size of the BEGIN and END records to the estimate. 
+ */ + *sizep += 2 * sizeof (dmu_replay_record_t); + return (err); } #if defined(_KERNEL) -/* BEGIN CSTYLED */ -module_param(zfs_override_estimate_recordsize, ulong, 0644); -MODULE_PARM_DESC(zfs_override_estimate_recordsize, - "Record size calculation override for zfs send estimates"); -/* END CSTYLED */ - module_param(zfs_send_corrupt_data, int, 0644); MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data"); module_param(zfs_send_queue_length, int, 0644); MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length"); -module_param(zfs_recv_queue_length, int, 0644); -MODULE_PARM_DESC(zfs_recv_queue_length, "Maximum receive queue length"); +module_param(zfs_send_no_prefetch_queue_length, int, 0644); +MODULE_PARM_DESC(zfs_send_no_prefetch_queue_length, + "Maximum send queue length for non-prefetch queues"); + +module_param(zfs_send_queue_ff, int, 0644); +MODULE_PARM_DESC(zfs_send_queue_ff, "Send queue fill fraction"); + +module_param(zfs_send_no_prefetch_queue_ff, int, 0644); +MODULE_PARM_DESC(zfs_send_no_prefetch_queue_ff, + "Send queue fill fraction for non-prefetch queues"); + +/* BEGIN CSTYLED */ +module_param(zfs_override_estimate_recordsize, ulong, 0644); +MODULE_PARM_DESC(zfs_override_estimate_recordsize, + "Override block size estimate with fixed size"); +/* END CSTYLED */ #endif diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index f4265209918e..2a74d569a28c 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -67,8 +67,8 @@ typedef struct traverse_data { boolean_t td_realloc_possible; } traverse_data_t; -static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - uint64_t objset, uint64_t object); +static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp, + const dnode_phys_t *dnp, uint64_t objset, uint64_t object); static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, uint64_t objset, uint64_t object); @@ -194,6 +194,7 @@ 
traverse_prefetch_metadata(traverse_data_t *td, return; if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return; + ASSERT(!BP_IS_REDACTED(bp)); if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) zio_flags |= ZIO_FLAG_RAW; @@ -207,7 +208,7 @@ prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) { ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || - BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp)) return (B_FALSE); return (B_TRUE); } @@ -274,7 +275,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, mutex_exit(&pd->pd_mtx); } - if (BP_IS_HOLE(bp)) { + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err != 0) goto post; @@ -354,7 +355,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { - err = traverse_dnode(td, &child_dnp[i], + err = traverse_dnode(td, bp, &child_dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; @@ -395,19 +396,19 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_objset, DMU_USERUSED_OBJECT); } - err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, + err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) { if (OBJSET_BUF_HAS_PROJECTUSED(buf)) - err = traverse_dnode(td, + err = traverse_dnode(td, bp, &osp->os_projectused_dnode, zb->zb_objset, DMU_PROJECTUSED_OBJECT); if (err == 0) - err = traverse_dnode(td, + err = traverse_dnode(td, bp, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); if (err == 0) - err = traverse_dnode(td, + err = traverse_dnode(td, bp, &osp->os_userused_dnode, zb->zb_objset, DMU_USERUSED_OBJECT); } @@ -475,7 +476,7 @@ prefetch_dnode_metadata(traverse_data_t *td, const 
dnode_phys_t *dnp, } static int -traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, +traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j, err = 0; @@ -488,7 +489,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, if (td->td_flags & TRAVERSE_PRE) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); - err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + err = td->td_func(td->td_spa, NULL, bp, &czb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); @@ -511,7 +512,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, if (err == 0 && (td->td_flags & TRAVERSE_POST)) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); - err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + err = td->td_func(td->td_spa, NULL, bp, &czb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); @@ -532,7 +533,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, ARC_FLAG_PRESCIENT_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); - if (bp == NULL) + if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (pfd->pd_cancel) return (SET_ERROR(EINTR)); @@ -635,6 +636,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, uint32_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; + ASSERT(!BP_IS_REDACTED(rootbp)); if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(rootbp)) diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index f0459e47d8c3..25085c6fb9eb 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -31,7 +31,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -753,8 +755,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) if (dn->dn_num_slots > DNODE_MIN_SLOTS) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; mutex_enter(&ds->ds_lock); - ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] = 
- B_TRUE; + ds->ds_feature_activation[SPA_FEATURE_LARGE_DNODE] = + (void *)B_TRUE; mutex_exit(&ds->ds_lock); } diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index 0a58115341c7..a7904cc92cb1 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, @@ -54,13 +56,15 @@ dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, /* * Returns ESRCH if bookmark is not found. + * Note, we need to use the ZAP rather than the AVL to look up bookmarks + * by name, because only the ZAP honors the casesensitivity setting. */ -static int -dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, +int +dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname, zfs_bookmark_phys_t *bmark_phys) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; + uint64_t bmark_zapobj = ds->ds_bookmarks_obj; matchtype_t mt = 0; int err; @@ -70,16 +74,23 @@ dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; - err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), - sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, - NULL, 0, NULL); + /* + * Zero it in case this is an older format bookmark which + * has fewer entries than the current format. + */ + bzero(bmark_phys, sizeof (*bmark_phys)); + + err = zap_lookup_norm(mos, bmark_zapobj, shortname, + sizeof (uint64_t), sizeof (*bmark_phys) / sizeof (uint64_t), + bmark_phys, mt, NULL, 0, NULL); return (err == ENOENT ? 
ESRCH : err); } /* * If later_ds is non-NULL, this will return EXDEV if the the specified bookmark - * does not represents an earlier point in later_ds's timeline. + * does not represent an earlier point in later_ds's timeline. However, + * bmp will still be filled in if we return EXDEV. * * Returns ENOENT if the dataset containing the bookmark does not exist. * Returns ESRCH if the dataset exists but the bookmark was not found in it. @@ -96,7 +107,7 @@ dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, if (error != 0) return (error); - error = dsl_dataset_bmark_lookup(ds, shortname, bmp); + error = dsl_bookmark_lookup_impl(ds, shortname, bmp); if (error == 0 && later_ds != NULL) { if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) error = SET_ERROR(EXDEV); @@ -105,6 +116,15 @@ dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, return (error); } +typedef struct dsl_bookmark_create_redacted_arg { + const char *dbcra_bmark; + const char *dbcra_snap; + redaction_list_t **dbcra_rl; + uint64_t dbcra_numsnaps; + uint64_t *dbcra_snaps; + void *dbcra_tag; +} dsl_bookmark_create_redacted_arg_t; + typedef struct dsl_bookmark_create_arg { nvlist_t *dbca_bmarks; nvlist_t *dbca_errors; @@ -133,7 +153,7 @@ dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, return (SET_ERROR(EINVAL)); } - error = dsl_dataset_bmark_lookup(bmark_fs, shortname, + error = dsl_bookmark_lookup_impl(bmark_fs, shortname, &bmark_phys); dsl_dataset_rele(bmark_fs, FTAG); if (error == 0) @@ -176,57 +196,187 @@ dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) return (rv); } +static dsl_bookmark_node_t * +dsl_bookmark_node_alloc(char *shortname) +{ + dsl_bookmark_node_t *dbn = kmem_alloc(sizeof (*dbn), KM_SLEEP); + dbn->dbn_name = spa_strdup(shortname); + dbn->dbn_dirty = B_FALSE; + mutex_init(&dbn->dbn_lock, NULL, MUTEX_DEFAULT, NULL); + return (dbn); +} + +/* + * Set the fields in the zfs_bookmark_phys_t based on the specified snapshot.
+ */ static void -dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) +dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap) +{ + spa_t *spa = dsl_dataset_get_spa(snap); + dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap); + zbm->zbm_guid = dsp->ds_guid; + zbm->zbm_creation_txg = dsp->ds_creation_txg; + zbm->zbm_creation_time = dsp->ds_creation_time; + zbm->zbm_redaction_obj = 0; + + if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_WRITTEN)) { + zbm->zbm_flags = ZBM_FLAG_SNAPSHOT_EXISTS | ZBM_FLAG_HAS_FBN; + zbm->zbm_referenced_bytes_refd = dsp->ds_referenced_bytes; + zbm->zbm_compressed_bytes_refd = dsp->ds_compressed_bytes; + zbm->zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes; + + dsl_dataset_t *nextds; + VERIFY0(dsl_dataset_hold_obj(snap->ds_dir->dd_pool, + dsp->ds_next_snap_obj, FTAG, &nextds)); + dsl_deadlist_space(&nextds->ds_deadlist, + &zbm->zbm_referenced_freed_before_next_snap, + &zbm->zbm_compressed_freed_before_next_snap, + &zbm->zbm_uncompressed_freed_before_next_snap); + dsl_dataset_rele(nextds, FTAG); + } else { + bzero(&zbm->zbm_flags, + sizeof (zfs_bookmark_phys_t) - + offsetof(zfs_bookmark_phys_t, zbm_flags)); + } +} + +void +dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn, + dmu_tx_t *tx) { - dsl_bookmark_create_arg_t *dbca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; - ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); + if (hds->ds_bookmarks_obj == 0) { + hds->ds_bookmarks_obj = zap_create_norm(mos, + U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, + tx); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + + dsl_dataset_zapify(hds, tx); + VERIFY0(zap_add(mos, hds->ds_object, + DS_FIELD_BOOKMARK_NAMES, + sizeof (hds->ds_bookmarks_obj), 1, + &hds->ds_bookmarks_obj, tx)); + } - for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - dsl_dataset_t 
*snapds, *bmark_fs; - zfs_bookmark_phys_t bmark_phys; - char *shortname; + avl_add(&hds->ds_bookmarks, dbn); + + /* + * To maintain backwards compatibility with software that doesn't + * understand SPA_FEATURE_REDACTION_BOOKMARKS or + * SPA_FEATURE_BOOKMARK_WRITTEN, we need to use the smallest of + * the 3 possible bookmark sizes: + * - original (ends before zbm_redaction_obj) + * - redaction (ends before zbm_flags) + * - current / written (ends at end of struct) + */ + uint64_t bookmark_phys_size = offsetof(zfs_bookmark_phys_t, + zbm_redaction_obj); + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) + bookmark_phys_size = sizeof (zfs_bookmark_phys_t); + else if (dbn->dbn_phys.zbm_redaction_obj != 0) + bookmark_phys_size = offsetof(zfs_bookmark_phys_t, zbm_flags); + + zfs_bookmark_phys_t zero_phys = { 0 }; + ASSERTV(!bcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size, + &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size)); + + VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name, + sizeof (uint64_t), bookmark_phys_size / sizeof (uint64_t), + &dbn->dbn_phys, tx)); +} - VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair), - FTAG, &snapds)); - VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), - &bmark_fs, FTAG, &shortname)); - if (bmark_fs->ds_bookmarks == 0) { - bmark_fs->ds_bookmarks = - zap_create_norm(mos, U8_TEXTPREP_TOUPPER, - DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); - spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); - - dsl_dataset_zapify(bmark_fs, tx); - VERIFY0(zap_add(mos, bmark_fs->ds_object, - DS_FIELD_BOOKMARK_NAMES, - sizeof (bmark_fs->ds_bookmarks), 1, - &bmark_fs->ds_bookmarks, tx)); +/* + * If redaction_list is non-null, we create a redacted bookmark and redaction + * list, and store the object number of the redaction list in redact_obj. 
+ */ +static void +dsl_bookmark_create_sync_impl(const char *bookmark, const char *snapshot, + dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, void *tag, + redaction_list_t **redaction_list) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *snapds, *bmark_fs; + char *shortname; + boolean_t bookmark_redacted; + uint64_t *dsredactsnaps; + uint64_t dsnumsnaps; + + VERIFY0(dsl_dataset_hold(dp, snapshot, FTAG, &snapds)); + VERIFY0(dsl_bookmark_hold_ds(dp, bookmark, &bmark_fs, FTAG, + &shortname)); + + dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(shortname); + dsl_bookmark_set_phys(&dbn->dbn_phys, snapds); + + bookmark_redacted = dsl_dataset_get_uint64_array_feature(snapds, + SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps); + if (redaction_list != NULL || bookmark_redacted) { + redaction_list_t *local_rl; + if (bookmark_redacted) { + redact_snaps = dsredactsnaps; + num_redact_snaps = dsnumsnaps; } + dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) + + num_redact_snaps * sizeof (uint64_t), tx); + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + + VERIFY0(dsl_redaction_list_hold_obj(dp, + dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl)); + dsl_redaction_list_long_hold(dp, local_rl, tag); + + ASSERT3U((local_rl)->rl_dbuf->db_size, >=, + sizeof (redaction_list_phys_t) + num_redact_snaps * + sizeof (uint64_t)); + dmu_buf_will_dirty(local_rl->rl_dbuf, tx); + bcopy(redact_snaps, local_rl->rl_phys->rlp_snaps, + sizeof (uint64_t) * num_redact_snaps); + local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; + if (bookmark_redacted) { + ASSERT3P(redaction_list, ==, NULL); + local_rl->rl_phys->rlp_last_blkid = UINT64_MAX; + local_rl->rl_phys->rlp_last_object = UINT64_MAX; + dsl_redaction_list_long_rele(local_rl, tag); + dsl_redaction_list_rele(local_rl, tag); + } else { + 
*redaction_list = local_rl; + } + } - bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid; - bmark_phys.zbm_creation_txg = - dsl_dataset_phys(snapds)->ds_creation_txg; - bmark_phys.zbm_creation_time = - dsl_dataset_phys(snapds)->ds_creation_time; + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } - VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks, - shortname, sizeof (uint64_t), - sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), - &bmark_phys, tx)); + dsl_bookmark_node_add(bmark_fs, dbn, tx); - spa_history_log_internal_ds(bmark_fs, "bookmark", tx, - "name=%s creation_txg=%llu target_snap=%llu", - shortname, - (longlong_t)bmark_phys.zbm_creation_txg, - (longlong_t)snapds->ds_object); + spa_history_log_internal_ds(bmark_fs, "bookmark", tx, + "name=%s creation_txg=%llu target_snap=%llu redact_obj=%llu", + shortname, (longlong_t)dbn->dbn_phys.zbm_creation_txg, + (longlong_t)snapds->ds_object, + (longlong_t)dbn->dbn_phys.zbm_redaction_obj); - dsl_dataset_rele(bmark_fs, FTAG); - dsl_dataset_rele(snapds, FTAG); + dsl_dataset_rele(bmark_fs, FTAG); + dsl_dataset_rele(snapds, FTAG); +} + +static void +dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_bookmark_create_arg_t *dbca = arg; + + ASSERTV(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); + + for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { + dsl_bookmark_create_sync_impl(nvpair_name(pair), + fnvpair_value_string(pair), tx, 0, NULL, NULL, NULL); } } @@ -251,53 +401,273 @@ dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); } -int -dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) +static int +dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx) { - int err = 0; - zap_cursor_t zc; - zap_attribute_t attr; - dsl_pool_t 
*dp = ds->ds_dir->dd_pool; + dsl_bookmark_create_redacted_arg_t *dbcra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *snapds; + int rv = 0; - uint64_t bmark_zapobj = ds->ds_bookmarks; - if (bmark_zapobj == 0) - return (0); + if (!spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_REDACTION_BOOKMARKS)) + return (SET_ERROR(ENOTSUP)); + /* + * If the list of redact snaps will not fit in the bonus buffer with + * the furthest reached object and offset, fail. + */ + if (dbcra->dbcra_numsnaps > (dmu_bonus_max() - + sizeof (redaction_list_phys_t)) / sizeof (uint64_t)) + return (SET_ERROR(E2BIG)); + + rv = dsl_dataset_hold(dp, dbcra->dbcra_snap, + FTAG, &snapds); + if (rv == 0) { + rv = dsl_bookmark_create_check_impl(snapds, dbcra->dbcra_bmark, + tx); + dsl_dataset_rele(snapds, FTAG); + } + return (rv); +} - for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj); - zap_cursor_retrieve(&zc, &attr) == 0; - zap_cursor_advance(&zc)) { - char *bmark_name = attr.za_name; - zfs_bookmark_phys_t bmark_phys; +static void +dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_redacted_arg_t *dbcra = arg; + dsl_bookmark_create_sync_impl(dbcra->dbcra_bmark, dbcra->dbcra_snap, tx, + dbcra->dbcra_numsnaps, dbcra->dbcra_snaps, dbcra->dbcra_tag, + dbcra->dbcra_rl); +} - err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys); - ASSERT3U(err, !=, ENOENT); - if (err != 0) - break; +int +dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot, + uint64_t numsnaps, uint64_t *snapguids, void *tag, redaction_list_t **rl) +{ + dsl_bookmark_create_redacted_arg_t dbcra; + + dbcra.dbcra_bmark = bookmark; + dbcra.dbcra_snap = snapshot; + dbcra.dbcra_rl = rl; + dbcra.dbcra_numsnaps = numsnaps; + dbcra.dbcra_snaps = snapguids; + dbcra.dbcra_tag = tag; + + return (dsl_sync_task(bookmark, dsl_bookmark_create_redacted_check, + dsl_bookmark_create_redacted_sync, &dbcra, 5, + ZFS_SPACE_CHECK_NORMAL)); +} - nvlist_t *out_props = 
fnvlist_alloc(); - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_GUID))) { +/* + * Retrieve the list of properties given in the 'props' nvlist for a bookmark. + * If 'props' is NULL, retrieves all properties. + */ +static void +dsl_bookmark_fetch_props(dsl_pool_t *dp, zfs_bookmark_phys_t *bmark_phys, + nvlist_t *props, nvlist_t *out_props) +{ + ASSERT3P(dp, !=, NULL); + ASSERT3P(bmark_phys, !=, NULL); + ASSERT3P(out_props, !=, NULL); + ASSERT(RRW_LOCK_HELD(&dp->dp_config_rwlock)); + + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_GUID, bmark_phys->zbm_guid); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATETXG))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATETXG, bmark_phys->zbm_creation_txg); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATION))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATION, bmark_phys->zbm_creation_time); + } + if (bmark_phys->zbm_flags & ZBM_FLAG_HAS_FBN) { + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_REFERENCED))) { dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_GUID, bmark_phys.zbm_guid); + ZFS_PROP_REFERENCED, + bmark_phys->zbm_referenced_bytes_refd); } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATETXG))) { + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_LOGICALREFERENCED))) { dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg); + ZFS_PROP_LOGICALREFERENCED, + bmark_phys->zbm_uncompressed_bytes_refd); } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATION))) { + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_REFRATIO))) { + uint64_t ratio = + bmark_phys->zbm_compressed_bytes_refd == 0 ? 
100 : + bmark_phys->zbm_uncompressed_bytes_refd * 100 / + bmark_phys->zbm_compressed_bytes_refd; dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATION, bmark_phys.zbm_creation_time); + ZFS_PROP_REFRATIO, ratio); + } + } + + if ((props == NULL || nvlist_exists(props, "redact_snaps") || + nvlist_exists(props, "redact_complete")) && + bmark_phys->zbm_redaction_obj != 0) { + redaction_list_t *rl; + int err = dsl_redaction_list_hold_obj(dp, + bmark_phys->zbm_redaction_obj, FTAG, &rl); + if (err == 0) { + if (nvlist_exists(props, "redact_snaps")) { + nvlist_t *nvl; + nvl = fnvlist_alloc(); + fnvlist_add_uint64_array(nvl, ZPROP_VALUE, + rl->rl_phys->rlp_snaps, + rl->rl_phys->rlp_num_snaps); + fnvlist_add_nvlist(out_props, "redact_snaps", + nvl); + nvlist_free(nvl); + } + if (nvlist_exists(props, "redact_complete")) { + nvlist_t *nvl; + nvl = fnvlist_alloc(); + fnvlist_add_boolean_value(nvl, ZPROP_VALUE, + rl->rl_phys->rlp_last_blkid == UINT64_MAX && + rl->rl_phys->rlp_last_object == UINT64_MAX); + fnvlist_add_nvlist(out_props, "redact_complete", + nvl); + nvlist_free(nvl); + } + dsl_redaction_list_rele(rl, FTAG); } + } +} + +int +dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; - fnvlist_add_nvlist(outnvl, bmark_name, out_props); + ASSERT(dsl_pool_config_held(dp)); + + if (dsl_dataset_is_snapshot(ds)) + return (SET_ERROR(EINVAL)); + + for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); + dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { + nvlist_t *out_props = fnvlist_alloc(); + + dsl_bookmark_fetch_props(dp, &dbn->dbn_phys, props, out_props); + + fnvlist_add_nvlist(outnvl, dbn->dbn_name, out_props); fnvlist_free(out_props); } + return (0); +} + +/* + * Comparison func for ds_bookmarks AVL tree. We sort the bookmarks by + * their TXG, then by their FBN-ness. 
The "FBN-ness" component ensures + * that all bookmarks at the same TXG that HAS_FBN are adjacent, which + * dsl_bookmark_destroy_sync_impl() depends on. Note that there may be + * multiple bookmarks at the same TXG (with the same FBN-ness). In this + * case we differentiate them by an arbitrary metric (in this case, + * their names). + */ +static int +dsl_bookmark_compare(const void *l, const void *r) +{ + const dsl_bookmark_node_t *ldbn = l; + const dsl_bookmark_node_t *rdbn = r; + + int64_t cmp = ldbn->dbn_phys.zbm_creation_txg - + rdbn->dbn_phys.zbm_creation_txg; + if (cmp < 0) + return (-1); + else if (cmp > 0) + return (1); + cmp = (ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) - + (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); + if (cmp < 0) + return (-1); + else if (cmp > 0) + return (1); + cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name); + if (cmp < 0) + return (-1); + else if (cmp > 0) + return (1); + return (0); +} + +/* + * Cache this (head) dataset's bookmarks in the ds_bookmarks AVL tree. 
+ */ +int +dsl_bookmark_init_ds(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + ASSERT(!ds->ds_is_snapshot); + + avl_create(&ds->ds_bookmarks, dsl_bookmark_compare, + sizeof (dsl_bookmark_node_t), + offsetof(dsl_bookmark_node_t, dbn_node)); + + if (!dsl_dataset_is_zapified(ds)) + return (0); + + int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, + sizeof (ds->ds_bookmarks_obj), 1, &ds->ds_bookmarks_obj); + if (zaperr == ENOENT) + return (0); + if (zaperr != 0) + return (zaperr); + + if (ds->ds_bookmarks_obj == 0) + return (0); + + int err = 0; + zap_cursor_t zc; + zap_attribute_t attr; + + for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); + (err = zap_cursor_retrieve(&zc, &attr)) == 0; + zap_cursor_advance(&zc)) { + dsl_bookmark_node_t *dbn = + dsl_bookmark_node_alloc(attr.za_name); + + err = dsl_bookmark_lookup_impl(ds, + dbn->dbn_name, &dbn->dbn_phys); + ASSERT3U(err, !=, ENOENT); + if (err != 0) { + kmem_free(dbn, sizeof (*dbn)); + break; + } + avl_add(&ds->ds_bookmarks, dbn); + } zap_cursor_fini(&zc); + if (err == ENOENT) + err = 0; return (err); } +void +dsl_bookmark_fini_ds(dsl_dataset_t *ds) +{ + void *cookie = NULL; + dsl_bookmark_node_t *dbn; + + if (ds->ds_is_snapshot) + return; + + while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) { + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + } + avl_destroy(&ds->ds_bookmarks); +} + /* * Retrieve the bookmarks that exist in the specified dataset, and the * requested properties of each bookmark. @@ -328,23 +698,131 @@ dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) return (err); } +/* + * Retrieve all properties for a single bookmark in the given dataset. 
+ */ +int +dsl_get_bookmark_props(const char *dsname, const char *bmname, nvlist_t *props) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + zfs_bookmark_phys_t bmark_phys; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + err = dsl_bookmark_lookup_impl(ds, bmname, &bmark_phys); + if (err != 0) + goto out; + + dsl_bookmark_fetch_props(dp, &bmark_phys, NULL, props); +out: + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); +} + typedef struct dsl_bookmark_destroy_arg { nvlist_t *dbda_bmarks; nvlist_t *dbda_success; nvlist_t *dbda_errors; } dsl_bookmark_destroy_arg_t; -static int -dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) +static void +dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name, + dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; + uint64_t bmark_zapobj = ds->ds_bookmarks_obj; matchtype_t mt = 0; + /* + * 'search' must be zeroed so that dbn_flags (which is used in + * dsl_bookmark_compare()) will be zeroed even if the on-disk + * (in ZAP) bookmark is shorter than offsetof(dbn_flags). + */ + dsl_bookmark_node_t search = { 0 }; + char realname[ZFS_MAX_DATASET_NAME_LEN]; + + /* + * Find the real name of this bookmark, which may be different + * from the given name if the dataset is case-insensitive. Then + * use the real name to find the node in the ds_bookmarks AVL tree. 
+ */ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; + VERIFY0(zap_lookup_norm(mos, bmark_zapobj, name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &search.dbn_phys, mt, realname, sizeof (realname), NULL)); + + search.dbn_name = realname; + dsl_bookmark_node_t *dbn = avl_find(&ds->ds_bookmarks, &search, NULL); + ASSERT(dbn != NULL); + + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + /* + * If this bookmark HAS_FBN, and it is before the most + * recent snapshot, then its TXG is a key in the head's + * deadlist (and all clones' heads' deadlists). If this is + * the last thing keeping the key (i.e. there are no more + * bookmarks with HAS_FBN at this TXG, and there is no + * snapshot at this TXG), then remove the key. + * + * Note that this algorithm depends on ds_bookmarks being + * sorted such that all bookmarks at the same TXG with + * HAS_FBN are adjacent (with no non-HAS_FBN bookmarks + * at the same TXG in between them). If this were not + * the case, we would need to examine *all* bookmarks + * at this TXG, rather than just the adjacent ones. 
+ */ + + dsl_bookmark_node_t *dbn_prev = + AVL_PREV(&ds->ds_bookmarks, dbn); + dsl_bookmark_node_t *dbn_next = + AVL_NEXT(&ds->ds_bookmarks, dbn); + + boolean_t more_bookmarks_at_this_txg = + (dbn_prev != NULL && dbn_prev->dbn_phys.zbm_creation_txg == + dbn->dbn_phys.zbm_creation_txg && + (dbn_prev->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) || + (dbn_next != NULL && dbn_next->dbn_phys.zbm_creation_txg == + dbn->dbn_phys.zbm_creation_txg && + (dbn_next->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); + + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS) && + !more_bookmarks_at_this_txg && + dbn->dbn_phys.zbm_creation_txg < + dsl_dataset_phys(ds)->ds_prev_snap_txg) { + dsl_dir_remove_clones_key(ds->ds_dir, + dbn->dbn_phys.zbm_creation_txg, tx); + dsl_deadlist_remove_key(&ds->ds_deadlist, + dbn->dbn_phys.zbm_creation_txg, tx); + } + + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + + if (dbn->dbn_phys.zbm_redaction_obj != 0) { + VERIFY0(dmu_object_free(mos, + dbn->dbn_phys.zbm_redaction_obj, tx)); + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + } - return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); + avl_remove(&ds->ds_bookmarks, dbn); + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + + VERIFY0(zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); } static int @@ -375,7 +853,7 @@ dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) continue; } if (error == 0) { - error = dsl_dataset_bmark_lookup(ds, shortname, &bm); + error = dsl_bookmark_lookup_impl(ds, shortname, &bm); dsl_dataset_rele(ds, FTAG); if (error == ESRCH) { /* @@ -384,6 +862,20 @@ dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) */ continue; } + if (error == 0 && bm.zbm_redaction_obj != 0) { + redaction_list_t *rl = NULL; + error = dsl_redaction_list_hold_obj(tx->tx_pool, + bm.zbm_redaction_obj, FTAG, &rl); + if (error == ENOENT) { + error = 0; + } else if (error == 0 && + 
dsl_redaction_list_long_held(rl)) { + error = SET_ERROR(EBUSY); + } + if (rl != NULL) { + dsl_redaction_list_rele(rl, FTAG); + } + } } if (error == 0) { if (dmu_tx_is_syncing(tx)) { @@ -413,18 +905,17 @@ dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), &ds, FTAG, &shortname)); - VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx)); + dsl_bookmark_destroy_sync_impl(ds, shortname, tx); /* * If all of this dataset's bookmarks have been destroyed, * free the zap object and decrement the feature's use count. */ - VERIFY0(zap_count(mos, ds->ds_bookmarks, - &zap_cnt)); + VERIFY0(zap_count(mos, ds->ds_bookmarks_obj, &zap_cnt)); if (zap_cnt == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); - VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); - ds->ds_bookmarks = 0; + VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx)); + ds->ds_bookmarks_obj = 0; spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); VERIFY0(zap_remove(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, tx)); @@ -459,3 +950,559 @@ dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) fnvlist_free(dbda.dbda_success); return (rv); } + +/* Return B_TRUE if there are any long holds on this redaction list.
*/ +boolean_t +dsl_redaction_list_long_held(redaction_list_t *rl) +{ + return (!refcount_is_zero(&rl->rl_longholds)); +} + +void +dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, void *tag) +{ + ASSERT(dsl_pool_config_held(dp)); + (void) refcount_add(&rl->rl_longholds, tag); +} + +void +dsl_redaction_list_long_rele(redaction_list_t *rl, void *tag) +{ + (void) refcount_remove(&rl->rl_longholds, tag); +} + +/* ARGSUSED */ +static void +redaction_list_evict_sync(void *rlu) +{ + redaction_list_t *rl = rlu; + refcount_destroy(&rl->rl_longholds); + + kmem_free(rl, sizeof (redaction_list_t)); +} + +void +dsl_redaction_list_rele(redaction_list_t *rl, void *tag) +{ + dmu_buf_rele(rl->rl_dbuf, tag); +} + +int +dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag, + redaction_list_t **rlp) +{ + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + redaction_list_t *rl; + int err; + + ASSERT(dsl_pool_config_held(dp)); + + err = dmu_bonus_hold(mos, rlobj, tag, &dbuf); + if (err != 0) + return (err); + + rl = dmu_buf_get_user(dbuf); + if (rl == NULL) { + redaction_list_t *winner = NULL; + + rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP); + rl->rl_dbuf = dbuf; + rl->rl_object = rlobj; + rl->rl_phys = dbuf->db_data; + rl->rl_mos = dp->dp_meta_objset; + refcount_create(&rl->rl_longholds); + dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL, + &rl->rl_dbuf); + if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) { + kmem_free(rl, sizeof (*rl)); + rl = winner; + } + } + *rlp = rl; + return (0); +} + +/* + * Snapshot ds is being destroyed. + * + * Adjust the "freed_before_next" of any bookmarks between this snap + * and the previous snapshot, because their "next snapshot" is changing. + * + * If there are any bookmarks with HAS_FBN at this snapshot, remove + * their HAS_SNAP flag (note: there can be at most one snapshot of + * each filesystem at a given txg), and return B_TRUE. 
In this case + * the caller can not remove the key in the deadlist at this TXG, because + * the HAS_FBN bookmarks require the key be there. + * + * Returns B_FALSE if there are no bookmarks with HAS_FBN at this + * snapshot's TXG. In this case the caller can remove the key in the + * deadlist at this TXG. + */ +boolean_t +dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + dsl_dataset_t *head, *next; + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &head)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &next)); + + /* + * Find the first bookmark that HAS_FBN at or after the + * previous snapshot. + */ + dsl_bookmark_node_t search = { 0 }; + avl_index_t idx; + search.dbn_phys.zbm_creation_txg = + dsl_dataset_phys(ds)->ds_prev_snap_txg; + search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; + /* + * The empty-string name can't be in the AVL, and it compares + * before any entries with this TXG. + */ + search.dbn_name = ""; + VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); + dsl_bookmark_node_t *dbn = + avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); + + /* + * Iterate over all bookmarks that are at or after the previous + * snapshot, and before this (being deleted) snapshot. Adjust + * their FBN based on their new next snapshot. + */ + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg < + dsl_dataset_phys(ds)->ds_creation_txg; + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) + continue; + /* + * Increase our FBN by the amount of space that was live + * (referenced) at the time of this bookmark (i.e. + * birth <= zbm_creation_txg), and killed between this + * (being deleted) snapshot and the next snapshot (i.e. + * on the next snapshot's deadlist). (Space killed before + * this are already on our FBN.) 
+ */ + uint64_t referenced, compressed, uncompressed; + dsl_deadlist_space_range(&next->ds_deadlist, + 0, dbn->dbn_phys.zbm_creation_txg, + &referenced, &compressed, &uncompressed); + dbn->dbn_phys.zbm_referenced_freed_before_next_snap += + referenced; + dbn->dbn_phys.zbm_compressed_freed_before_next_snap += + compressed; + dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += + uncompressed; + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + } + dsl_dataset_rele(next, FTAG); + + /* + * There may be several bookmarks at this txg (the TXG of the + * snapshot being deleted). We need to clear the SNAPSHOT_EXISTS + * flag on all of them, and return TRUE if there is at least 1 + * bookmark here with HAS_FBN (thus preventing the deadlist + * key from being removed). + */ + boolean_t rv = B_FALSE; + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == + dsl_dataset_phys(ds)->ds_creation_txg; + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { + ASSERT(!(dbn->dbn_phys.zbm_flags & + ZBM_FLAG_SNAPSHOT_EXISTS)); + continue; + } + ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS); + dbn->dbn_phys.zbm_flags &= ~ZBM_FLAG_SNAPSHOT_EXISTS; + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + rv = B_TRUE; + } + dsl_dataset_rele(head, FTAG); + return (rv); +} + +/* + * A snapshot is being created of this (head) dataset. + * + * We don't keep keys in the deadlist for the most recent snapshot, or any + * bookmarks at or after it, because there can't be any blocks on the + * deadlist in this range. Now that the most recent snapshot is after + * all bookmarks, we need to add these keys. 
Note that the caller always + * adds a key at the previous snapshot, so we only add keys for bookmarks + * after that. + */ +void +dsl_bookmark_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t last_key_added = UINT64_MAX; + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg > + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + uint64_t creation_txg = dbn->dbn_phys.zbm_creation_txg; + ASSERT3U(creation_txg, <=, last_key_added); + /* + * Note, there may be multiple bookmarks at this TXG, + * and we only want to add the key for this TXG once. + * The ds_bookmarks AVL is sorted by TXG, so we will visit + * these bookmarks in sequence. + */ + if ((dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) && + creation_txg != last_key_added) { + dsl_deadlist_add_key(&ds->ds_deadlist, + creation_txg, tx); + last_key_added = creation_txg; + } + } +} + +/* + * The next snapshot of the origin dataset has changed, due to + * promote or clone swap. If there are any bookmarks at this dataset, + * we need to update their zbm_*_freed_before_next_snap to reflect this. + * The head dataset has the relevant bookmarks in ds_bookmarks. + */ +void +dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + + /* + * Find the first bookmark that HAS_FBN at the origin snapshot. + */ + dsl_bookmark_node_t search = { 0 }; + avl_index_t idx; + search.dbn_phys.zbm_creation_txg = + dsl_dataset_phys(origin)->ds_creation_txg; + search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; + /* + * The empty-string name can't be in the AVL, and it compares + * before any entries with this TXG. + */ + search.dbn_name = ""; + VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); + dsl_bookmark_node_t *dbn = + avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); + + /* + * Iterate over all bookmarks that are at the origin txg. 
+ * Adjust their FBN based on their new next snapshot. + */ + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == + dsl_dataset_phys(origin)->ds_creation_txg && + (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + + /* + * Bookmark is at the origin, therefore its + * "next dataset" is changing, so we need + * to reset its FBN by recomputing it in + * dsl_bookmark_set_phys(). + */ + ASSERT3U(dbn->dbn_phys.zbm_guid, ==, + dsl_dataset_phys(origin)->ds_guid); + ASSERT3U(dbn->dbn_phys.zbm_referenced_bytes_refd, ==, + dsl_dataset_phys(origin)->ds_referenced_bytes); + ASSERT(dbn->dbn_phys.zbm_flags & + ZBM_FLAG_SNAPSHOT_EXISTS); + /* + * Save and restore the zbm_redaction_obj, which + * is zeroed by dsl_bookmark_set_phys(). + */ + uint64_t redaction_obj = + dbn->dbn_phys.zbm_redaction_obj; + dsl_bookmark_set_phys(&dbn->dbn_phys, origin); + dbn->dbn_phys.zbm_redaction_obj = redaction_obj; + + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + } +} + +/* + * This block is no longer referenced by this (head) dataset. + * + * Adjust the FBN of any bookmarks that reference this block, whose "next" + * is the head dataset. + */ +/* ARGSUSED */ +void +dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) +{ + /* + * Iterate over bookmarks whose "next" is the head dataset. + */ + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + /* + * If the block was live (referenced) at the time of this + * bookmark, add its space to the bookmark's FBN. 
+ */ + if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg && + (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { + mutex_enter(&dbn->dbn_lock); + dbn->dbn_phys.zbm_referenced_freed_before_next_snap += + bp_get_dsize_sync(dsl_dataset_get_spa(ds), bp); + dbn->dbn_phys.zbm_compressed_freed_before_next_snap += + BP_GET_PSIZE(bp); + dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += + BP_GET_UCSIZE(bp); + /* + * Changing the ZAP object here would be too + * expensive. Also, we may be called from the zio + * interrupt thread, which can't block on i/o. + * Therefore, we mark this bookmark as dirty and + * modify the ZAP once per txg, in + * dsl_bookmark_sync_done(). + */ + dbn->dbn_dirty = B_TRUE; + mutex_exit(&dbn->dbn_lock); + } + } +} + +void +dsl_bookmark_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + + if (dsl_dataset_is_snapshot(ds)) + return; + + /* + * We only dirty bookmarks that are at or after the most recent + * snapshot. We can't create snapshots between + * dsl_bookmark_block_killed() and dsl_bookmark_sync_done(), so we + * don't need to look at any bookmarks before ds_prev_snap_txg. + */ + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + if (dbn->dbn_dirty) { + /* + * We only dirty nodes with HAS_FBN, therefore + * we can always use the current bookmark struct size. 
+ */ + ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); + VERIFY0(zap_update(dp->dp_meta_objset, + ds->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + dbn->dbn_dirty = B_FALSE; + } + } +#ifdef ZFS_DEBUG + for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); + dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { + ASSERT(!dbn->dbn_dirty); + } +#endif +} + +/* + * Return the TXG of the most recent bookmark (or 0 if there are no bookmarks). + */ +uint64_t +dsl_bookmark_latest_txg(dsl_dataset_t *ds) +{ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + if (dbn == NULL) + return (0); + return (dbn->dbn_phys.zbm_creation_txg); +} + +static inline unsigned int +redact_block_buf_num_entries(unsigned int size) +{ + return (size / sizeof (redact_block_phys_t)); +} + +/* + * This function calculates the offset of the last entry in the array of + * redact_block_phys_t. If we're reading the redaction list into buffers of + * size bufsize, then for all but the last buffer, the last valid entry in the + * array will be the last entry in the array. However, for the last buffer, any + * amount of it may be filled. Thus, we check to see if we're looking at the + * last buffer in the redaction list, and if so, we return the total number of + * entries modulo the number of entries per buffer. Otherwise, we return the + * number of entries per buffer minus one. + */ +static inline unsigned int +last_entry(redaction_list_t *rl, unsigned int bufsize, uint64_t bufid) +{ + if (bufid == (rl->rl_phys->rlp_num_entries - 1) / + redact_block_buf_num_entries(bufsize)) { + return ((rl->rl_phys->rlp_num_entries - 1) % + redact_block_buf_num_entries(bufsize)); + } + return (redact_block_buf_num_entries(bufsize) - 1); +} + +/* + * Compare the redact_block_phys_t to the bookmark. 
If the last block in the + * redact_block_phys_t is before the bookmark, return -1. If the first block in + * the redact_block_phys_t is after the bookmark, return 1. Otherwise, the + * bookmark is inside the range of the redact_block_phys_t, and we return 0. + */ +static int +redact_block_zb_compare(redact_block_phys_t *first, + zbookmark_phys_t *second) +{ + /* + * If the block_phys is for a previous object, or the last block in the + * block_phys is strictly before the block in the bookmark, the + * block_phys is earlier. + */ + if (first->rbp_object < second->zb_object || + (first->rbp_object == second->zb_object && + first->rbp_blkid + (redact_block_get_count(first) - 1) < + second->zb_blkid)) + return (-1); + + /* + * If the bookmark is for a previous object, or the block in the + * bookmark is strictly before the first block in the block_phys, the + * bookmark is earlier. + */ + if (first->rbp_object > second->zb_object || + (first->rbp_object == second->zb_object && + first->rbp_blkid > second->zb_blkid)) + return (1); + + return (0); +} + +/* + * Traverse the redaction list in the provided object, and call the callback for + * each entry we find. Don't call the callback for any records before resume. + */ +int +dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, + rl_traverse_callback_t cb, void *arg) +{ + objset_t *mos = rl->rl_mos; + redact_block_phys_t *buf; + unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE; + int err = 0; + + if (rl->rl_phys->rlp_last_object != UINT64_MAX || + rl->rl_phys->rlp_last_blkid != UINT64_MAX) { + /* + * When we finish a send, we update the last object and offset + * to UINT64_MAX. If a send fails partway through, the last + * object and offset will have some other value, indicating how + * far the send got. The redaction list must be complete before + * it can be traversed, so return EINVAL if the last object and + * blkid are not set to UINT64_MAX. 
+ */ + return (SET_ERROR(EINVAL)); + } + + /* + * Binary search for the point to resume from. The goal is to minimize + * the number of disk reads we have to perform. + */ + buf = kmem_alloc(bufsize, KM_SLEEP); + uint64_t maxbufid = (rl->rl_phys->rlp_num_entries - 1) / + redact_block_buf_num_entries(bufsize); + uint64_t minbufid = 0; + while (resume != NULL && maxbufid - minbufid >= 1) { + ASSERT3U(maxbufid, >, minbufid); + uint64_t midbufid = minbufid + ((maxbufid - minbufid) / 2); + err = dmu_read(mos, rl->rl_object, midbufid * bufsize, bufsize, + buf, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + int cmp0 = redact_block_zb_compare(&buf[0], resume); + int cmpn = redact_block_zb_compare( + &buf[last_entry(rl, bufsize, maxbufid)], resume); + + /* + * If the first block is before or equal to the resume point, + * and the last one is equal or after, then the resume point is + * in this buf, and we should start here. + */ + if (cmp0 <= 0 && cmpn >= 0) + break; + + if (cmp0 > 0) + maxbufid = midbufid - 1; + else if (cmpn < 0) + minbufid = midbufid + 1; + else + panic("No progress in binary search for resume point"); + } + + for (uint64_t curidx = minbufid * redact_block_buf_num_entries(bufsize); + err == 0 && curidx < rl->rl_phys->rlp_num_entries; + curidx++) { + /* + * We read in the redaction list one block at a time. Once we + * finish with all the entries in a given block, we read in a + * new one. The predictive prefetcher will take care of any + * prefetching, and this code shouldn't be the bottleneck, so we + * don't need to do manual prefetching. 
+ */ + if (curidx % redact_block_buf_num_entries(bufsize) == 0) { + err = dmu_read(mos, rl->rl_object, curidx * + sizeof (*buf), bufsize, buf, + DMU_READ_PREFETCH); + if (err != 0) + break; + } + redact_block_phys_t *rb = &buf[curidx % + redact_block_buf_num_entries(bufsize)]; + /* + * If resume is non-null, we should either not send the data, or + * null out resume so we don't have to keep doing these + * comparisons. + */ + if (resume != NULL) { + if (redact_block_zb_compare(rb, resume) < 0) { + continue; + } else { + /* + * If the place to resume is in the middle of + * the range described by this + * redact_block_phys, then modify the + * redact_block_phys in memory so we generate + * the right records. + */ + if (resume->zb_object == rb->rbp_object && + resume->zb_blkid > rb->rbp_blkid) { + uint64_t diff = resume->zb_blkid - + rb->rbp_blkid; + rb->rbp_blkid = resume->zb_blkid; + redact_block_set_count(rb, + redact_block_get_count(rb) - diff); + } + resume = NULL; + } + } + + if (cb(rb, arg) != 0) + break; + } + + kmem_free(buf, bufsize); + return (err); +} diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index f0878c934771..d53f0c0d989e 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -15,6 +15,7 @@ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. */ #include @@ -1892,7 +1893,8 @@ dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object, DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj, tx)); - dsl_dataset_activate_feature(dsobj, SPA_FEATURE_ENCRYPTION, tx); + dsl_dataset_activate_feature(dsobj, SPA_FEATURE_ENCRYPTION, + (void *)B_TRUE, tx); /* * If we inherited the wrapping key we release our reference now. 
@@ -2203,8 +2205,8 @@ dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) sizeof (uint64_t), 1, &version, tx)); dsl_dataset_activate_feature(ds->ds_object, - SPA_FEATURE_ENCRYPTION, tx); - ds->ds_feature_inuse[SPA_FEATURE_ENCRYPTION] = B_TRUE; + SPA_FEATURE_ENCRYPTION, (void *)B_TRUE, tx); + ds->ds_feature[SPA_FEATURE_ENCRYPTION] = (void *)B_TRUE; /* save the dd_crypto_obj on disk */ VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_CRYPTO_KEY_OBJ, diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index b6e3b9a5c7f3..e1d026208ad7 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ * of this setting. 
*/ int zfs_max_recordsize = 1 * 1024 * 1024; +int zfs_allow_redacted_dataset_mount = 0; #define SWITCH64(x, y) \ { \ @@ -89,6 +91,8 @@ static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx); +static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f); + extern int spa_asize_inflation; static zil_header_t zero_zil; @@ -129,7 +133,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ - if (BP_IS_HOLE(bp)) + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); @@ -149,13 +153,16 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) dsl_dataset_phys(ds)->ds_unique_bytes += used; if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { - ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] = - B_TRUE; + ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] = + (void *)B_TRUE; } spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); - if (f != SPA_FEATURE_NONE) - ds->ds_feature_activation_needed[f] = B_TRUE; + if (f != SPA_FEATURE_NONE) { + ASSERT3S(spa_feature_table[f].fi_type, ==, + ZFEATURE_TYPE_BOOLEAN); + ds->ds_feature_activation[f] = (void *)B_TRUE; + } mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, @@ -215,7 +222,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); - if (BP_IS_HOLE(bp)) + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); ASSERT(dmu_tx_is_syncing(tx)); @@ -279,6 +286,9 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, DD_USED_HEAD, DD_USED_SNAP, tx); } } + + dsl_bookmark_block_killed(ds, bp, tx); + mutex_enter(&ds->ds_lock); 
ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used); dsl_dataset_phys(ds)->ds_referenced_bytes -= used; @@ -291,6 +301,72 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, return (used); } +struct feature_type_uint64_array_arg { + uint64_t length; + uint64_t *array; +}; + +static void +unload_zfeature(dsl_dataset_t *ds, spa_feature_t f) +{ + switch (spa_feature_table[f].fi_type) { + case ZFEATURE_TYPE_BOOLEAN: + break; + case ZFEATURE_TYPE_UINT64_ARRAY: + { + struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f]; + kmem_free(ftuaa->array, ftuaa->length * sizeof (uint64_t)); + kmem_free(ftuaa, sizeof (*ftuaa)); + break; + } + default: + panic("Invalid zfeature type!"); + } +} + +static int +load_zfeature(objset_t *mos, dsl_dataset_t *ds, spa_feature_t f) +{ + int err = 0; + switch (spa_feature_table[f].fi_type) { + case ZFEATURE_TYPE_BOOLEAN: + err = zap_contains(mos, ds->ds_object, + spa_feature_table[f].fi_guid); + if (err == 0) { + ds->ds_feature[f] = (void *)B_TRUE; + } else { + ASSERT3U(err, ==, ENOENT); + err = 0; + } + break; + case ZFEATURE_TYPE_UINT64_ARRAY: + { + uint64_t int_size, num_int; + uint64_t *data; + err = zap_length(mos, ds->ds_object, + spa_feature_table[f].fi_guid, &int_size, &num_int); + if (err != 0) { + ASSERT3U(err, ==, ENOENT); + err = 0; + break; + } + ASSERT3U(int_size, ==, sizeof (uint64_t)); + data = kmem_alloc(int_size * num_int, KM_SLEEP); + VERIFY0(zap_lookup(mos, ds->ds_object, + spa_feature_table[f].fi_guid, int_size, num_int, data)); + struct feature_type_uint64_array_arg *ftuaa = + kmem_alloc(sizeof (*ftuaa), KM_SLEEP); + ftuaa->length = num_int; + ftuaa->array = data; + ds->ds_feature[f] = ftuaa; + break; + } + default: + panic("Invalid zfeature type!"); + } + return (err); +} + /* * We have to release the fsid syncronously or we risk that a subsequent * mount of the same dataset will fail to unique_insert the fsid. 
This @@ -334,6 +410,11 @@ dsl_dataset_evict_async(void *dbu) ASSERT(!list_link_active(&ds->ds_synced_link)); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (dsl_dataset_feature_is_active(ds, f)) + unload_zfeature(ds, f); + } + list_destroy(&ds->ds_prop_cbs); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); @@ -488,8 +569,8 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, bplist_create(&ds->ds_pending_deadlist); - list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), - offsetof(dmu_sendarg_t, dsa_link)); + list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t), + offsetof(dmu_sendstatus_t, dss_link)); list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_ds_node)); @@ -501,14 +582,7 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) continue; - err = zap_contains(mos, dsobj, - spa_feature_table[f].fi_guid); - if (err == 0) { - ds->ds_feature_inuse[f] = B_TRUE; - } else { - ASSERT3U(err, ==, ENOENT); - err = 0; - } + err = load_zfeature(mos, ds, f); } } @@ -519,14 +593,7 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev); } - if (doi.doi_type == DMU_OTN_ZAP_METADATA) { - int zaperr = zap_lookup(mos, ds->ds_object, - DS_FIELD_BOOKMARK_NAMES, - sizeof (ds->ds_bookmarks), 1, - &ds->ds_bookmarks); - if (zaperr != ENOENT) - VERIFY0(zaperr); - } + err = dsl_bookmark_init_ds(ds); } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); @@ -571,6 +638,7 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, dsl_deadlist_close(&ds->ds_deadlist); if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) dsl_deadlist_close(&ds->ds_remap_deadlist); + dsl_bookmark_fini_ds(ds); if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); dsl_dir_rele(ds->ds_dir, ds); @@ -685,14 +753,14 @@ dsl_dataset_hold(dsl_pool_t *dp, const char 
*name, void *tag, return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp)); } -int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) +static int +dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, + void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp); if (err != 0) return (err); - if (!dsl_dataset_tryown(*dsp, tag)) { + if (!dsl_dataset_tryown(*dsp, tag, override)) { dsl_dataset_rele_flags(*dsp, flags, tag); *dsp = NULL; return (SET_ERROR(EBUSY)); @@ -700,20 +768,49 @@ dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, return (0); } + int -dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp)); +} + +int +dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp)); +} + +static int +dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, + void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp); if (err != 0) return (err); - if (!dsl_dataset_tryown(*dsp, tag)) { + if (!dsl_dataset_tryown(*dsp, tag, override)) { dsl_dataset_rele_flags(*dsp, flags, tag); return (SET_ERROR(EBUSY)); } return (0); } +int +dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, + void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp)); +} + +int +dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, + void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp)); +} + /* * See the 
comment above dsl_pool_hold() for details. In summary, a long * hold is used to prevent destruction of a dataset while the pool hold @@ -819,13 +916,16 @@ dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) +dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override) { boolean_t gotit = FALSE; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_lock); - if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { + if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) || + (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS) && + !zfs_allow_redacted_dataset_mount)))) { ds->ds_owner = tag; dsl_dataset_long_hold(ds, tag); gotit = TRUE; @@ -844,8 +944,55 @@ dsl_dataset_has_owner(dsl_dataset_t *ds) return (rv); } +static boolean_t +zfeature_active(spa_feature_t f, void *arg) +{ + switch (spa_feature_table[f].fi_type) { + case ZFEATURE_TYPE_BOOLEAN: { + boolean_t val = (boolean_t)arg; + ASSERT(val == B_FALSE || val == B_TRUE); + return (val); + } + case ZFEATURE_TYPE_UINT64_ARRAY: + /* + * In this case, arg is a uint64_t array. The feature is active + * if the array is non-null. + */ + return (arg != NULL); + default: + panic("Invalid zfeature type!"); + return (B_FALSE); + } +} + +boolean_t +dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f) +{ + return (zfeature_active(f, ds->ds_feature[f])); +} + +/* + * The buffers passed out by this function are references to internal buffers; + * they should not be freed by callers of this function, and they should not be + * used after the dataset has been released. 
+ */ +boolean_t +dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, spa_feature_t f, + uint64_t *outlength, uint64_t **outp) +{ + VERIFY(spa_feature_table[f].fi_type & ZFEATURE_TYPE_UINT64_ARRAY); + if (!dsl_dataset_feature_is_active(ds, f)) { + return (B_FALSE); + } + struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f]; + *outp = ftuaa->array; + *outlength = ftuaa->length; + return (B_TRUE); +} + void -dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) +dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg, + dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; @@ -856,20 +1003,44 @@ dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) spa_feature_incr(spa, f, tx); dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); - VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, - sizeof (zero), 1, &zero, tx)); + switch (spa_feature_table[f].fi_type) { + case ZFEATURE_TYPE_BOOLEAN: + ASSERT3S((boolean_t)arg, ==, B_TRUE); + VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, + sizeof (zero), 1, &zero, tx)); + break; + case ZFEATURE_TYPE_UINT64_ARRAY: + { + struct feature_type_uint64_array_arg *ftuaa = arg; + VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, + sizeof (uint64_t), ftuaa->length, ftuaa->array, tx)); + break; + } + default: + panic("Invalid zfeature type!"); + } } void -dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) +dsl_dataset_deactivate_feature_impl(dsl_dataset_t *ds, spa_feature_t f, + dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + uint64_t dsobj = ds->ds_object; VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx)); spa_feature_decr(spa, f, tx); + ds->ds_feature[f] = NULL; +} + +void +dsl_dataset_deactivate_feature(dsl_dataset_t *ds, 
spa_feature_t f, dmu_tx_t *tx) +{ + unload_zfeature(ds, f); + dsl_dataset_deactivate_feature_impl(ds, f, tx); } uint64_t @@ -933,8 +1104,10 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (origin->ds_feature_inuse[f]) - dsl_dataset_activate_feature(dsobj, f, tx); + if (zfeature_active(f, origin->ds_feature[f])) { + dsl_dataset_activate_feature(dsobj, f, + origin->ds_feature[f], tx); + } } dmu_buf_will_dirty(origin->ds_dbuf, tx); @@ -1454,8 +1627,10 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dmu_buf_rele(dbuf, FTAG); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_inuse[f]) - dsl_dataset_activate_feature(dsobj, f, tx); + if (zfeature_active(f, ds->ds_feature[f])) { + dsl_dataset_activate_feature(dsobj, f, + ds->ds_feature[f], tx); + } } ASSERT3U(ds->ds_prev != 0, ==, @@ -1503,6 +1678,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_dataset_phys(ds)->ds_deadlist_obj); dsl_deadlist_add_key(&ds->ds_deadlist, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + dsl_bookmark_snapshotted(ds, tx); if (dsl_dataset_remap_deadlist_exists(ds)) { uint64_t remap_deadlist_obj = @@ -1770,11 +1946,12 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_objset_sync(ds->ds_objset, zio, tx); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_activation_needed[f]) { - if (ds->ds_feature_inuse[f]) + if (zfeature_active(f, ds->ds_feature_activation[f])) { + if (zfeature_active(f, ds->ds_feature[f])) continue; - dsl_dataset_activate_feature(ds->ds_object, f, tx); - ds->ds_feature_inuse[f] = B_TRUE; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; } } } @@ -1795,6 +1972,8 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) 
bplist_iterate(&ds->ds_pending_deadlist, deadlist_enqueue_cb, &ds->ds_deadlist, tx); + dsl_bookmark_sync_done(ds, tx); + if (os->os_synced_dnodes != NULL) { multilist_destroy(os->os_synced_dnodes); os->os_synced_dnodes = NULL; @@ -1928,6 +2107,34 @@ get_receive_resume_stats_impl(dsl_dataset_t *ds) DS_FIELD_RESUME_RAWOK) == 0) { fnvlist_add_boolean(token_nv, "rawok"); } + if (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)) { + uint64_t num_redact_snaps; + uint64_t *redact_snaps; + VERIFY(dsl_dataset_get_uint64_array_feature(ds, + SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps, + &redact_snaps)); + fnvlist_add_uint64_array(token_nv, "redact_snaps", + redact_snaps, num_redact_snaps); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) { + uint64_t num_redact_snaps, int_size; + uint64_t *redact_snaps; + VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size, + &num_redact_snaps)); + ASSERT3U(int_size, ==, sizeof (uint64_t)); + + redact_snaps = kmem_alloc(int_size * num_redact_snaps, + KM_SLEEP); + VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size, + num_redact_snaps, redact_snaps)); + fnvlist_add_uint64_array(token_nv, "book_redact_snaps", + redact_snaps, num_redact_snaps); + kmem_free(redact_snaps, int_size * num_redact_snaps); + } packed = fnvlist_pack(token_nv, &packed_size); fnvlist_free(token_nv); compressed = kmem_alloc(packed_size, KM_SLEEP); @@ -2113,6 +2320,13 @@ dsl_get_inconsistent(dsl_dataset_t *ds) 1 : 0); } +uint64_t +dsl_get_redacted(dsl_dataset_t *ds) +{ + return (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)); +} + uint64_t dsl_get_available(dsl_dataset_t *ds) { @@ -2168,6 +2382,18 @@ dsl_get_prev_snap(dsl_dataset_t *ds, char *snap) } } +void +dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval) +{ + uint64_t nsnaps; + uint64_t *snaps; + if 
(dsl_dataset_get_uint64_array_feature(ds, + SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) { + fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps, + nsnaps); + } +} + /* * Returns the mountpoint property and source for the given dataset in the value * and source buffers. The value buffer must be at least as large as MAXPATHLEN @@ -2273,6 +2499,12 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_dir_stats(ds->ds_dir, nv); } + nvlist_t *propval = fnvlist_alloc(); + dsl_get_redact_snaps(ds, propval); + fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), + propval); + nvlist_free(propval); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, dsl_get_available(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, @@ -2341,6 +2573,7 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) stat->dds_creation_txg = dsl_get_creationtxg(ds); stat->dds_inconsistent = dsl_get_inconsistent(ds); stat->dds_guid = dsl_get_guid(ds); + stat->dds_redacted = dsl_get_redacted(ds); stat->dds_origin[0] = '\0'; if (ds->ds_is_snapshot) { stat->dds_is_snapshot = B_TRUE; @@ -2668,28 +2901,11 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) } /* must not have any bookmarks after the most recent snapshot */ - nvlist_t *proprequest = fnvlist_alloc(); - fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG)); - nvlist_t *bookmarks = fnvlist_alloc(); - error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks); - fnvlist_free(proprequest); - if (error != 0) { + if (dsl_bookmark_latest_txg(ds) > + dsl_dataset_phys(ds)->ds_prev_snap_txg) { dsl_dataset_rele(ds, FTAG); - return (error); - } - for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) { - nvlist_t *valuenv = - fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair), - zfs_prop_to_name(ZFS_PROP_CREATETXG)); - uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value"); - if (createtxg > 
dsl_dataset_phys(ds)->ds_prev_snap_txg) { - fnvlist_free(bookmarks); - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EEXIST)); - } + return (SET_ERROR(EEXIST)); } - fnvlist_free(bookmarks); error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); if (error != 0) { @@ -2802,7 +3018,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; struct promotenode *snap; - dsl_dataset_t *origin_ds; + dsl_dataset_t *origin_ds, *origin_head; int err; uint64_t unused; uint64_t ss_mv_cnt; @@ -2822,6 +3038,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) } snap = list_head(&ddpa->shared_snaps); + origin_head = snap->ds; if (snap == NULL) { err = SET_ERROR(ENOENT); goto out; @@ -2918,6 +3135,32 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) ddpa->uncomp += dluncomp; } + /* + * Check that bookmarks that are being transferred don't have + * name conflicts. + */ + for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= + dsl_dataset_phys(origin_ds)->ds_creation_txg; + dbn = AVL_NEXT(&origin_head->ds_bookmarks, dbn)) { + if (strlen(dbn->dbn_name) >= max_snap_len) { + err = SET_ERROR(ENAMETOOLONG); + goto out; + } + zfs_bookmark_phys_t bm; + err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone, + dbn->dbn_name, &bm); + + if (err == 0) { + fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name); + conflicting_snaps = B_TRUE; + } else if (err == ESRCH) { + err = 0; + } else if (err != 0) { + goto out; + } + } + /* * In order to return the full list of conflicting snapshots, we check * whether there was a conflict after traversing all of them. @@ -3075,6 +3318,25 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); } + /* + * Move bookmarks to this dir. 
+ */ + dsl_bookmark_node_t *dbn_next; + for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= + dsl_dataset_phys(origin_ds)->ds_creation_txg; + dbn = dbn_next) { + dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn); + + avl_remove(&origin_head->ds_bookmarks, dbn); + VERIFY0(zap_remove(dp->dp_meta_objset, + origin_head->ds_bookmarks_obj, dbn->dbn_name, tx)); + + dsl_bookmark_node_add(hds, dbn, tx); + } + + dsl_bookmark_next_changed(hds, origin_ds, tx); + /* move snapshots to this dir */ for (snap = list_head(&ddpa->shared_snaps); snap; snap = list_next(&ddpa->shared_snaps, snap)) { @@ -3487,31 +3749,31 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { - ASSERT(!clone->ds_feature_inuse[f]); - ASSERT(!origin_head->ds_feature_inuse[f]); + ASSERT(!dsl_dataset_feature_is_active(clone, f)); + ASSERT(!dsl_dataset_feature_is_active(origin_head, f)); continue; } - boolean_t clone_inuse = clone->ds_feature_inuse[f]; - boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f]; + boolean_t clone_inuse = dsl_dataset_feature_is_active(clone, f); + void *clone_feature = clone->ds_feature[f]; + boolean_t origin_head_inuse = + dsl_dataset_feature_is_active(origin_head, f); + void *origin_head_feature = origin_head->ds_feature[f]; + + if (clone_inuse) + dsl_dataset_deactivate_feature_impl(clone, f, tx); + if (origin_head_inuse) + dsl_dataset_deactivate_feature_impl(origin_head, f, tx); if (clone_inuse) { - dsl_dataset_deactivate_feature(clone->ds_object, f, tx); - clone->ds_feature_inuse[f] = B_FALSE; - } - if (origin_head_inuse) { - dsl_dataset_deactivate_feature(origin_head->ds_object, - f, tx); - origin_head->ds_feature_inuse[f] = B_FALSE; - } - if (clone_inuse) { - dsl_dataset_activate_feature(origin_head->ds_object, - f, tx); - origin_head->ds_feature_inuse[f] = B_TRUE; + 
dsl_dataset_activate_feature(origin_head->ds_object, f, + clone_feature, tx); + origin_head->ds_feature[f] = clone_feature; } if (origin_head_inuse) { - dsl_dataset_activate_feature(clone->ds_object, f, tx); - clone->ds_feature_inuse[f] = B_TRUE; + dsl_dataset_activate_feature(clone->ds_object, f, + origin_head_feature, tx); + clone->ds_feature[f] = origin_head_feature; } } @@ -3535,9 +3797,9 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_phys(clone)->ds_unique_bytes); /* - * Reset origin's unique bytes, if it exists. + * Reset origin's unique bytes. */ - if (clone->ds_prev) { + { dsl_dataset_t *origin = clone->ds_prev; uint64_t comp, uncomp; @@ -3635,6 +3897,12 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_phys(origin_head)->ds_deadlist_obj); dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); + /* + * If there is a bookmark at the origin, its "next dataset" is + * changing, so we need to reset its FBN. + */ + dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx); + dsl_scan_ds_clone_swapped(origin_head, clone, tx); spa_history_log_internal_ds(clone, "clone swap", tx, @@ -3925,95 +4193,145 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, } /* - * Return (in *usedp) the amount of space written in new that is not - * present in oldsnap. New may be a snapshot or the head. Old must be - * a snapshot before new, in new's filesystem (or its origin). If not then - * fail and return EINVAL. + * Return (in *usedp) the amount of space referenced by "new" that was not + * referenced at the time the bookmark corresponds to. "New" may be a + * snapshot or a head. The bookmark must be before new, in + * new's filesystem (or its origin) -- caller verifies this. * * The written space is calculated by considering two components: First, we * ignore any freed space, and calculate the written as new's used space * minus old's used space. 
Next, we add in the amount of space that was freed - * between the two snapshots, thus reducing new's used space relative to old's. - * Specifically, this is the space that was born before old->ds_creation_txg, - * and freed before new (ie. on new's deadlist or a previous deadlist). + * between the two time points, thus reducing new's used space relative to + * old's. Specifically, this is the space that was born before + * zbm_creation_txg, and freed before new (ie. on new's deadlist or a + * previous deadlist). * * space freed [---------------------] * snapshots ---O-------O--------O-------O------ - * oldsnap new + * bookmark new + * + * Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN + * flag is not set, we will calculate the freed_before_next based on the + * next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap. */ -int -dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +static int +dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp, + dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { int err = 0; - uint64_t snapobj; dsl_pool_t *dp = new->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); + if (dsl_dataset_is_snapshot(new)) { + ASSERT3U(bmp->zbm_creation_txg, <, + dsl_dataset_phys(new)->ds_creation_txg); + } *usedp = 0; *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; - *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes; + *usedp -= bmp->zbm_referenced_bytes_refd; *compp = 0; *compp += dsl_dataset_phys(new)->ds_compressed_bytes; - *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes; + *compp -= bmp->zbm_compressed_bytes_refd; *uncompp = 0; *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; - *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes; + *uncompp -= bmp->zbm_uncompressed_bytes_refd; - snapobj = new->ds_object; - while (snapobj != oldsnap->ds_object) { - dsl_dataset_t *snap; - 
uint64_t used, comp, uncomp; + dsl_dataset_t *snap = new; - if (snapobj == new->ds_object) { - snap = new; - } else { - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); - if (err != 0) - break; - } + while (dsl_dataset_phys(snap)->ds_prev_snap_txg > + bmp->zbm_creation_txg) { + uint64_t used, comp, uncomp; - if (dsl_dataset_phys(snap)->ds_prev_snap_txg == - dsl_dataset_phys(oldsnap)->ds_creation_txg) { - /* - * The blocks in the deadlist can not be born after - * ds_prev_snap_txg, so get the whole deadlist space, - * which is more efficient (especially for old-format - * deadlists). Unfortunately the deadlist code - * doesn't have enough information to make this - * optimization itself. - */ - dsl_deadlist_space(&snap->ds_deadlist, - &used, &comp, &uncomp); - } else { - dsl_deadlist_space_range(&snap->ds_deadlist, - 0, dsl_dataset_phys(oldsnap)->ds_creation_txg, - &used, &comp, &uncomp); - } + dsl_deadlist_space_range(&snap->ds_deadlist, + 0, bmp->zbm_creation_txg, + &used, &comp, &uncomp); *usedp += used; *compp += comp; *uncompp += uncomp; - /* - * If we get to the beginning of the chain of snapshots - * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap - * was not a snapshot of/before new. - */ - snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; if (snap != new) dsl_dataset_rele(snap, FTAG); - if (snapobj == 0) { - err = SET_ERROR(EINVAL); + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) break; - } + } + /* + * We might not have the FBN if we are calculating written from + * a snapshot (because we didn't know the correct "next" snapshot + * until now). 
+ */ + if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) { + *usedp += bmp->zbm_referenced_freed_before_next_snap; + *compp += bmp->zbm_compressed_freed_before_next_snap; + *uncompp += bmp->zbm_uncompressed_freed_before_next_snap; + } else { + ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==, + bmp->zbm_creation_txg); + uint64_t used, comp, uncomp; + dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; } + if (snap != new) + dsl_dataset_rele(snap, FTAG); return (err); } +/* + * Return (in *usedp) the amount of space written in new that was not + * present at the time the bookmark corresponds to. New may be a + * snapshot or the head. Old must be a bookmark before new, in + * new's filesystem (or its origin) -- caller verifies this. + */ +int +dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp, + dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN)) + return (SET_ERROR(ENOTSUP)); + return (dsl_dataset_space_written_impl(bmp, new, + usedp, compp, uncompp)); +} + +/* + * Return (in *usedp) the amount of space written in new that is not + * present in oldsnap. New may be a snapshot or the head. Old must be + * a snapshot before new, in new's filesystem (or its origin). If not then + * fail and return EINVAL. 
+ */ +int +dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + if (!dsl_dataset_is_before(new, oldsnap, 0)) + return (SET_ERROR(EINVAL)); + + zfs_bookmark_phys_t zbm = { 0 }; + dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap); + zbm.zbm_guid = dsp->ds_guid; + zbm.zbm_creation_txg = dsp->ds_creation_txg; + zbm.zbm_creation_time = dsp->ds_creation_time; + zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes; + zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes; + zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes; + + /* + * If oldsnap is the origin (or origin's origin, ...) of new, + * we can't easily calculate the effective FBN. Therefore, + * we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate + * it relative to the correct "next": the next snapshot towards "new", + * rather than the next snapshot in oldsnap's dsl_dir. + */ + return (dsl_dataset_space_written_impl(&zbm, new, + usedp, compp, uncompp)); +} + /* * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, * lastsnap, and all snapshots in between are deleted. @@ -4104,16 +4422,26 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, if (later->ds_dir == earlier->ds_dir) return (B_TRUE); - if (!dsl_dir_is_clone(later->ds_dir)) + + /* + * We check dd_origin_obj explicitly here rather than using + * dsl_dir_is_clone() so that we will return TRUE if "earlier" + * is $ORIGIN@$ORIGIN. dsl_dataset_space_written() depends on + * this behavior. 
+ */ + if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0) return (B_FALSE); - if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object) - return (B_TRUE); dsl_dataset_t *origin; error = dsl_dataset_hold_obj(dp, dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); if (error != 0) return (B_FALSE); + if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg && + origin->ds_dir == earlier->ds_dir) { + dsl_dataset_rele(origin, FTAG); + return (B_TRUE); + } ret = dsl_dataset_is_before(origin, earlier, earlier_txg); dsl_dataset_rele(origin, FTAG); return (ret); @@ -4230,6 +4558,26 @@ dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } +void +dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, + uint64_t num_redact_snaps, dmu_tx_t *tx) +{ + uint64_t dsobj = ds->ds_object; + struct feature_type_uint64_array_arg *ftuaa = + kmem_zalloc(sizeof (*ftuaa), KM_SLEEP); + ftuaa->length = (int64_t)num_redact_snaps; + if (num_redact_snaps > 0) { + ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t), + KM_SLEEP); + bcopy(redact_snaps, ftuaa->array, num_redact_snaps * + sizeof (uint64_t)); + } + dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS, + ftuaa, tx); + ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa; +} + + #if defined(_KERNEL) #if defined(_LP64) module_param(zfs_max_recordsize, int, 0644); @@ -4240,6 +4588,10 @@ module_param(zfs_max_recordsize, int, 0444); MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size"); #endif +module_param(zfs_allow_redacted_dataset_mount, int, 0644); +MODULE_PARM_DESC(zfs_allow_redacted_dataset_mount, + "Allow mounting of redacted datasets"); + EXPORT_SYMBOL(dsl_dataset_hold); EXPORT_SYMBOL(dsl_dataset_hold_flags); EXPORT_SYMBOL(dsl_dataset_hold_obj); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 10846a3249db..9e3a3331b6c6 100644 --- a/module/zfs/dsl_deadlist.c +++ 
b/module/zfs/dsl_deadlist.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -80,7 +80,7 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) zap_cursor_advance(&zc)) { dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, za.za_first_integer)); avl_add(&dl->dl_tree, dle); } @@ -98,13 +98,13 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); dl->dl_os = os; dl->dl_object = object; - VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); + VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); dmu_object_info_from_db(dl->dl_dbuf, &doi); if (doi.doi_type == DMU_OT_BPOBJ) { dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_oldfmt = B_TRUE; - VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); + VERIFY0(bpobj_open(&dl->dl_bpobj, os, object)); return; } @@ -167,7 +167,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) zap_cursor_t zc; zap_attribute_t za; - VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); + VERIFY0(dmu_object_info(os, dlobj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_free(os, dlobj, tx); return; @@ -183,7 +183,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) bpobj_free(os, obj, tx); } zap_cursor_fini(&zc); - VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); + VERIFY0(dmu_object_free(os, dlobj, tx)); } static void @@ -196,8 +196,8 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); - 
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } bpobj_enqueue(&dle->dle_bpobj, bp, tx); @@ -214,8 +214,8 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, } else { bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } } @@ -279,10 +279,10 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) dsl_deadlist_load_tree(dl); obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); avl_add(&dl->dl_tree, dle); - VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, + VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object, mintxg, obj, tx)); mutex_exit(&dl->dl_lock); } @@ -298,12 +298,12 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) if (dl->dl_oldfmt) return; - mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + ASSERT3P(dle, !=, NULL); dle_prev = AVL_PREV(&dl->dl_tree, dle); dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); @@ -312,7 +312,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); + VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); mutex_exit(&dl->dl_lock); } @@ -334,7 +334,7 @@ dsl_deadlist_regenerate(objset_t *os, 
uint64_t dlobj, while (mrs_obj != 0) { dsl_dataset_t *ds; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); dsl_deadlist_add_key(&dl, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; @@ -368,7 +368,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, break; obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, + VERIFY0(zap_add_int_key(dl->dl_os, newobj, dle->dle_mintxg, obj, tx)); } mutex_exit(&dl->dl_lock); @@ -381,7 +381,7 @@ dsl_deadlist_space(dsl_deadlist_t *dl, { ASSERT(dsl_deadlist_is_open(dl)); if (dl->dl_oldfmt) { - VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, + VERIFY0(bpobj_space(&dl->dl_bpobj, usedp, compp, uncompp)); return; } @@ -397,7 +397,7 @@ dsl_deadlist_space(dsl_deadlist_t *dl, * return space used in the range (mintxg, maxtxg]. * Includes maxtxg, does not include mintxg. * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is - * larger than any bp in the deadlist (eg. UINT64_MAX)). + * UINT64_MAX). */ void dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, @@ -408,7 +408,7 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, avl_index_t where; if (dl->dl_oldfmt) { - VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, + VERIFY0(bpobj_space_range(&dl->dl_bpobj, mintxg, maxtxg, usedp, compp, uncompp)); return; } @@ -430,13 +430,20 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, dle = AVL_NEXT(&dl->dl_tree, dle)) { uint64_t used, comp, uncomp; - VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); *usedp += used; *compp += comp; *uncompp += uncomp; } + + /* + * This assertion ensures that the maxtxg is a key in the deadlist + * (unless it's UINT64_MAX). 
+ */ + ASSERT(maxtxg == UINT64_MAX || + (dle != NULL && dle->dle_mintxg == maxtxg)); mutex_exit(&dl->dl_lock); } @@ -452,8 +459,8 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, ASSERT(MUTEX_HELD(&dl->dl_lock)); - VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); - VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); + VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp)); bpobj_close(&bpo); dsl_deadlist_load_tree(dl); @@ -491,12 +498,11 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; - VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); + VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_t bpo; - VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); - VERIFY3U(0, ==, bpobj_iterate(&bpo, - dsl_deadlist_insert_cb, dl, tx)); + VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx)); bpobj_close(&bpo); return; } @@ -507,11 +513,11 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) zap_cursor_advance(&zc)) { uint64_t mintxg = zfs_strtonum(za.za_name, NULL); dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); + VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); } zap_cursor_fini(&zc); - VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); + VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); dlp = bonus->db_data; dmu_buf_will_dirty(bonus, tx); bzero(dlp, sizeof (*dlp)); @@ -520,7 +526,7 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) } /* - * Remove entries on dl that are >= mintxg, and put them on the bpobj. + * Remove entries on dl that are born > mintxg, and put them on the bpobj. 
*/ void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, @@ -546,7 +552,7 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); - VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); ASSERT3U(dl->dl_phys->dl_used, >=, used); ASSERT3U(dl->dl_phys->dl_comp, >=, comp); @@ -555,7 +561,7 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, + VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, dle->dle_mintxg, tx)); dle_next = AVL_NEXT(&dl->dl_tree, dle); diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index aa90f95ac6b6..06fead0ea3f3 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2013 by Joyent, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
@@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -181,70 +182,86 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, dsl_dataset_phys(ds_next)->ds_deadlist_obj); } -struct removeclonesnode { - list_node_t link; - dsl_dataset_t *ds; -}; +typedef struct remaining_clones_key { + dsl_dataset_t *rck_clone; + list_node_t rck_node; +} remaining_clones_key_t; + +static remaining_clones_key_t * +rck_alloc(dsl_dataset_t *clone) +{ + remaining_clones_key_t *rck = kmem_alloc(sizeof (*rck), KM_SLEEP); + rck->rck_clone = clone; + return (rck); +} static void -dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) +dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx, + list_t *stack, void *tag) { - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - list_t clones; - struct removeclonesnode *rcn; + objset_t *mos = dd->dd_pool->dp_meta_objset; - list_create(&clones, sizeof (struct removeclonesnode), - offsetof(struct removeclonesnode, link)); + /* + * If it is the old version, dd_clones doesn't exist so we can't + * find the clones, but dsl_deadlist_remove_key() is a no-op so it + * doesn't matter. + */ + if (dsl_dir_phys(dd)->dd_clones == 0) + return; - rcn = kmem_zalloc(sizeof (struct removeclonesnode), KM_SLEEP); - rcn->ds = ds; - list_insert_head(&clones, rcn); + zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (; rcn != NULL; rcn = list_next(&clones, rcn)) { - zap_cursor_t zc; - zap_attribute_t za; - /* - * If it is the old version, dd_clones doesn't exist so we can't - * find the clones, but dsl_deadlist_remove_key() is a no-op so - * it doesn't matter. 
- */ - if (dsl_dir_phys(rcn->ds->ds_dir)->dd_clones == 0) - continue; - - for (zap_cursor_init(&zc, mos, - dsl_dir_phys(rcn->ds->ds_dir)->dd_clones); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *clone; - - VERIFY0(dsl_dataset_hold_obj(rcn->ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone)); - if (clone->ds_dir->dd_origin_txg > mintxg) { - dsl_deadlist_remove_key(&clone->ds_deadlist, - mintxg, tx); - if (dsl_dataset_remap_deadlist_exists(clone)) { - dsl_deadlist_remove_key( - &clone->ds_remap_deadlist, mintxg, - tx); - } - rcn = kmem_zalloc( - sizeof (struct removeclonesnode), KM_SLEEP); - rcn->ds = clone; - list_insert_tail(&clones, rcn); - } else { - dsl_dataset_rele(clone, FTAG); + for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + dsl_dataset_t *clone; + + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + za->za_first_integer, tag, &clone)); + + if (clone->ds_dir->dd_origin_txg > mintxg) { + dsl_deadlist_remove_key(&clone->ds_deadlist, + mintxg, tx); + + if (dsl_dataset_remap_deadlist_exists(clone)) { + dsl_deadlist_remove_key( + &clone->ds_remap_deadlist, mintxg, tx); } + + list_insert_head(stack, rck_alloc(clone)); + } else { + dsl_dataset_rele(clone, tag); } - zap_cursor_fini(&zc); } + zap_cursor_fini(zc); + + kmem_free(za, sizeof (zap_attribute_t)); + kmem_free(zc, sizeof (zap_cursor_t)); +} + +void +dsl_dir_remove_clones_key(dsl_dir_t *top_dd, uint64_t mintxg, dmu_tx_t *tx) +{ + list_t stack; + + list_create(&stack, sizeof (remaining_clones_key_t), + offsetof(remaining_clones_key_t, rck_node)); + + dsl_dir_remove_clones_key_impl(top_dd, mintxg, tx, &stack, FTAG); + for (remaining_clones_key_t *rck = list_remove_head(&stack); + rck != NULL; rck = list_remove_head(&stack)) { + dsl_dataset_t *clone = rck->rck_clone; + dsl_dir_t *clone_dir = clone->ds_dir; + + kmem_free(rck, sizeof (*rck)); - rcn = list_remove_head(&clones); - kmem_free(rcn, 
sizeof (struct removeclonesnode)); - while ((rcn = list_remove_head(&clones)) != NULL) { - dsl_dataset_rele(rcn->ds, FTAG); - kmem_free(rcn, sizeof (struct removeclonesnode)); + dsl_dir_remove_clones_key_impl(clone_dir, mintxg, tx, + &stack, FTAG); + dsl_dataset_rele(clone, FTAG); } - list_destroy(&clones); + + list_destroy(&stack); } static void @@ -314,11 +331,11 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) obj = ds->ds_object; + boolean_t book_exists = dsl_bookmark_ds_destroyed(ds, tx); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_inuse[f]) { - dsl_dataset_deactivate_feature(obj, f, tx); - ds->ds_feature_inuse[f] = B_FALSE; - } + if (dsl_dataset_feature_is_active(ds, f)) + dsl_dataset_deactivate_feature(ds, f, tx); } if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { ASSERT3P(ds->ds_prev, ==, NULL); @@ -402,9 +419,11 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx); - /* Collapse range in clone heads */ - dsl_dataset_remove_clones_key(ds, - dsl_dataset_phys(ds)->ds_creation_txg, tx); + if (!book_exists) { + /* Collapse range in clone heads */ + dsl_dir_remove_clones_key(ds->ds_dir, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } if (ds_next->ds_is_snapshot) { dsl_dataset_t *ds_nextnext; @@ -432,9 +451,13 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) /* Collapse range in this head. */ dsl_dataset_t *hds; VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); - dsl_deadlist_remove_key(&hds->ds_deadlist, - dsl_dataset_phys(ds)->ds_creation_txg, tx); + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, + FTAG, &hds)); + if (!book_exists) { + /* Collapse range in this head. 
*/ + dsl_deadlist_remove_key(&hds->ds_deadlist, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } if (dsl_dataset_remap_deadlist_exists(hds)) { dsl_deadlist_remove_key(&hds->ds_remap_deadlist, dsl_dataset_phys(ds)->ds_creation_txg, tx); @@ -677,7 +700,8 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { @@ -867,10 +891,8 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) obj = ds->ds_object; for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_inuse[f]) { - dsl_dataset_deactivate_feature(obj, f, tx); - ds->ds_feature_inuse[f] = B_FALSE; - } + if (dsl_dataset_feature_is_active(ds, f)) + dsl_dataset_deactivate_feature(ds, f, tx); } dsl_scan_ds_destroyed(ds, tx); @@ -981,8 +1003,28 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx)); - if (ds->ds_bookmarks != 0) { - VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); + if (ds->ds_bookmarks_obj != 0) { + void *cookie = NULL; + dsl_bookmark_node_t *dbn; + + while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != + NULL) { + if (dbn->dbn_phys.zbm_redaction_obj != 0) { + VERIFY0(dmu_object_free(mos, + dbn->dbn_phys.zbm_redaction_obj, tx)); + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + } + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + } + avl_destroy(&ds->ds_bookmarks); + VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx)); spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 
e8f519b18b01..e929b15cb830 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f3c869538ce1..f0d6021f5d7d 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2017 Datto Inc. * Copyright 2017 Joyent, Inc. 
@@ -1218,6 +1218,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) zil_header_t *zh = zsa->zsa_zh; zbookmark_phys_t zb; + ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); @@ -1250,6 +1251,7 @@ dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; + ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); @@ -1378,7 +1380,7 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) spa_t *spa = scn->scn_dp->dp_spa; scan_prefetch_issue_ctx_t *spic; - if (zfs_no_scrub_prefetch) + if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || @@ -1630,6 +1632,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; + ASSERT(!BP_IS_REDACTED(bp)); + if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; @@ -1783,6 +1787,12 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, return; } + if (BP_IS_REDACTED(bp)) { + ASSERT(dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)); + return; + } + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; diff --git a/module/zfs/objlist.c b/module/zfs/objlist.c new file mode 100644 index 000000000000..c80bab2a77bd --- /dev/null +++ b/module/zfs/objlist.c @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include +#include + +objlist_t * +objlist_create(void) +{ + objlist_t *list = kmem_alloc(sizeof (*list), KM_SLEEP); + list_create(&list->ol_list, sizeof (objlist_node_t), + offsetof(objlist_node_t, on_node)); + list->ol_last_lookup = 0; + return (list); +} + +void +objlist_destroy(objlist_t *list) +{ + for (objlist_node_t *n = list_remove_head(&list->ol_list); + n != NULL; n = list_remove_head(&list->ol_list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&list->ol_list); + kmem_free(list, sizeof (*list)); +} + +/* + * This function looks through the objlist to see if the specified object number + * is contained in the objlist. In the process, it will remove all object + * numbers in the list that are smaller than the specified object number. Thus, + * any lookup of an object number smaller than a previously looked up object + * number will always return false; therefore, all lookups should be done in + * ascending order. + */ +boolean_t +objlist_exists(objlist_t *list, uint64_t object) +{ + objlist_node_t *node = list_head(&list->ol_list); + ASSERT3U(object, >=, list->ol_last_lookup); + list->ol_last_lookup = object; + while (node != NULL && node->on_object < object) { + VERIFY3P(node, ==, list_remove_head(&list->ol_list)); + kmem_free(node, sizeof (*node)); + node = list_head(&list->ol_list); + } + return (node != NULL && node->on_object == object); +} + +/* + * The objlist is a list of object numbers stored in ascending order. However, + * the insertion of new object numbers does not seek out the correct location to + * store a new object number; instead, it appends it to the list for simplicity. + * Thus, any users must take care to only insert new object numbers in ascending + * order. 
+ */ +void +objlist_insert(objlist_t *list, uint64_t object) +{ + objlist_node_t *node = kmem_zalloc(sizeof (*node), KM_SLEEP); + node->on_object = object; +#ifdef ZFS_DEBUG + objlist_node_t *last_object = list_tail(&list->ol_list); + uint64_t last_objnum = (last_object != NULL ? last_object->on_object : + 0); + ASSERT3U(node->on_object, >, last_objnum); +#endif + list_insert_tail(&list->ol_list, node); +} diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index a151aceaecfb..6f94ef222c4a 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -86,7 +86,7 @@ refcount_destroy_many(refcount_t *rc, uint64_t number) { reference_t *ref; - ASSERT(rc->rc_count == number); + ASSERT3U(rc->rc_count, ==, number); while ((ref = list_head(&rc->rc_list))) { list_remove(&rc->rc_list, ref); kmem_cache_free(reference_cache, ref); @@ -132,7 +132,7 @@ refcount_add_many(refcount_t *rc, uint64_t number, void *holder) ref->ref_number = number; } mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= 0); + ASSERT3U(rc->rc_count, >=, 0); if (rc->rc_tracked) list_insert_head(&rc->rc_list, ref); rc->rc_count += number; @@ -155,7 +155,7 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) int64_t count; mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= number); + ASSERT3U(rc->rc_count, >=, number); if (!rc->rc_tracked) { rc->rc_count -= number; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 7bf9cde5b302..52bcccb2b409 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2105,7 +2105,8 @@ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (0); /* * Note: normally this routine will not be called if diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index fc0fbbf59e40..ae92445427f3 
100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -176,6 +176,7 @@ #include #include #include +#include #include #include #include @@ -193,6 +194,7 @@ #include #include +#include #include #include #include @@ -267,7 +269,8 @@ typedef struct zfs_ioc_key { typedef enum { NO_NAME, POOL_NAME, - DATASET_NAME + DATASET_NAME, + ENTITY_NAME } zfs_ioc_namecheck_t; typedef enum { @@ -3717,6 +3720,37 @@ zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) return (dsl_get_bookmarks(fsname, innvl, outnvl)); } +/* + * innvl is not used. + * + * outnvl: { + * property 1, property 2, ... + * } + * + */ +static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { + /* no nvl keys */ +}; + +/* ARGSUSED */ +static int +zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, + nvlist_t *outnvl) +{ + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + char *bmname; + + bmname = strchr(bookmark, '#'); + if (bmname == NULL) + return (SET_ERROR(EINVAL)); + bmname++; + + (void) strlcpy(fsname, bookmark, sizeof (fsname)); + *(strchr(fsname, '#')) = '\0'; + + return (dsl_get_bookmark_props(fsname, bmname, outnvl)); +} + /* * innvl: { * bookmark name 1, bookmark name 2 @@ -3954,6 +3988,40 @@ recursive_unmount(const char *fsname, void *arg) return (0); } +/* + * + * snapname is the snapshot to redact. 
+ * innvl: { + * "bookname" -> (string) + * name of the redaction bookmark to generate + * "snapnv" -> (nvlist, values ignored) + * snapshots to redact snapname with respect to + * } + * + * outnvl is unused + */ + +/* ARGSUSED */ +static const zfs_ioc_key_t zfs_keys_redact[] = { + {"bookname", DATA_TYPE_STRING, 0}, + {"snapnv", DATA_TYPE_NVLIST, 0}, +}; +static int +zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) +{ + nvlist_t *redactnvl = NULL; + char *redactbook = NULL; + + if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) + return (SET_ERROR(EINVAL)); + if (fnvlist_num_pairs(redactnvl) == 0) + return (SET_ERROR(ENXIO)); + if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) + return (SET_ERROR(EINVAL)); + + return (dmu_redact_snap(snapname, redactnvl, redactbook)); +} + /* * inputs: * zc_name old name of dataset @@ -4460,6 +4528,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; + boolean_t tofs_was_redacted; file_t *input_fp; *read_bytes = 0; @@ -4470,10 +4539,13 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, if (input_fp == NULL) return (SET_ERROR(EBADF)); + off = input_fp->f_offset; error = dmu_recv_begin(tofs, tosnap, begin_record, force, - resumable, localprops, hidden_args, origin, &drc); + resumable, localprops, hidden_args, origin, &drc, input_fp->f_vnode, + &off); if (error != 0) goto out; + tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied @@ -4574,9 +4646,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_free(xprops); } - off = input_fp->f_offset; - error = dmu_recv_stream(&drc, input_fp->f_vnode, &off, cleanup_fd, - action_handle); + error = dmu_recv_stream(&drc, 
cleanup_fd, action_handle, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; @@ -4586,6 +4656,9 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, /* online recv */ dsl_dataset_t *ds; int end_err; + boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( + begin_record->drr_u.drr_begin. + drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); @@ -4594,8 +4667,17 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); - if (error == 0) + /* + * If the dataset was not redacted, but we received a + * redacted stream onto it, we need to unmount the + * dataset. Otherwise, resume the filesystem. + */ + if (error == 0 && !drc.drc_newfs && + stream_is_redacted && !tofs_was_redacted) { + error = zfs_end_fs(zfsvfs, ds); + } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); + } error = error ? 
error : end_err; deactivate_super(zfsvfs->z_sb); } else if ((zv = zvol_suspend(tofs)) != NULL) { @@ -4952,6 +5034,49 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) return (error); } +typedef struct dump_bytes_io { + vnode_t *dbi_vp; + void *dbi_buf; + int dbi_len; + int dbi_err; +} dump_bytes_io_t; + +static void +dump_bytes_cb(void *arg) +{ + dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; + ssize_t resid; /* have to get resid to get detailed errno */ + + dbi->dbi_err = vn_rdwr(UIO_WRITE, dbi->dbi_vp, + (caddr_t)dbi->dbi_buf, dbi->dbi_len, + 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); +} + +static int +dump_bytes(objset_t *os, void *buf, int len, void *arg) +{ + dump_bytes_io_t dbi; + + dbi.dbi_vp = arg; + dbi.dbi_buf = buf; + dbi.dbi_len = len; + +#if defined(HAVE_LARGE_STACKS) + dump_bytes_cb(&dbi); +#else + /* + * The vn_rdwr() call is performed in a taskq to ensure that there is + * always enough stack space to write safely to the target filesystem. + * The ZIO_TYPE_FREE threads are used because there can be a lot of + * them and they are used in vdev_file.c for a similar purpose. 
+ */ + spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, + ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); +#endif /* HAVE_LARGE_STACKS */ + + return (dbi.dbi_err); +} + /* * inputs: * zc_name name of snapshot to send @@ -5027,8 +5152,8 @@ zfs_ioc_send(zfs_cmd_t *zc) } } - error = dmu_send_estimate(tosnap, fromsnap, compressok || rawok, - &zc->zc_objset_type); + error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, + compressok || rawok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); @@ -5040,9 +5165,13 @@ zfs_ioc_send(zfs_cmd_t *zc) return (SET_ERROR(EBADF)); off = fp->f_offset; + dmu_send_outparams_t out = {0}; + out.dso_outfunc = dump_bytes; + out.dso_arg = fp->f_vnode; + out.dso_dryrun = B_FALSE; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, - zc->zc_cookie, fp->f_vnode, &off); + zc->zc_cookie, &off, &out); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; @@ -5053,18 +5182,19 @@ zfs_ioc_send(zfs_cmd_t *zc) /* * inputs: - * zc_name name of snapshot on which to report progress - * zc_cookie file descriptor of send stream + * zc_name name of snapshot on which to report progress + * zc_cookie file descriptor of send stream * * outputs: - * zc_cookie number of bytes written in send stream thus far + * zc_cookie number of bytes written in send stream thus far + * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; - dmu_sendarg_t *dsp = NULL; + dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); @@ -5088,15 +5218,19 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { - if (dsp->dsa_outfd == zc->zc_cookie && - dsp->dsa_proc->group_leader == curproc->group_leader) + if (dsp->dss_outfd == zc->zc_cookie && + 
dsp->dss_proc == curproc) break; } - if (dsp != NULL) - zc->zc_cookie = *(dsp->dsa_off); - else + if (dsp != NULL) { + zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, + 0, 0); + /* This is the closest thing we have to atomic_read_64. */ + zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); + } else { error = SET_ERROR(ENOENT); + } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); @@ -5949,8 +6083,8 @@ zfs_ioc_events_seek(zfs_cmd_t *zc) /* * inputs: - * zc_name name of new filesystem or snapshot - * zc_value full name of old snapshot + * zc_name name of later filesystem or snapshot + * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes @@ -5962,7 +6096,7 @@ zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; - dsl_dataset_t *new, *old; + dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) @@ -5972,16 +6106,26 @@ zfs_ioc_space_written(zfs_cmd_t *zc) dsl_pool_rele(dp, FTAG); return (error); } - error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } + if (strchr(zc->zc_value, '#') != NULL) { + zfs_bookmark_phys_t bmp; + error = dsl_bookmark_lookup(dp, zc->zc_value, + new, &bmp); + if (error == 0) { + error = dsl_dataset_space_written_bookmark(&bmp, new, + &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + } + } else { + dsl_dataset_t *old; + error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); - error = dsl_dataset_space_written(old, new, &zc->zc_cookie, - &zc->zc_objset_type, &zc->zc_perm_action); - dsl_dataset_rele(old, FTAG); + if (error == 0) { + error = dsl_dataset_space_written(old, new, + &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + dsl_dataset_rele(old, FTAG); + } + } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); @@ -6061,6 +6205,9 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t 
*innvl, nvlist_t *outnvl) * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. + * (optional) "redactbook" -> (string) + * if present, use this bookmark's redaction list to generate a redacted + * send stream * } * * outnvl is unused @@ -6074,6 +6221,7 @@ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ @@ -6091,6 +6239,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) boolean_t rawok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; + char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); @@ -6104,12 +6253,18 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); + (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); + if ((fp = getf(fd)) == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; + dmu_send_outparams_t out = {0}; + out.dso_outfunc = dump_bytes; + out.dso_arg = fp->f_vnode; + out.dso_dryrun = B_FALSE; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, - rawok, fd, resumeobj, resumeoff, fp->f_vnode, &off); + rawok, resumeobj, resumeoff, redactbook, fd, &off, &out); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; @@ -6118,6 +6273,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) return (error); } +/* ARGSUSED */ +int +send_space_sum(objset_t *os, void *buf, int len, void *arg) +{ + uint64_t *size = arg; + *size += len; + return (0); +} + /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be 
written to the fd supplied to zfs_ioc_send_new(). @@ -6133,6 +6297,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. + * (optional) "fd" -> file descriptor to use as a cookie for progress + * tracking (int32) * } * * outnvl: { @@ -6146,6 +6312,11 @@ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, + {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"resumeobj", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"resumeoff", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int @@ -6153,11 +6324,21 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; + dsl_dataset_t *fromsnap = NULL; int error; - char *fromname; + char *fromname = NULL; + char *redactlist_book = NULL; + boolean_t largeblockok; + boolean_t embedok; boolean_t compressok; boolean_t rawok; - uint64_t space; + uint64_t space = 0; + boolean_t full_estimate = B_FALSE; + uint64_t resumeobj = 0; + uint64_t resumeoff = 0; + uint64_t resume_bytes = 0; + int32_t fd = -1; + zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) @@ -6168,61 +6349,101 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) dsl_pool_rele(dp, FTAG); return (error); } + (void) nvlist_lookup_int32(innvl, "fd", &fd); + largeblockok = nvlist_exists(innvl, "largeblockok"); + embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); + boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); + boolean_t altbook = (nvlist_lookup_string(innvl, 
"redactbook", + &redactlist_book) == 0); + + (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); + (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); + (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); + + if (altbook) { + full_estimate = B_TRUE; + } else if (from) { + if (strchr(fromname, '#')) { + error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); - error = nvlist_lookup_string(innvl, "from", &fromname); - if (error == 0) { - if (strchr(fromname, '@') != NULL) { /* - * If from is a snapshot, hold it and use the more - * efficient dmu_send_estimate to estimate send space - * size using deadlists. + * dsl_bookmark_lookup() will fail with EXDEV if + * the from-bookmark and tosnap are at the same txg. + * However, it's valid to do a send (and therefore, + * a send estimate) from and to the same time point, + * if the bookmark is redacted (the incremental send + * can change what's redacted on the target). In + * this case, dsl_bookmark_lookup() fills in zbm + * but returns EXDEV. Ignore this error. */ - dsl_dataset_t *fromsnap; + if (error == EXDEV && zbm.zbm_redaction_obj != 0 && + zbm.zbm_guid == + dsl_dataset_phys(tosnap)->ds_guid) + error = 0; + + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & + ZBM_FLAG_HAS_FBN)) { + full_estimate = B_TRUE; + } + } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); - if (error != 0) - goto out; - error = dmu_send_estimate(tosnap, fromsnap, - compressok || rawok, &space); - dsl_dataset_rele(fromsnap, FTAG); - } else if (strchr(fromname, '#') != NULL) { - /* - * If from is a bookmark, fetch the creation TXG of the - * snapshot it was created from and use that to find - * blocks that were born after it. 
- */ - zfs_bookmark_phys_t frombm; - - error = dsl_bookmark_lookup(dp, fromname, tosnap, - &frombm); - if (error != 0) - goto out; - error = dmu_send_estimate_from_txg(tosnap, - frombm.zbm_creation_txg, compressok || rawok, - &space); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + + if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { + full_estimate = B_TRUE; + dsl_dataset_rele(fromsnap, FTAG); + } } else { /* * from is not properly formatted as a snapshot or * bookmark */ - error = SET_ERROR(EINVAL); - goto out; + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (SET_ERROR(EINVAL)); } - } else { + } + + if (full_estimate) { + dmu_send_outparams_t out = {0}; + offset_t off = 0; + out.dso_outfunc = send_space_sum; + out.dso_arg = &space; + out.dso_dryrun = B_TRUE; /* - * If estimating the size of a full send, use dmu_send_estimate. + * We have to release these holds so dmu_send can take them. It + * will do all the error checking we need. */ - error = dmu_send_estimate(tosnap, NULL, compressok || rawok, - &space); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + error = dmu_send(snapname, fromname, embedok, largeblockok, + compressok, rawok, resumeobj, resumeoff, redactlist_book, + fd, &off, &out); + } else { + error = dmu_send_estimate_fast(tosnap, fromsnap, + (from && strchr(fromname, '#') != NULL ? 
&zbm : NULL), + compressok || rawok, &space); + space -= resume_bytes; + if (fromsnap != NULL) + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); -out: - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); return (error); } @@ -6583,6 +6804,11 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); + zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, + zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, + ARRAY_SIZE(zfs_keys_get_bookmark_props)); + zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, @@ -6622,6 +6848,11 @@ zfs_ioctl_init(void) B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); + zfs_ioctl_register("redact", ZFS_IOC_REDACT, + zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); + zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, @@ -6857,7 +7088,8 @@ pool_status_check(const char *name, zfs_ioc_namecheck_t type, spa_t *spa; int error; - ASSERT(type == POOL_NAME || type == DATASET_NAME); + ASSERT(type == POOL_NAME || type == DATASET_NAME || + type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); @@ -7128,10 +7360,18 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) vec->zvec_namecheck, vec->zvec_pool_check); break; + case ENTITY_NAME: + if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { + error = SET_ERROR(EINVAL); + } else { + error = pool_status_check(zc->zc_name, + vec->zvec_namecheck, vec->zvec_pool_check); + } + 
break; + case NO_NAME: break; } - /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 205773ef35d7..c2c6f4df0d2e 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include "zfs_comutil.h" @@ -2164,11 +2165,14 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) mutex_exit(&zfsvfs->z_znodes_lock); bail: + if (err != 0) + zfsvfs->z_unmounted = B_TRUE; + /* release the VFS ops */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - if (err) { + if (err != 0) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. @@ -2179,6 +2183,37 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) return (err); } +/* + * Release VOPs and unmount a suspended filesystem. + */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + VERIFY0(dmu_objset_from_ds(ds, &os)); + zfsvfs->z_os = os; + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + /* + * Try to force unmount this file system. 
+ */ + (void) zfs_umount(zfsvfs->z_sb); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { @@ -2353,6 +2388,71 @@ zfs_get_vfs_flag_unmounted(objset_t *os) return (unmounted); } +struct objnode { + avl_node_t node; + uint64_t obj; +}; + +static int +objnode_compare(const void *o1, const void *o2) +{ + const struct objnode *obj1 = o1; + const struct objnode *obj2 = o2; + if (obj1->obj < obj2->obj) + return (-1); + if (obj1->obj > obj2->obj) + return (1); + return (0); +} + +objlist_t * +zfs_get_deleteq(objset_t *os) +{ + objlist_t *deleteq_objlist = objlist_create(); + uint64_t deleteq_obj; + zap_cursor_t zc; + zap_attribute_t za; + dmu_object_info_t doi; + + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE); + + VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + + /* + * In order to insert objects into the objlist, they must be in sorted + * order. We don't know what order we'll get them out of the ZAP in, so + * we insert them into and remove them from an avl_tree_t to sort them. 
+ */ + avl_tree_t at; + avl_create(&at, objnode_compare, sizeof (struct objnode), + offsetof(struct objnode, node)); + + for (zap_cursor_init(&zc, os, deleteq_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP); + obj->obj = za.za_first_integer; + avl_add(&at, obj); + } + zap_cursor_fini(&zc); + + struct objnode *next, *found = avl_first(&at); + while (found != NULL) { + next = AVL_NEXT(&at, found); + objlist_insert(deleteq_objlist, found->obj); + found = next; + } + + void *cookie = NULL; + while ((found = avl_destroy_nodes(&at, &cookie)) != NULL) + kmem_free(found, sizeof (*found)); + avl_destroy(&at); + return (deleteq_objlist); +} + + void zfs_init(void) { diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 88bd7831eb5c..fe9357b6a353 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4727,6 +4727,9 @@ zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, zb1->zb_blkid == zb2->zb_blkid) return (0); + IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); + IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); + /* * BP_SPANB calculates the span in blocks. 
*/ diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4b41c3f743ca..34e87797e035 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -693,6 +693,15 @@ tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] tags = ['functional', 'quota'] +[tests/functional/redacted_send] +tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', + 'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes', + 'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones', + 'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative', + 'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size', + 'redacted_volume'] +tags = ['functional', 'redacted_send'] + [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos'] tags = ['functional', 'raidz'] diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 39a538d2d2a3..09c59f591a83 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -8,6 +8,7 @@ SUBDIRS = \ file_check \ file_trunc \ file_write \ + get_diff \ largest_file \ libzfs_input_check \ mkbusy \ @@ -24,4 +25,5 @@ SUBDIRS = \ rename_dir \ rm_lnkcnt_zero_file \ threadsappend \ - xattrtest + xattrtest \ + stride_dd diff --git a/tests/zfs-tests/cmd/get_diff/Makefile.am b/tests/zfs-tests/cmd/get_diff/Makefile.am new file mode 100644 index 000000000000..06c39ddd81ce --- /dev/null +++ b/tests/zfs-tests/cmd/get_diff/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = get_diff +get_diff_SOURCES = get_diff.c diff --git a/tests/zfs-tests/cmd/get_diff/get_diff.c b/tests/zfs-tests/cmd/get_diff/get_diff.c new file mode 100644 index 000000000000..2799f46b0747 --- /dev/null +++ b/tests/zfs-tests/cmd/get_diff/get_diff.c @@ -0,0 +1,109 @@ +/* + * This file and its contents are supplied under the 
terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void +usage(char *msg, int exit_value) +{ + (void) fprintf(stderr, "get_diff file redacted_file\n"); + (void) fprintf(stderr, "%s\n", msg); + exit(exit_value); +} + +/* + * This utility compares two files, an original and its redacted counterpart + * (in that order). It compares the files 512 bytes at a time, printing out + * any ranges (as offset and length) where the redacted file does not match + * the original. This output is used to verify that the expected ranges of + * a redacted file do not contain the original data. + */ +int +main(int argc, char *argv[]) +{ + off_t diff_off = 0, diff_len = 0, off = 0; + int fd1, fd2; + char *fname1, *fname2; + char buf1[DEV_BSIZE], buf2[DEV_BSIZE]; + ssize_t bytes; + + if (argc != 3) + usage("Incorrect number of arguments.", 1); + + if ((fname1 = argv[1]) == NULL) + usage("Filename missing.", 1); + if ((fd1 = open(fname1, O_LARGEFILE | O_RDONLY)) < 0) { + perror("open1 failed"); + exit(1); + } + + if ((fname2 = argv[2]) == NULL) + usage("Redacted filename missing.", 1); + if ((fd2 = open(fname2, O_LARGEFILE | O_RDONLY)) < 0) { + perror("open2 failed"); + exit(1); + } + + while ((bytes = pread(fd1, buf1, DEV_BSIZE, off)) > 0) { + if (pread(fd2, buf2, DEV_BSIZE, off) < 0) { + if (errno == EIO) { + /* + * A read in a redacted section of a file will + * fail with EIO. If we get EIO, continue on + * but ensure that a comparison of buf1 and + * buf2 will fail, indicating a redacted block. 
+ */ + buf2[0] = ~buf1[0]; + } else { + perror("pread failed"); + exit(1); + } + } + if (memcmp(buf1, buf2, bytes) == 0) { + if (diff_len != 0) { + (void) fprintf(stdout, "%lld,%lld\n", + (long long)diff_off, (long long)diff_len); + assert(off == diff_off + diff_len); + diff_len = 0; + } + diff_off = 0; + } else { + if (diff_len == 0) + diff_off = off; + assert(off == diff_off + diff_len); + diff_len += bytes; + } + off += bytes; + } + + if (diff_len != 0) { + (void) fprintf(stdout, "%lld,%lld\n", (long long)diff_off, + (long long)diff_len); + } + + (void) close(fd1); + (void) close(fd2); + + return (0); +} diff --git a/tests/zfs-tests/cmd/stride_dd/.gitignore b/tests/zfs-tests/cmd/stride_dd/.gitignore new file mode 100644 index 000000000000..7c072ee0dec6 --- /dev/null +++ b/tests/zfs-tests/cmd/stride_dd/.gitignore @@ -0,0 +1 @@ +/stride_dd diff --git a/tests/zfs-tests/cmd/stride_dd/Makefile.am b/tests/zfs-tests/cmd/stride_dd/Makefile.am new file mode 100644 index 000000000000..d6f1adbac2b7 --- /dev/null +++ b/tests/zfs-tests/cmd/stride_dd/Makefile.am @@ -0,0 +1,7 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = stride_dd +stride_dd_SOURCES = stride_dd.c +stride_dd_LDADD = -lrt diff --git a/tests/zfs-tests/cmd/stride_dd/stride_dd.c b/tests/zfs-tests/cmd/stride_dd/stride_dd.c new file mode 100644 index 000000000000..e11b94ba3217 --- /dev/null +++ b/tests/zfs-tests/cmd/stride_dd/stride_dd.c @@ -0,0 +1,214 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include + +static int bsize = 0; +static int count = 0; +static char *ifile = NULL; +static char *ofile = NULL; +static int stride = 0; +static int seek = 0; +static char *execname = "stride_dd"; + +static void usage(void); +static void parse_options(int argc, char *argv[]); + +static void +usage(void) +{ + (void) fprintf(stderr, + "usage: %s -i inputfile -o outputfile -b blocksize -c count \n" + " -s stride [ -k seekblocks]\n" + "\n" + "Simplified version of dd that supports the stride option.\n" + "A stride of n means that for each block written, n - 1 blocks\n" + "are skipped in both the input and output file. A stride of 1\n" + "means that blocks are read and written consecutively.\n" + "All numeric parameters must be integers.\n" + "\n" + " inputfile: File to read from\n" + " outputfile: File to write to\n" + " blocksize: Size of each block to read/write\n" + " count: Number of blocks to read/write\n" + " stride: Read/write a block then skip (stride - 1) blocks\n" + " seekblocks: Number of blocks to skip at start of output\n", + execname); + (void) exit(1); +} + +static void +parse_options(int argc, char *argv[]) +{ + int c; + int errflag = 0; + + execname = argv[0]; + + extern char *optarg; + extern int optind, optopt; + + while ((c = getopt(argc, argv, ":b:c:i:o:s:k:")) != -1) { + switch (c) { + case 'b': + bsize = atoi(optarg); + break; + + case 'c': + count = atoi(optarg); + break; + + case 'i': + ifile = optarg; + break; + + case 'o': + ofile = optarg; + break; + + case 's': + stride = atoi(optarg); + break; + + case 'k': + seek = atoi(optarg); + break; + + case ':': + (void) fprintf(stderr, + "Option -%c requires an operand\n", optopt); + errflag++; + break; + + case '?': + default: + (void) fprintf(stderr, + "Unrecognized option: -%c\n", optopt); + errflag++; + break; + } + + if (errflag) { + (void) usage(); + } + } + + if (bsize <= 0 || count <= 0 || stride <= 0 || ifile == NULL || + ofile == 
NULL || seek < 0) { + (void) fprintf(stderr, + "Required parameter(s) missing or invalid.\n"); + (void) usage(); + } +} + +int +main(int argc, char *argv[]) +{ + int i; + int ifd; + int ofd; + void *buf; + int c; + + parse_options(argc, argv); + + ifd = open(ifile, O_RDONLY); + if (ifd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ifile); + perror("open"); + exit(2); + } + + ofd = open(ofile, O_WRONLY | O_CREAT, 0666); + if (ofd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ofile); + perror("open"); + exit(2); + } + + /* + * We use valloc because some character block devices expect a + * page-aligned buffer. + */ + buf = valloc(bsize); + if (buf == NULL) { + (void) fprintf(stderr, + "%s: %s\n", execname, "not enough memory"); + exit(2); + } + + if (seek > 0) { + if (lseek(ofd, seek * bsize, SEEK_CUR) == -1) { + perror("output lseek"); + exit(2); + } + } + + for (i = 0; i < count; i++) { + c = read(ifd, buf, bsize); + if (c != bsize) { + + perror("read"); + exit(2); + } + if (c != bsize) { + if (c < 0) { + perror("read"); + } else { + (void) fprintf(stderr, + "%s: unexpected short read, read %d " + "bytes, expected %d\n", execname, + c, bsize); + } + exit(2); + } + + c = write(ofd, buf, bsize); + if (c != bsize) { + if (c < 0) { + perror("write"); + } else { + (void) fprintf(stderr, + "%s: unexpected short write, wrote %d " + "bytes, expected %d\n", execname, + c, bsize); + } + exit(2); + } + + if (stride > 1) { + if (lseek(ifd, (stride - 1) * bsize, SEEK_CUR) == -1) { + perror("input lseek"); + exit(2); + } + if (lseek(ofd, (stride - 1) * bsize, SEEK_CUR) == -1) { + perror("output lseek"); + exit(2); + } + } + } + + (void) close(ofd); + (void) close(ifd); + + return (0); +} + + diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 7cdaa394aaf1..d855020f386d 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -1,4 +1,5 @@ # +# Copyright (c) 2016, 2018 by Delphix. 
All rights reserved. # These variables are used by zfs-tests.sh to constrain which utilities # may be used by the suite. The suite will create a directory which is # the only element of $PATH and create symlinks from that dir to the @@ -159,6 +160,7 @@ export ZFSTEST_FILES='chg_usr_exec file_check file_trunc file_write + get_diff largest_file libzfs_input_check mkbusy @@ -176,4 +178,5 @@ export ZFSTEST_FILES='chg_usr_exec rm_lnkcnt_zero_file threadsappend user_ns_exec - xattrtest' + xattrtest + stride_dd' diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index e5a3d63a40d3..39e21b5cc6d8 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -391,7 +391,8 @@ function create_recv_clone log_must eval "zfs send $snap | zfs recv -u $recvfs" log_must mkfile 1m "$mountpoint/data" log_must zfs snapshot $incr - log_must eval "zfs send -i $snap $incr | dd bs=10K count=1 > $sendfile" + log_must eval "zfs send -i $snap $incr | dd bs=10K count=1 \ + iflag=fullblock > $sendfile" log_mustnot eval "zfs recv -su $recvfs < $sendfile" destroy_dataset "$sendfs" "-r" log_must rm -f "$sendfile" diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am index e0a4aca99cb4..1c14a20ad13d 100644 --- a/tests/zfs-tests/tests/functional/Makefile.am +++ b/tests/zfs-tests/tests/functional/Makefile.am @@ -51,6 +51,7 @@ SUBDIRS = \ projectquota \ quota \ raidz \ + redacted_send \ redundancy \ refquota \ refreserv \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh new file mode 100644 index 000000000000..447fbb36b412 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution 
License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# When a snapshot is destroyed, we used to recurse all clones +# that are downstream of the destroyed snapshot (e.g. to remove +# its key and merge its deadlist entries to the previous one). +# This recursion would break the stack on deeply nested clone +# hierarchies. To avoid this problem today, we keep heap-allocated +# records of all the clones as we traverse their hierarchy. +# +# This test ensures and showcases that our new method works with +# deeply nested clone hierarchies. +# +# STRATEGY: +# 1. Create an fs and take a snapshot of it (snapshot foo) +# 2. Take a second snapshot of the same fs (snapshot bar) on +# top of snapshot foo +# 3. Create a clone of snapshot bar and then take a snapshot +# of it. +# 4. Create a clone of the newly-created snapshot and then +# take a snapshot of it. +# 5. Repeat step [4] many times to create a deeply nested hierarchy. +# 6. Destroy snapshot foo. 
+# + +verify_runnable "both" + +typeset FS0=$TESTPOOL/0 +typeset FOO=foo +typeset BAR=BAR + +typeset FS0SNAPFOO=$FS0@$FOO +typeset FS0SNAPBAR=$FS0@$BAR + +typeset -i numds=300 + +log_must zfs create $FS0 + +function test_cleanup +{ + log_must zfs destroy -Rf $FS0 + + return 0 +} + +log_must zfs snapshot $FS0SNAPFOO +log_must zfs snapshot $FS0SNAPBAR + +log_onexit test_cleanup + +for (( i=1; i /dev/null" log_mustnot eval "zfs send -b$opt $SENDFS#bm > /dev/null" - log_mustnot eval "zfs send -b$opt -i $SENDFS#bm $SENDFS@s2 > /dev/null" done # Do 3..6 in a loop to verify various combination of "zfs send" options diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_006_pos.ksh index 7192551b6c5d..61314d30b369 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_006_pos.ksh @@ -15,7 +15,7 @@ # # -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2012, 2018 by Delphix. All rights reserved. # . 
$STF_SUITE/include/libtest.shlib @@ -36,6 +36,7 @@ verify_runnable "both" function cleanup { + mdb_set_uint32 zfs_override_estimate_recordsize 8192 for ds in $datasets; do destroy_dataset $ds "-rf" done @@ -90,6 +91,7 @@ function verify_size_estimates log_assert "Verify 'zfs send -nvP' generates valid stream estimates" log_onexit cleanup +mdb_set_uint32 zfs_override_estimate_recordsize 0 typeset -l block_count=0 typeset -l block_size typeset -i PERCENT=1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 8907533c9e4b..961da44ff197 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -76,6 +76,9 @@ typeset -a properties=( "feature@obsolete_counts" "feature@zpool_checkpoint" "feature@spacemap_v2" + "feature@redaction_bookmarks" + "feature@redacted_datasets" + "feature@bookmark_written" ) # Additional properties added for Linux. 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/Makefile.am b/tests/zfs-tests/tests/functional/redacted_send/Makefile.am new file mode 100644 index 000000000000..dd6b4eb679a3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/Makefile.am @@ -0,0 +1,25 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/redacted_send +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + redacted_compressed.ksh \ + redacted_contents.ksh \ + redacted_deleted.ksh \ + redacted_disabled_feature.ksh \ + redacted_embedded.ksh \ + redacted_holes.ksh \ + redacted_incrementals.ksh \ + redacted_largeblocks.ksh \ + redacted_many_clones.ksh \ + redacted_mixed_recsize.ksh \ + redacted_mounts.ksh \ + redacted_negative.ksh \ + redacted_origin.ksh \ + redacted_props.ksh \ + redacted_resume.ksh \ + redacted_size.ksh \ + redacted_volume.ksh + +dist_pkgdata_DATA = \ + redacted.cfg \ + redacted.kshlib diff --git a/tests/zfs-tests/tests/functional/redacted_send/cleanup.ksh b/tests/zfs-tests/tests/functional/redacted_send/cleanup.ksh new file mode 100755 index 000000000000..9768b795e91f --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/cleanup.ksh @@ -0,0 +1,33 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +destroy_pool $POOL +destroy_pool $POOL2 +echo "0" > /sys/module/zfs/parameters/zfs_allow_redacted_dataset_mount + +log_pass diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted.cfg b/tests/zfs-tests/tests/functional/redacted_send/redacted.cfg new file mode 100644 index 000000000000..f964b37bad3b --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted.cfg @@ -0,0 +1,86 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +export DISK1=$(echo $DISKS | awk '{print $1}') +export DISK2=$(echo $DISKS | awk '{print $2}') + +export POOL=$TESTPOOL +export POOL2=$TESTPOOL2 +export FS=$TESTFS +export FS2=$TESTFS2 + +# +# These are the byte ranges that differ between files and their redacted +# counterparts. See compare_files() for more detail. 
+# +typeset RANGE0="0,2097152" +typeset RANGE1="0,131072" +typeset RANGE2="1048576,2097152" +typeset RANGE3="0,131072 +1966080,131072 +3932160,131072" +typeset RANGE4="0,131072 +262144,131072 +524288,131072 +786432,131072" +typeset RANGE5="0,1048576 +7340032,1048576" +typeset RANGE6="393216,131072 +655360,131072 +917504,131072 +1179648,131072 +1441792,393216 +1966080,393216 +2621440,262144 +3145728,262144 +3670016,262144 +4194304,262144 +4718592,262144 +5242880,262144" +typeset RANGE7="1048576,6291456" +typeset RANGE8="4063232,131072" +typeset RANGE9="0,131072 +262144,131072 +524288,131072 +786432,131072 +1048576,131072 +1310720,131072 +1572864,131072 +1835008,131072 +2097152,131072 +2359296,131072 +2621440,131072 +2883584,131072 +3145728,131072 +3407872,131072 +3670016,131072 +3932160,131072" +typeset RANGE10="0,393216" +typeset RANGE11="0,1048576" +typeset RANGE12="0,2097152" +typeset RANGE13="0,16384" +typeset RANGE14="" +typeset RANGE15="0,4194304" +typeset RANGE16="0,6291456" \ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted.kshlib b/tests/zfs-tests/tests/functional/redacted_send/redacted.kshlib new file mode 100644 index 000000000000..fd1bd5d009d0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted.kshlib @@ -0,0 +1,269 @@ +#!/bin/ksh + +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016, 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/rsend/rsend.kshlib +. $STF_SUITE/tests/functional/redacted_send/redacted.cfg + +function setup_dataset +{ + typeset ds_name=$1 + typeset opts=$2 + typeset file_create_func=$3 + typeset sendfs="$POOL/$ds_name" + [[ -n $file_create_func ]] || file_create_func=setup_common + + log_must zfs create $opts $sendfs + + $file_create_func $sendfs + + log_must zfs snapshot $sendfs@snap + log_must zfs clone $opts $sendfs@snap $POOL/${ds_name}_clone + log_must zfs snapshot $POOL/${ds_name}_clone@snap +} + +function setup_common +{ + typeset sendfs=$1 + + typeset mntpnt=$(get_prop mountpoint $sendfs) + typeset bs=$(get_prop recsize $sendfs) + log_must dd if=/dev/urandom of=$mntpnt/f1 bs=$bs count=16 + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=$bs count=32 +} + +function setup_embedded +{ + typeset sendfs=$1 + + typeset recsize + typeset mntpnt=$(get_prop mountpoint $sendfs) + for recsize in 512 1024 2048 4096 8192 16384; do + if is_linux; then + log_must dd if=/dev/urandom of=$mntpnt/$recsize bs=8 \ + count=1 seek=$(((recsize / 8) - 1)) + else + log_must mkholes -d $((recsize - 8)):8 $mntpnt/$recsize + fi + done +} + +function setup_holes +{ + typeset sendfs=$1 + + typeset mntpnt=$(get_prop mountpoint $sendfs) + typeset M=$((1024 * 1024)) + + if is_linux; then + log_must dd if=/dev/urandom of=$mntpnt/f1 bs=8M count=1 + + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=1M count=1 + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=1M count=1 seek=7 \ + conv=notrunc + + log_must dd if=/dev/urandom of=$mntpnt/f3 bs=1M count=6 seek=1 + log_must truncate $mntpnt/f3 --size=$((8 * M)) + + log_must truncate $mntpnt/f4 
--size=$((8 * M)) + else + log_must mkholes -d 0:$((8 * M)) $mntpnt/f1 + log_must mkholes -d 0:$M -d $((7 * M)):$M $mntpnt/f2 + log_must mkholes -d $M:$((6 * M)) -h $((7 * M)):$M $mntpnt/f3 + log_must mkholes -h 0:$((8 * M)) $mntpnt/f4 + fi + + log_must zfs create $sendfs/manyrm + for i in {1..256}; do + log_must stride_dd -i /dev/urandom -o $mntpnt/manyrm/f$i -b 512 \ + -c $(random 100) -s $(random 4) + done + + log_must zfs snapshot $sendfs/manyrm@snap + log_must zfs clone $sendfs/manyrm@snap $sendfs/manyrm_clone + log_must zfs snapshot $sendfs/manyrm_clone@snap +} + +function setup_incrementals +{ + typeset sendfs=$1 + + typeset mntpnt=$(get_prop mountpoint $sendfs) + typeset bs=$(get_prop recsize $sendfs) + log_must dd if=/dev/urandom of=$mntpnt/f1 bs=$bs count=16 + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=$bs count=32 + log_must mkdir $mntpnt/d1 + log_must eval "cat $mntpnt/f1 $mntpnt/f2 >$mntpnt/d1/f1" + log_must zfs snapshot $sendfs@snap0 + + log_must zfs clone $sendfs@snap0 $POOL/hole + mntpnt=$(get_prop mountpoint $POOL/hole) + log_must dd if=/dev/zero of=$mntpnt/f2 bs=$bs count=16 conv=notrunc + log_must zfs snapshot $POOL/hole@snap + + log_must zfs clone $sendfs@snap0 $POOL/stride3 + mntpnt=$(get_prop mountpoint $POOL/stride3) + log_must stride_dd -i /dev/urandom -o $mntpnt/f2 -b $bs -c 11 -s 3 + log_must zfs snapshot $POOL/stride3@snap + + log_must zfs clone $sendfs@snap0 $POOL/stride5 + mntpnt=$(get_prop mountpoint $POOL/stride5) + log_must stride_dd -i /dev/urandom -o $mntpnt/f2 -b $bs -c 7 -s 5 + log_must zfs snapshot $POOL/stride5@snap + + log_must zfs clone $sendfs@snap0 $POOL/int + log_must zfs snapshot $POOL/int@snap + + log_must zfs clone $POOL/int@snap $POOL/rm + mntpnt=$(get_prop mountpoint $POOL/rm) + log_must rm -rf $mntpnt/[df][12] + log_must zfs snapshot $POOL/rm@snap + + log_must zfs clone $POOL/int@snap $POOL/write + mntpnt=$(get_prop mountpoint $POOL/write) + log_must dd if=/dev/urandom of=$mntpnt/f1 bs=512 count=16 conv=notrunc + 
log_must dd if=/dev/urandom of=$mntpnt/d1/f1 bs=512 count=16 seek=16 \ + conv=notrunc + log_must zfs snapshot $POOL/write@snap +} + +function setup_mounts +{ + typeset sendfs=$1 + + typeset mntpnt=$(get_prop mountpoint $sendfs) + log_must touch $mntpnt/empty + log_must dd if=/dev/urandom of=$mntpnt/contents1 bs=512 count=2 + log_must dd if=/dev/urandom of=$mntpnt/contents2 bs=512 count=2 + log_must mkdir $mntpnt/dir1 + log_must touch $mntpnt/dir1/empty + log_must dd if=/dev/urandom of=$mntpnt/dir1/contents1 bs=512 count=2 + log_must dd if=/dev/urandom of=$mntpnt/dir1/contents2 bs=512 count=2 + log_must mkdir $mntpnt/dir1/dir2 + log_must touch $mntpnt/dir1/dir2/empty + log_must dd if=/dev/urandom of=$mntpnt/dir1/dir2/file bs=512 count=2 + + log_must zfs create -s -V 16p $sendfs/vol + log_must zfs snapshot $sendfs/vol@snap + log_must zfs clone $sendfs/vol@snap $sendfs/vol_clone + log_must zfs snapshot $sendfs/vol_clone@snap +} + +function mount_redacted +{ + typeset flag='' + while getopts "f" opt; do + case $opt in + f) + flag='-f' + ;; + esac + done + shift $(($OPTIND - 1)) + + typeset ds=$1 + echo "1" > /sys/module/zfs/parameters/zfs_allow_redacted_dataset_mount + zfs mount $flag -oro $ds || return 1 + echo "0" > /sys/module/zfs/parameters/zfs_allow_redacted_dataset_mount +} + +function unmount_redacted +{ + typeset ds=$1 + + zfs unmount $ds +} + +# +# This function calls a utility that prints out the ranges where a file +# and its redacted counterpart differ, each range on a new line like this: +# +# 0,131072 +# 1966080,131072 +# 3932160,131072 +# +# The output is then checked against a variable containing the expected +# output to verify the redacted ranges are the ones expected. 
+# +function compare_files +{ + typeset sendfs=$1 + typeset recvfs=$2 + typeset file=$3 + typeset expected="$4" + typeset tmpfile="$tmpdir/get_file.out" + + log_must mount_redacted -f $recvfs + + typeset file1="$(get_prop mountpoint $sendfs)/$file" + typeset file2="$(get_prop mountpoint $recvfs)/$file" + log_note "Comparing $file1 and $file2" + [[ -f $file1 ]] || log_fail "File $file1 does not exist." + [[ -f $file2 ]] || log_fail "File $file2 does not exist." + + log_must eval "get_diff $file1 $file2 >$tmpfile" + typeset range="$(cat $tmpfile)" + log_must unmount_redacted $recvfs + [[ "$expected" = "$range" ]] || log_fail "Unexpected range: $range" +} + +function redacted_cleanup +{ + typeset ds_list=$@ + typeset ds + + # Verify the receiving pool can still be exported and imported. + log_must zpool export $POOL2 + log_must zpool import $POOL2 + + for ds in $ds_list; do + datasetexists $ds && log_must zfs destroy -R $ds + done + + echo "0" > /sys/module/zfs/parameters/zfs_allow_redacted_dataset_mount + rm -f $(get_prop mountpoint $POOL)/tmp/* +} + +# Retrieve the redaction list of a bookmark or snapshot, using +# the property or zdb output, as requested. 
+function get_guid_list +{ + typeset filename=$1 + typeset dataset=$2 + typeset use_zdb=${3:-false} + + if $use_zdb; then + guid_list=$(zdb -vvvv $dataset | sed -e 's/,//g' \ + -ne 's/^.*Snapshots: \[\(.*\)\]/\1/p') + else + guid_list=$(get_prop redact_snaps $dataset) + fi + + for guid in $(echo $guid_list | tr ',' ' '); do + echo $guid + done | sort >$filename +} diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_compressed.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_compressed.ksh new file mode 100755 index 000000000000..0a8bf3903c28 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_compressed.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that compressed send streams are redacted correctly. +# +# Strategy: +# 1. Receive a redacted compressed send stream, verifying compression and +# redaction. +# 2. Receive an incremental on the full receive, verifying compression and +# redaction. 
+# + +typeset ds_name="compressed" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name "-o compress=lz4" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset clone_mnt="$(get_prop mountpoint $clone)" + +log_onexit redacted_cleanup $sendfs $recvfs + +log_must stride_dd -i /dev/urandom -o $clone_mnt/f1 -b $((128 * 1024)) -c 4 -s 2 +log_must zfs snapshot $clone@snap1 +log_must rm $clone_mnt/f2 +log_must zfs snapshot $clone@snap2 + +log_must zfs redact $sendfs@snap book1 $clone@snap1 $clone@snap2 +log_must eval "zfs send -c --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must stream_has_features $stream compressed lz4 redacted +compare_files $sendfs $recvfs "f1" "$RANGE4" +verify_stream_size $stream $sendfs +log_must mount_redacted -f $recvfs +verify_stream_size $stream $recvfs +log_must unmount_redacted $recvfs + +log_must eval "zfs send -c -i $sendfs@snap $clone@snap1 >$stream" +log_must eval "zfs recv $POOL2/inc1 <$stream" +log_must stream_has_features $stream compressed lz4 +typeset mntpnt=$(get_prop mountpoint $POOL2) +log_must diff $clone_mnt/f1 $mntpnt/inc1/f1 +log_must diff $send_mnt/f2 $mntpnt/inc1/f2 + +log_must eval "zfs send -c -i $sendfs@snap $clone@snap2 >$stream" +log_must eval "zfs recv $POOL2/inc2 <$stream" +log_must stream_has_features $stream compressed lz4 +log_must diff $clone_mnt/f1 $mntpnt/inc1/f1 +[[ -f $mntpnt/inc2/f2 ]] && log_fail "File f2 should not exist." + +log_pass "Compressed send streams are redacted correctly." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_contents.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_contents.ksh new file mode 100755 index 000000000000..d05cce947011 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_contents.ksh @@ -0,0 +1,162 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redaction works as expected for various scenarios. +# +# Strategy: +# 1. An unmodified file does not get redacted at all. +# 2. Empty redaction list redacts everything. +# 3. A file removed in the clone redacts the whole file. +# 4. A file moved in the clone does not redact the file. +# 5. A copied, then removed file in the clone redacts the whole file. +# 6. Overwriting a file with identical contents redacts the file. +# 7. A paritally modified block redacts the entire block. +# 8. Only overlapping areas of modified ranges are redacted. +# 9. Send from the root dataset of a pool work correctly. 
+# + +typeset ds_name="contents" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs + +# An unmodified file does not get redacted at all. +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book1 $clone@snap1 +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +log_must diff $send_mnt/f1 $recv_mnt/f1 +log_must diff $send_mnt/f2 $recv_mnt/f2 +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Removing a file in the clone redacts the entire file. +log_must rm "$clone_mnt/f1" +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book3 $clone@snap1 +log_must eval "zfs send --redact book3 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE0" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Moving a file in the clone does not redact the file. +log_must mv "$clone_mnt/f1" "$clone_mnt/f1.moved" +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book4 $clone@snap1 +log_must eval "zfs send --redact book4 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +[[ -f $recv_mnt/f1.moved ]] && log_fail "Found moved file in redacted receive." +log_must diff $send_mnt/f1 $recv_mnt/f1 +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Copying, then removing a file in the clone does redact the file. 
+log_must cp "$clone_mnt/f1" "$clone_mnt/f1.copied" +log_must rm "$clone_mnt/f1" +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book5 $clone@snap1 +log_must eval "zfs send --redact book5 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE0" +log_must mount_redacted -f $recvfs +[[ -f $recv_mnt/f1.copied ]] && log_fail "Found moved file in redacted receive." +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Overwriting the contents of a block with identical contents redacts the file. +log_must cp "$clone_mnt/f1" "$clone_mnt/f1.copied" +log_must cp "$clone_mnt/f1.copied" "$clone_mnt/f1" +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book6 $clone@snap1 +log_must eval "zfs send --redact book6 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE0" +log_must mount_redacted -f $recvfs +[[ -f $recv_mnt/f1.copied ]] && log_fail "Found moved file in redacted receive." +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Modifying some of a block redacts the whole block. +log_must dd if=/dev/urandom of=$clone_mnt/f1 conv=notrunc seek=2 count=1 bs=32k +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book7 $clone@snap1 +log_must eval "zfs send --redact book7 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE1" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Only overlapping areas of modified ranges are redacted. 
+log_must dd if=/dev/urandom of=$clone_mnt/f2 bs=1024k count=3 conv=notrunc +log_must zfs snapshot $clone@snap1 +log_must zfs clone $sendfs@snap $clone/new +typeset mntpnt="$(get_prop mountpoint $clone/new)" +log_must dd if=/dev/urandom of=$mntpnt/f2 bs=1024k seek=1 count=3 \ + conv=notrunc +log_must zfs snapshot $clone/new@snap +log_must zfs redact $sendfs@snap book8 $clone@snap1 $clone/new@snap +log_must eval "zfs send --redact book8 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f2" "$RANGE2" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $clone/new +log_must zfs destroy -R $recvfs + +# FizzBuzz version +log_must zfs clone $sendfs@snap $POOL/stride3 +mntpnt="$(get_prop mountpoint $POOL/stride3)" +log_must stride_dd -i /dev/urandom -o $mntpnt/f2 -b $((128 * 1024)) -c 11 -s 3 +log_must zfs snapshot $POOL/stride3@snap +log_must zfs clone $sendfs@snap $POOL/stride5 +mntpnt="$(get_prop mountpoint $POOL/stride5)" +log_must stride_dd -i /dev/urandom -o $mntpnt/f2 -b $((128 * 1024)) -c 7 -s 5 +log_must zfs snapshot $POOL/stride5@snap +log_must zfs redact $sendfs@snap book8a $POOL/stride3@snap $POOL/stride5@snap +log_must eval "zfs send --redact book8a $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f2" "$RANGE3" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Send from the root dataset of a pool work correctly. 
+log_must dd if=/dev/urandom of=/$POOL/f1 bs=128k count=4 +log_must zfs snapshot $POOL@snap +log_must zfs clone $POOL@snap $POOL/clone +log_must dd if=/dev/urandom of=/$POOL/clone/f1 bs=128k count=1 conv=notrunc +log_must zfs snapshot $POOL/clone@snap +log_must zfs redact $POOL@snap book9 $POOL/clone@snap +log_must eval "zfs send --redact book9 $POOL@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $POOL $recvfs "f1" "$RANGE1" +log_must zfs destroy -R $POOL@snap + +log_pass "Redaction works as expected for various scenarios." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh new file mode 100755 index 000000000000..2eb9e4dc9f0a --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redaction works as expected with respect to deleted files +# +# Strategy: +# 1. A file on the delete queue counts as deleted when using it to calculate +# redaction. +# 2. A file that is removed in the tosnap of an incremental, where the fromsnap +# is a redaction bookmark that contains references to that file, does not +# result in records for that file. 
+# + +typeset ds_name="deleted" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset clone2="$POOL/${ds_name}_clone2" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs + +# +# A file on the delete queue counts as deleted when using it to calculate +# redaction. +# + +# +# Open file descriptor 5 for appending to $clone_mnt/f1 so that it will go on +# the delete queue when we rm it. +# +exec 5>>$clone_mnt/f1 +log_must rm $clone_mnt/f1 +log_must zfs snapshot $clone@snap1 +# Close file descriptor 5 +exec 5>&- +log_must zfs redact $sendfs@snap book1 $clone@snap1 +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +# +# We have temporarily disabled redaction blkptrs, so this will not +# fail as was originally intended. We should uncomment this line +# when we reenable redaction blkptrs. +# +#log_mustnot dd if=$recv_mnt/f1 of=/dev/null bs=512 count=1 +log_must diff $send_mnt/f2 $recv_mnt/f2 +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# +# A file that is removed in the tosnap of an incremental, where the fromsnap +# is a redaction bookmark that contains references to that file, does not +# result in records for that file. 
+# +log_must zfs clone $sendfs@snap $clone2 +typeset clone2_mnt="$(get_prop mountpoint $clone2)" +log_must rm -rf $clone2_mnt/* +log_must zfs snapshot $clone2@snap +log_must zfs redact $sendfs@snap book2 $clone2@snap +log_must zfs destroy -R $clone2 +log_must eval "zfs send --redact book2 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must rm $send_mnt/f1 +log_must zfs snapshot $sendfs@snap2 +log_must zfs clone $sendfs@snap2 $clone2 +typeset clone2_mnt="$(get_prop mountpoint $clone2)" +log_must rm $clone2_mnt/* +log_must zfs snapshot $clone2@snap +log_must zfs redact $sendfs@snap2 book3 $clone2@snap +log_must zfs destroy -R $clone2 +log_must eval "zfs send -i $sendfs#book2 --redact book3 $sendfs@snap2 >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +log_must diff <(ls $send_mnt) <(ls $recv_mnt) +log_must zfs destroy -R $recvfs +log_must zfs rollback -R $sendfs@snap + +log_pass "Verify Redaction works as expected with respect to deleted files." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_disabled_feature.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_disabled_feature.ksh new file mode 100755 index 000000000000..24478f1bc182 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_disabled_feature.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify the functionality of the redaction_bookmarks and redacted_datasets +# features. +# +# Strategy: +# 1. Create a pool with all features disabled. +# 2. Verify redacted send fails. +# 3. Enable redaction_bookmarks and verify redacted sends work. +# 4. Verify receipt of a redacted stream fails. +# 5. Enable redacted_datasets and verify zfs receive works. +# + +typeset ds_name="disabled" +typeset sendfs="$POOL/$ds_name" +typeset sendfs1="$POOL2/${ds_name}1" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset clone1="$POOL2/${ds_name}_clone1" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' + +function cleanup +{ + destroy_pool $POOL2 + create_pool $POOL2 $DISK2 + log_must zfs snapshot $POOL2@init + redacted_cleanup $sendfs $recvfs +} + +log_onexit cleanup + +destroy_pool $POOL2 +log_must zpool create -d $POOL2 $DISK2 + +log_must zfs create $sendfs1 +log_must zfs snapshot $sendfs1@snap +log_must zfs clone $sendfs1@snap $clone1 +log_must zfs snapshot $clone1@snap + +log_mustnot zfs redact $sendfs1@snap book1 $clone1@snap +log_must zpool set feature@redaction_bookmarks=enabled $POOL2 +log_must zfs redact $sendfs1@snap book1 $clone1@snap + +log_must zfs redact $sendfs@snap book1 $clone@snap +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_mustnot eval "zfs recv $recvfs <$stream" +log_must zpool set feature@redacted_datasets=enabled $POOL2 +log_must eval "zfs recv $recvfs <$stream" + +log_pass "The redacted send/recv features work correctly." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_embedded.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_embedded.ksh new file mode 100755 index 000000000000..94937a2f79ab --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_embedded.ksh @@ -0,0 +1,103 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify embedded blocks and redacted send work correctly together. +# +# Strategy: +# 1. Create recsize sized files with embedded blocks from size 512b to 16k. +# 2. Receive a redacted send stream with nothing redacted. +# 3. Verify the received files match the source, contain embedded blocks, and +# that the stream has the redacted and embedded data features. +# 4. Receive a redacted send stream with files 512, 2048 and 8192 redacted. +# 5. Verify that the redacted files no longer match, but the others still +# contain embedded blocks and the stream has the redacted and embedded +# data features. 
+# + +typeset ds_name="embedded" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '-o compress=lz4' setup_embedded +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" +typeset recsize send_obj recv_obj + +log_onexit redacted_cleanup $sendfs $recvfs + +log_must zfs redact $sendfs@snap book1 $clone@snap +log_must eval "zfs send -e --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must stream_has_features $stream redacted embed_data + +log_must mount_redacted -f $recvfs +for recsize in 512 1024 2048 4096 8192 16384; do + send_obj=$(get_objnum $send_mnt/$recsize) + recv_obj=$(get_objnum $recv_mnt/$recsize) + + log_must diff $send_mnt/$recsize $recv_mnt/$recsize + log_must eval "zdb -ddddd $sendfs $send_obj >$tmpdir/send.zdb" + log_must eval "zdb -ddddd $recvfs $recv_obj >$tmpdir/recv.zdb" + + grep -q "EMBEDDED" $tmpdir/send.zdb || \ + log_fail "Obj $send_obj not embedded in $sendfs" + grep -q "EMBEDDED" $tmpdir/recv.zdb || \ + log_fail "Obj $recv_obj not embedded in $recvfs" + + cat $stream | zstreamdump -v | log_must grep -q \ + "WRITE_EMBEDDED object = $send_obj offset = 0" +done + +log_must zfs destroy -R $recvfs +for recsize in 512 2048 8192; do + log_must dd if=/dev/urandom of=$clone_mnt/$recsize bs=$recsize count=1 +done +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book2 $clone@snap1 +log_must eval "zfs send -e --redact book2 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must stream_has_features $stream redacted embed_data + +log_must mount_redacted -f $recvfs +for recsize in 512 2048 8192; do + log_mustnot diff $send_mnt/$recsize $recv_mnt/$recsize +done +for recsize in 1024 4096 16384; do + send_obj=$(get_objnum 
$send_mnt/$recsize) + recv_obj=$(get_objnum $recv_mnt/$recsize) + + log_must diff $send_mnt/$recsize $recv_mnt/$recsize + log_must eval "zdb -ddddd $sendfs $send_obj >$tmpdir/send.zdb" + log_must eval "zdb -ddddd $recvfs $recv_obj >$tmpdir/recv.zdb" + + grep -q "EMBEDDED" $tmpdir/send.zdb || \ + log_fail "Obj $send_obj not embedded in $sendfs" + grep -q "EMBEDDED" $tmpdir/recv.zdb || \ + log_fail "Obj $recv_obj not embedded in $recvfs" + + cat $stream | zstreamdump -v | log_must grep -q \ + "WRITE_EMBEDDED object = $send_obj offset = 0" +done + +log_pass "Embedded blocks and redacted send work correctly together." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_holes.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_holes.ksh new file mode 100755 index 000000000000..47063848fe48 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_holes.ksh @@ -0,0 +1,120 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redacted send streams reliably handle holes. +# +# Strategy: +# 1. Holes written at the beginning and end of a non-sparse file in the +# redacted list are correctly redacted. +# 2. Holes written throughout a non-sparse file in the redacted list are +# correctly redacted. +# 3. Data written into a hole in a sparse file in the redacted list are +# correctly redacted. +# 4. Holes in metadata blocks. 
+# + +typeset ds_name="holes" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' setup_holes +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" +typeset M=$((1024 * 1024)) + +log_onexit redacted_cleanup $sendfs $recvfs + +# Write holes at the start and end of a non-sparse file. +if is_linux; then + log_must dd if=/dev/zero of=$clone_mnt/f1 bs=1M count=1 conv=notrunc + log_must dd if=/dev/zero of=$clone_mnt/f1 bs=1M count=1 conv=notrunc seek=7 +else + log_must mkholes -h 0:$M -h $((7 * M)):$M $clone_mnt/f1 +fi +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book1 $clone@snap1 +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE5" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Write two overlapping sets of holes into the same non-sparse file. +log_must stride_dd -i /dev/zero -o $clone_mnt/f1 -b $((128 * 1024)) -c 8 -s 2 -k 3 +log_must stride_dd -i /dev/zero -o $clone_mnt/f1 -b $((256 * 1024)) -c 8 -s 2 -k 6 +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book2 $clone@snap1 +log_must eval "zfs send --redact book2 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE6" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Write data into the middle of a hole. 
+if is_linux; then + log_must dd if=/dev/urandom of=$clone_mnt/f2 bs=1M count=2 seek=3 \ + conv=notrunc +else + log_must mkholes -d $((3 * M)):$((2 * M)) $clone_mnt/f2 +fi +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book3 $clone@snap1 +log_must eval "zfs send --redact book3 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f2" "$RANGE14" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Remove a file with holes. +log_must rm $clone_mnt/f3 +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book4 $clone@snap1 +log_must eval "zfs send --redact book4 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f3" "$RANGE7" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Create a hole in a L0 metadata block by removing files. +log_must rm $send_mnt/manyrm_clone/f{32..96} +log_must zfs snapshot $sendfs/manyrm_clone@snap1 + +log_must zfs redact $sendfs/manyrm@snap book6 $sendfs/manyrm_clone@snap1 +log_must eval "zfs send --redact book6 $sendfs/manyrm@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +for i in {1..31} {97..256}; do + diff $send_mnt/manyrm/f$i $recv_mnt/f$i || log_fail \ + "File f$i did not match in the send and recv datasets." +done +for i in {32..96}; do + file_size=$(stat -c %s $send_mnt/manyrm/f$i) + redacted_size=$(stat -c %s $recv_mnt/f$i) + [[ $file_size -eq $redacted_size ]] || log_fail \ + "File f$i has size $file_size and redacted size $redacted_size" +done +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +log_pass "Redacted send streams reliably handle holes." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_incrementals.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_incrementals.ksh new file mode 100755 index 000000000000..1d2ed3a687be --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_incrementals.ksh @@ -0,0 +1,152 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that incrementals (redacted and normal) work with redacted datasets. +# +# Strategy: +# 1. Test normal incrementals from the original snap to a subset of the +# redaction list. +# 2. Test receipt of intermediate clones, and their children. +# 3. Test receipt with origin snap specified by '-o origin='. +# 4. Test incrementals from redaction bookmarks. +# + +typeset ds_name="incrementals" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' setup_incrementals +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs $POOL2/rfs + +# Setup a redacted send using a redaction list at varying depth. 
+log_must zfs redact $sendfs@snap0 book1 $POOL/rm@snap $POOL/stride3@snap \ + $POOL/stride5@snap +log_must eval "zfs send --redact book1 $sendfs@snap0 >$stream" +log_must eval "zfs receive $POOL2/rfs <$stream" + +# Verify receipt of normal incrementals to redaction list members. +log_must eval "zfs send -i $sendfs@snap0 $POOL/stride3@snap >$stream" +log_must eval "zfs recv $POOL2/rstride3 <$stream" +log_must diff -r /$POOL/stride3 /$POOL2/rstride3 +log_must eval "zfs send -i $sendfs@snap0 $POOL/stride5@snap >$stream" +log_must eval "zfs recv $POOL2/rstride5 <$stream" +log_must diff -r /$POOL/stride5 /$POOL2/rstride5 + +# But not a normal child that we weren't redacted with respect to. +log_must eval "zfs send -i $sendfs@snap0 $POOL/hole@snap >$stream" +log_mustnot eval "zfs recv $POOL2/rhole@snap <$stream" + +# Verify we can receive an intermediate clone redacted with respect to a +# subset of the original redaction list. +log_must zfs redact $POOL/int@snap book2 $POOL/rm@snap +log_must eval "zfs send -i $sendfs@snap0 --redact book2 $POOL/int@snap >$stream" +log_must eval "zfs recv $POOL2/rint <$stream" +compare_files $POOL/int $POOL2/rint "f1" "$RANGE0" +compare_files $POOL/int $POOL2/rint "f2" "$RANGE15" +compare_files $POOL/int $POOL2/rint "d1/f1" "$RANGE16" +log_must mount_redacted -f $POOL2/rint + +# Verify we can receive grandchildren on the child. +log_must eval "zfs send -i $POOL/int@snap $POOL/rm@snap >$stream" +log_must eval "zfs receive $POOL2/rrm <$stream" +log_must diff -r /$POOL/rm /$POOL2/rrm + +# But not a grandchild that the received child wasn't redacted with respect to. +log_must eval "zfs send -i $POOL/int@snap $POOL/write@snap >$stream" +log_mustnot eval "zfs recv $POOL2/rwrite<$stream" + +# Verify we cannot receive an intermediate clone that isn't redacted with +# respect to a subset of the original redaction list. 
+log_must zfs redact $POOL/int@snap book4 $POOL/rm@snap $POOL/write@snap +log_must eval "zfs send -i $sendfs@snap0 --redact book4 $POOL/int@snap >$stream" +log_mustnot eval "zfs recv $POOL2/rint <$stream" +log_must zfs redact $POOL/int@snap book5 $POOL/write@snap +log_must eval "zfs send -i $sendfs@snap0 --redact book5 $POOL/int@snap >$stream" +log_mustnot eval "zfs recv $POOL2/rint <$stream" +log_mustnot zfs redact $POOL/int@snap book6 $POOL/hole@snap + +# Verify we can receive a full clone of the grandchild on the child. +log_must eval "zfs send $POOL/write@snap >$stream" +log_must eval "zfs recv -o origin=$POOL2/rint@snap $POOL2/rwrite <$stream" +log_must diff -r /$POOL/write /$POOL2/rwrite + +# Along with other origins. +log_must eval "zfs recv -o origin=$POOL2/rfs@snap0 $POOL2/rwrite1 <$stream" +log_must diff -r /$POOL/write /$POOL2/rwrite1 +log_must eval "zfs recv -o origin=$POOL2@init $POOL2/rwrite2 <$stream" +log_must diff -r /$POOL/write /$POOL2/rwrite2 +log_must zfs destroy -R $POOL2/rwrite2 + +log_must zfs destroy -R $POOL2/rfs + +# Write some data for tests of incremental sends from bookmarks +log_must zfs snapshot $sendfs@snap1 +log_must zfs clone $sendfs@snap1 $POOL/hole1 +typeset mntpnt=$(get_prop mountpoint $POOL/hole1) +log_must dd if=/dev/zero of=$mntpnt/f2 bs=128k count=16 conv=notrunc +log_must zfs snapshot $POOL/hole1@snap +log_must zfs clone $sendfs@snap1 $POOL/write1 +mntpnt=$(get_prop mountpoint $POOL/write1) +log_must dd if=/dev/urandom of=$mntpnt/f2 bs=128k count=16 conv=notrunc +log_must zfs snapshot $POOL/write1@snap +log_must zfs clone $POOL/int@snap $POOL/write2 +mntpnt=$(get_prop mountpoint $POOL/write2) +log_must dd if=/dev/urandom of=$mntpnt/f2 bs=128k count=16 conv=notrunc +log_must zfs snapshot $POOL/write2@snap + +# Setup a redacted send using a redaction list at varying depth. 
+log_must zfs redact $sendfs@snap0 book7 $POOL/rm@snap $POOL/stride3@snap \ + $POOL/stride5@snap +log_must eval "zfs send --redact book7 $sendfs@snap0 >$stream" +log_must eval "zfs receive $POOL2/rfs <$stream" + +# Verify we can receive a redacted incremental sending from the bookmark. +log_must zfs redact $sendfs@snap1 book8 $POOL/write1@snap +log_must eval "zfs send -i $sendfs#book7 --redact book8 $sendfs@snap1 >$stream" +log_must eval "zfs receive $POOL2/rfs <$stream" +# The stride3 and stride5 snaps redact 3 128k blocks at block offsets 0 15 and +# 30 of f2. The write1 snap only covers the first two of those three blocks. +compare_files $sendfs $POOL2/rfs "f2" "$RANGE12" +log_must mount_redacted -f $POOL2/rfs +log_must diff $send_mnt/f1 /$POOL2/rfs/f1 +log_must diff $send_mnt/d1/f1 /$POOL2/rfs/d1/f1 +unmount_redacted $POOL2/rfs + +# Verify we can receive a normal child we weren't redacted with respect to by +# sending from the bookmark. +log_must eval "zfs send -i $sendfs#book7 $POOL/hole1@snap >$stream" +log_must eval "zfs recv $POOL2/rhole1 <$stream" +log_must diff -r /$POOL/hole1 /$POOL2/rhole1 + +# Verify we can receive an intermediate clone redacted with respect to a +# non-subset if we send from the bookmark. +log_must zfs redact $POOL/int@snap book9 $POOL/write2@snap +log_must eval "zfs send -i $sendfs#book7 --redact book9 $POOL/int@snap >$stream" +log_must eval "zfs receive $POOL2/rint <$stream" +compare_files $sendfs $POOL2/rint "f2" "$RANGE12" + +log_pass "Incrementals (redacted and normal) work with redacted datasets." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_largeblocks.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_largeblocks.ksh new file mode 100755 index 000000000000..caccdd360061 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_largeblocks.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify large blocks and redacted send work correctly together. +# +# Strategy: +# 1. Create a dataset and clone with a 1m recordsize, modifying a few k +# within the first 1m of a 16m file. +# 2. Verify that the whole first 1m of the file is redacted. +# 3. Receive an incremental stream from the original snap to the snap it +# was redacted with respect to. +# 4. 
Verify that the received dataset matches the clone +# + +typeset ds_name="largeblocks" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '-o recsize=1m' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs + +log_must dd if=/dev/urandom of=$clone_mnt/f1 bs=32k count=3 seek=8 conv=notrunc +log_must zfs snapshot $clone@snap1 + +log_must zfs redact $sendfs@snap book1 $clone@snap1 +log_must eval "zfs send -L --redact book1 $sendfs@snap >$stream" +log_must stream_has_features $stream redacted large_blocks +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE11" +log_must mount_redacted -f $recvfs +log_must diff $send_mnt/f2 $recv_mnt/f2 +unmount_redacted $recvfs + +log_must eval "zfs send -L -i $sendfs@snap $clone@snap1 >$stream" +log_must stream_has_features $stream large_blocks +log_must eval "zfs recv $recvfs/new <$stream" +log_must diff -r $clone_mnt $recv_mnt/new + +log_pass "Large blocks and redacted send work correctly together." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh new file mode 100755 index 000000000000..3386643b295e --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redacted send can deal with a large redaction list. +# +# Strategy: +# 1. Create 64 clones of sendfs each of which modifies two blocks in a file. +# The first modification is at an offset unique to each clone, and the +# second (the last block in the file) is common to them all. +# 2. Verify a redacted stream with a reasonable redaction list length can +# be correctly processed. +# 3. Verify that if the list is too long, the send fails gracefully. +# + +typeset ds_name="many_clones" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" +typeset redaction_list='' +typeset mntpnt + +log_onexit redacted_cleanup $sendfs $recvfs + +# Fill in both the last block, and a different block in every clone. +for i in {1..64}; do + log_must zfs clone $sendfs@snap ${clone}$i + mntpnt=$(get_prop mountpoint ${clone}$i) + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=64k count=1 seek=$i \ + conv=notrunc + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=64k count=1 seek=63 \ + conv=notrunc + log_must zfs snapshot ${clone}$i@snap +done + +# The limit isn't necessarily 32 snapshots. The maximum number of snapshots in +# the redacted list is determined in dsl_bookmark_create_redacted_check(). 
+log_must zfs redact $sendfs@snap book1 $clone{1..32}@snap +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f2" "$RANGE8" + +log_mustnot zfs redact $sendfs@snap book2 $clone{1..64}@snap + +log_pass "Redacted send can deal with a large redaction list." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_mixed_recsize.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_mixed_recsize.ksh new file mode 100755 index 000000000000..e1cd09e17d59 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_mixed_recsize.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redacted send works with datasets of different sizes. +# +# Strategy: +# 1. Create two dataset one with recsize 512, and one 1m and create a 2m file. +# 2. For each dataset, create clones of both 512 and 1m recsize and modify +# the first 16k of the file. +# 3. Send each original dataset, redacted with respect to each of the clones +# into both a dataset inheriting a 512 recsize and a 1m one. +# 4. Verify that the smallest unit of redaction is that of the origin fs. +# + +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +typeset mntpnt + +log_onexit redacted_cleanup $POOL/512 $POOL/1m $POOL2/512 $POOL2/1m + +# Set up the datasets we'll send and redact from. 
+log_must zfs create -o recsize=512 $POOL/512 +mntpnt=$(get_prop mountpoint $POOL/512) +log_must dd if=/dev/urandom of=$mntpnt/f1 bs=1024k count=2 +log_must zfs snapshot $POOL/512@snap +log_must zfs clone -o recsize=1m $POOL/512@snap $POOL/1mclone +mntpnt=$(get_prop mountpoint $POOL/1mclone) +log_must dd if=/dev/urandom of=$mntpnt/f1 bs=512 count=32 conv=notrunc +log_must zfs snapshot $POOL/1mclone@snap + +log_must zfs create -o recsize=1m $POOL/1m +mntpnt=$(get_prop mountpoint $POOL/1m) +log_must dd if=/dev/urandom of=$mntpnt/f1 bs=1024k count=2 +log_must zfs snapshot $POOL/1m@snap +log_must zfs clone -o recsize=512 $POOL/1m@snap $POOL/512clone +mntpnt=$(get_prop mountpoint $POOL/512clone) +log_must dd if=/dev/urandom of=$mntpnt/f1 bs=512 count=32 conv=notrunc +log_must zfs snapshot $POOL/512clone@snap + +# Create datasets that allow received datasets to inherit recordsize. +log_must zfs create -o recsize=512 $POOL2/512 +log_must zfs create -o recsize=1m $POOL2/1m + +# Do the sends and verify the contents. +log_must zfs redact $POOL/512@snap book1 $POOL/1mclone@snap +log_must eval "zfs send --redact book1 $POOL/512@snap>$stream" +log_must eval "zfs recv $POOL2/512/recva <$stream" +compare_files $POOL/512 $POOL2/512/recva "f1" "$RANGE13" +log_must eval "zfs recv $POOL2/1m/recvb <$stream" +compare_files $POOL/512 $POOL2/1m/recvb "f1" "$RANGE13" + +log_must zfs redact $POOL/1m@snap book2 $POOL/512clone@snap +log_must eval "zfs send --redact book2 $POOL/1m@snap >$stream" +log_must eval "zfs recv $POOL2/512/recvc <$stream" +compare_files $POOL/1m $POOL2/512/recvc "f1" "$RANGE11" +log_must eval "zfs recv $POOL2/1m/recvd <$stream" +compare_files $POOL/1m $POOL2/1m/recvd "f1" "$RANGE11" + +log_pass "Redaction works correctly with different recordsizes." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh new file mode 100755 index 000000000000..b52075133623 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh @@ -0,0 +1,109 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that received redacted datasets are not mounted by default, but +# can still be mounted after setting zfs_allow_redacted_dataset_mount. +# +# Strategy: +# 1. Verify a received redacted stream isn't mounted by default. +# 2. Set zfs_allow_redacted_dataset_mount and verify it can't be mounted +# without the -f flag, but can with -f. +# 3. Receive a redacted volume. +# 4. Verify the device file isn't present until the kernel variable is set. +# 5. Verify the files in the send fs are also present in the recv fs. 
+#
+
+typeset ds_name="mounts"
+typeset sendfs="$POOL/$ds_name"
+typeset sendvol="$sendfs/vol"
+typeset recvfs="$POOL2/$ds_name"
+typeset recvvol="$POOL2/vol"
+typeset clone="$POOL/${ds_name}_clone"
+typeset clonevol="${sendvol}_clone"
+typeset tmpdir="$(get_prop mountpoint $POOL)/tmp"
+typeset stream=$(mktemp $tmpdir/stream.XXXX)
+setup_dataset $ds_name '' setup_mounts
+typeset clone_mnt="$(get_prop mountpoint $clone)"
+typeset send_mnt="$(get_prop mountpoint $sendfs)"
+typeset recv_mnt="/$POOL2/$ds_name"
+typeset recv_vol_file="/dev/zvol/$recvvol"
+
+log_onexit redacted_cleanup $sendfs $recvfs $recvvol
+
+log_must rm $clone_mnt/empty $clone_mnt/contents1
+log_must dd if=/dev/urandom of=$clone_mnt/contents2 bs=512 count=1 conv=notrunc
+log_must rm $clone_mnt/dir1/contents1
+log_must rm -rf $clone_mnt/dir1/dir2
+log_must dd if=/dev/urandom of=$clone_mnt/dir1/contents2 bs=512 count=1 \
+    conv=notrunc
+log_must dd if=/dev/urandom of=$clone_mnt/dir1/empty bs=512 count=1
+log_must zfs snapshot $clone@snap1
+
+# Redact with respect to the modified clone snapshot ($clone@snap1), not the
+# pristine $clone@snap, so the modifications above are what get redacted.
+log_must zfs redact $sendfs@snap book1 $clone@snap1
+log_must eval "zfs send --redact book1 $sendfs@snap >$stream"
+log_must eval "zfs receive $recvfs <$stream"
+log_mustnot ismounted $recvfs
+log_mustnot mount_redacted $recvfs
+log_mustnot ismounted $recvfs
+log_must mount_redacted -f $recvfs
+log_must ismounted $recvfs
+
+# Verify that the send and recv fs both have the same files under their
+# mountpoints by comparing find output with the name of the mountpoint
+# deleted.
+contents=$(log_must find $recv_mnt)
+contents_orig=$(log_must find $send_mnt)
+log_must diff <(echo ${contents//$recv_mnt/}) \
+    <(echo ${contents_orig//$send_mnt/})
+log_must zfs redact $sendvol@snap book2 $clonevol@snap
+log_must eval "zfs send --redact book2 $sendvol@snap >$stream"
+log_must eval "zfs receive $recvvol <$stream"
+[[ -b $recv_vol_file ]] && log_fail "Volume device file should not exist."
+echo "1" > /sys/module/zfs/parameters/zfs_allow_redacted_dataset_mount +log_must zpool export $POOL2 +log_must zpool import $POOL2 +udevadm settle + +# The device file isn't guaranteed to show up right away. +if [[ ! -b $recv_vol_file ]]; then + udevadm settle + for t in 10 5 3 2 1; do + log_note "Polling $t seconds for device file." + udevadm settle + sleep $t + [[ -b $recv_vol_file ]] && break + done +fi +[[ -b $recv_vol_file ]] || log_fail "Volume device file should exist." + +log_must dd if=/dev/urandom of=$send_mnt/dir1/contents1 bs=512 count=2 +log_must rm $send_mnt/dir1/dir2/empty +log_must zfs snapshot $sendfs@snap2 +log_must eval "zfs send -i $sendfs#book1 $sendfs@snap2 >$stream" +log_must eval "zfs receive $recvfs <$stream" +log_must mount_redacted -f $recvfs +log_must ismounted $recvfs +contents=$(log_must find $recv_mnt) +contents_orig=$(log_must find $send_mnt) +log_must diff <(echo ${contents//$recv_mnt/}) \ + <(echo ${contents_orig//$send_mnt/}) + +log_pass "Received redacted streams can be mounted." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_negative.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_negative.ksh new file mode 100755 index 000000000000..e27eb601e290 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_negative.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Test that redacted send correctly detects invalid arguments. 
+#
+
+typeset sendfs="$POOL2/sendfs"
+typeset recvfs="$POOL2/recvfs"
+typeset clone1="$POOL2/clone1"
+typeset clone2="$POOL2/clone2"
+typeset clone3="$POOL2/clone3"
+typeset clone4="$POOL2/clone4"
+typeset tmpdir="$(get_prop mountpoint $POOL)/tmp"
+typeset stream=$(mktemp $tmpdir/stream.XXXX)
+
+log_onexit redacted_cleanup $sendfs $recvfs $clone3
+
+log_must zfs create $sendfs
+log_must zfs snapshot $sendfs@snap1
+log_must zfs snapshot $sendfs@snap2
+log_must zfs snapshot $sendfs@snap3
+log_must zfs clone $sendfs@snap2 $clone1
+log_must zfs snapshot $clone1@snap
+log_must zfs bookmark $clone1@snap $clone1#book
+log_must zfs clone $sendfs@snap2 $clone2
+log_must zfs snapshot $clone2@snap
+
+# Incompatible flags
+log_must zfs redact $sendfs@snap2 book $clone1@snap
+log_mustnot eval "zfs send -R --redact book $sendfs@snap2 >/dev/null"
+
+typeset arg
+for arg in "$sendfs" "$clone1#book"; do
+	log_mustnot eval "zfs send --redact book $arg >/dev/null"
+done
+
+# Bad redaction list arguments
+log_mustnot zfs redact $sendfs@snap1
+log_mustnot zfs redact $sendfs@snap1 book
+log_mustnot zfs redact $sendfs#book1 book4 $clone1
+log_mustnot eval "zfs send --redact $sendfs#book $sendfs@snap >/dev/null"
+
+# Redaction snapshots not a descendant of tosnap
+log_mustnot zfs redact $sendfs@snap2 book $sendfs@snap2
+log_must zfs redact $sendfs@snap2 book2 $clone1@snap $clone2@snap
+log_must eval "zfs send --redact book2 $sendfs@snap2 >$stream"
+log_must zfs redact $sendfs@snap2 book3 $clone1@snap $clone2@snap
+log_must eval "zfs send -i $sendfs@snap1 --redact book3 $sendfs@snap2 \
+	>/dev/null"
+log_mustnot zfs redact $sendfs@snap3 $sendfs@snap3 $clone1@snap
+
+# Full redacted sends of redacted datasets are not allowed.
+log_must eval "zfs recv $recvfs <$stream"
+log_must zfs snapshot $recvfs@snap
+log_must zfs clone $recvfs@snap $clone3
+log_must zfs snapshot $clone3@snap
+log_mustnot zfs redact $recvfs@snap book5 $clone3@snap
+
+# Nor may a redacted dataset appear in the redaction list.
+log_mustnot zfs redact $recvfs@snap2 book7 $recvfs@snap
+
+log_pass "Verify that redacted send correctly detects invalid arguments."
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_origin.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_origin.ksh
new file mode 100755
index 000000000000..74e5914f2d88
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_origin.ksh
@@ -0,0 +1,87 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib
+
+#
+# Description:
+# Test that receiving sends from redaction bookmarks and redacted datasets
+# works correctly in certain edge cases.
+# 1. Send A(B,C,D) to pool2.
+# 2. Verify send from A(B, C, D) can be received onto it.
+# 3. Verify send from A(B, C) can be received onto it.
+# 4. Verify send from A() can be received onto it.
+# 5. Verify send from A(E) cannot be received onto it.
+# 6. Verify send from redaction bookmark for A(B, C) can be received onto it.
+# 7. Verify send from redaction bookmark for A() can be received onto it.
+# 8. Verify send from redaction bookmark for A(E) cannot be received onto it.
+# + +typeset ds_name="origin" +typeset sendfs="$POOL/$ds_name" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' setup_incrementals +typeset dsA=$sendfs@snap0 +typeset dsB=$POOL/hole@snap +typeset dsC=$POOL/rm@snap +typeset dsD=$POOL/write@snap +typeset dsE=$POOL/stride3@snap +typeset dsF=$POOL/stride5@snap +typeset targ=$POOL2/targfs@snap + +log_onexit redacted_cleanup $sendfs $POOL2/rBCD $POOL2/targfs \ + $POOL2/rBC $POOL2/rE + +# Set up all the filesystems and clones. +log_must zfs redact $dsA BCD $dsB $dsC $dsD +log_must eval "zfs send --redact BCD $dsA >$stream" +log_must eval "zfs receive $POOL2/rBCD <$stream" +log_must eval "zfs receive $targ <$stream" + +log_must zfs redact $dsA BC $dsB $dsC +log_must eval "zfs send --redact BC $dsA >$stream" +log_must eval "zfs receive $POOL2/rBC <$stream" + +log_must zfs redact $dsA E $dsE +log_must eval "zfs send --redact E $dsA >$stream" +log_must eval "zfs receive $POOL2/rE <$stream" + +log_must eval "zfs send $dsF >$stream" +log_must eval "zfs receive -o origin=$POOL2/rBCD@snap0 $POOL2/BCDrF <$stream" +log_must eval "zfs receive -o origin=$POOL2/rBC@snap0 $POOL2/BCrF <$stream" +log_must eval "zfs receive -o origin=$POOL2/rE@snap0 $POOL2/ErF <$stream" + +# Run tests from redacted datasets. +log_must eval "zfs send -i $POOL2/rBCD@snap0 $POOL2/BCDrF@snap >$stream" +log_must eval "zfs receive -o origin=$targ $POOL2/tdBCD <$stream" + +log_must eval "zfs send -i $POOL2/rBC@snap0 $POOL2/BCrF@snap >$stream" +log_must eval "zfs receive -o origin=$targ $POOL2/tdBC <$stream" + +log_must eval "zfs send -i $POOL2/rE@snap0 $POOL2/ErF@snap >$stream" +log_mustnot eval "zfs receive -o origin=$targ $POOL2/tdE <$stream" + +# Run tests from redaction bookmarks. 
+log_must eval "zfs send -i $sendfs#BC $dsF >$stream"
+log_must eval "zfs receive -o origin=$targ $POOL2/tbBC <$stream"
+
+log_must eval "zfs send -i $sendfs#E $dsF >$stream"
+log_mustnot eval "zfs receive -o origin=$targ $POOL2/tbE <$stream"
+
+log_pass "Verify sends from redacted datasets and bookmarks work correctly."
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_props.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_props.ksh
new file mode 100755
index 000000000000..e4163c4ef8da
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_props.ksh
@@ -0,0 +1,77 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib
+
+#
+# Description:
+# Verify the list of redacted snapshot guids as properties.
+#
+# Strategy:
+# 1. Create a redacted dataset and receive it into another pool.
+# 2. Verify that the redaction list in the bookmark (according to zdb)
+#    matches the list shown in the redact_snaps property.
+# 3. Verify that the received snapshot has a matching redaction list.
+#
+
+typeset ds_name="props"
+typeset sendfs="$POOL/$ds_name"
+typeset recvfs="$POOL2/$ds_name"
+typeset clone="$POOL/${ds_name}_clone"
+typeset tmpdir="$(get_prop mountpoint $POOL)/tmp"
+typeset stream=$(mktemp $tmpdir/stream.XXXX)
+setup_dataset $ds_name ''
+typeset mntpnt
+
+log_onexit redacted_cleanup $sendfs $recvfs
+
+# Verify a plain dataset, snapshot or bookmark has an empty list.
+log_must zfs snapshot $sendfs@empty_snapshot +log_must zfs bookmark $sendfs@empty_snapshot $sendfs#empty_bookmark +found_list=$(get_prop redact_snaps $sendfs) +[[ $found_list = "-" ]] || log_fail "Unexpected dataset list: $found_list" +found_list=$(get_prop redact_snaps $sendfs@empty_snapshot) +[[ $found_list = "-" ]] || log_fail "Unexpected snapshot list: $found_list" +found_list=$(get_prop redact_snaps $sendfs#empty_bookmark) +[[ $found_list = "-" ]] || log_fail "Unexpected bookmark list: $found_list" + +# Fill in a different block in every clone. +for i in {1..16}; do + log_must zfs clone $sendfs@snap ${clone}$i + mntpnt=$(get_prop mountpoint ${clone}$i) + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=64k count=1 seek=$i \ + conv=notrunc + log_must zfs snapshot ${clone}$i@snap +done + +log_must zfs redact $sendfs@snap book1 $clone{1..16}@snap +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" + +get_guid_list $tmpdir/prop_list $sendfs#book1 +get_guid_list $tmpdir/zdb_list $sendfs#book1 true +get_guid_list $tmpdir/recvd_prop_list $recvfs@snap + +count=$(wc -l $tmpdir/prop_list | awk '{print $1}') +[[ $count -eq 16 ]] || log_fail "Found incorrect number of redaction snapshots." + +diff $tmpdir/prop_list $tmpdir/zdb_list || \ + log_fail "Property list differed from zdb output" +diff $tmpdir/prop_list $tmpdir/recvd_prop_list || \ + log_fail "Received property list differed from sent" + +log_pass "The redaction list is consistent between sent and received datasets." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_resume.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_resume.ksh new file mode 100755 index 000000000000..3766df1f80cd --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_resume.ksh @@ -0,0 +1,87 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that resumable send works correctly with redacted streams. +# +# Strategy: +# 1. Do a full redacted resumable send. +# 2. Verify the received contents are correct. +# 3. Do an incremental redacted resumable send. +# 4. Verify the received contents are correct. +# 5. Verify that recv -A removes a partially received dataset. 
+# + +typeset ds_name="resume" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset clone1="$POOL/${ds_name}_clone1" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs + +log_must stride_dd -i /dev/urandom -o $clone_mnt/f2 -b 512 -c 64 -s 512 +log_must zfs snapshot $clone@snap1 + +# Do the full resumable send +log_must zfs redact $sendfs@snap book1 $clone@snap1 +resume_test "zfs send --redact book1 $sendfs@snap" $tmpdir $recvfs +log_must mount_redacted -f $recvfs +echo "1" > /sys/module/zfs/parameters/zfs_allow_redacted_dataset_mount +log_must diff $send_mnt/f1 $recv_mnt/f1 +log_must eval "get_diff $send_mnt/f2 $recv_mnt/f2 >$tmpdir/get_diff.out" +typeset range=$(cat $tmpdir/get_diff.out) +[[ "$RANGE9" = "$range" ]] || log_fail "Unexpected range: $range" + +log_must dd if=/dev/urandom of=$send_mnt/f3 bs=1024k count=3 +log_must zfs snapshot $sendfs@snap2 +log_must zfs clone $sendfs@snap2 $clone1 +typeset clone1_mnt="$(get_prop mountpoint $clone1)" +log_must dd if=/dev/urandom of=$clone1_mnt/f3 bs=128k count=3 conv=notrunc +log_must zfs snapshot $clone1@snap + +# Do the incremental resumable send +log_must zfs redact $sendfs@snap2 book2 $clone1@snap +resume_test "zfs send --redact book2 -i $sendfs#book1 $sendfs@snap2" \ + $tmpdir $recvfs +log_must diff $send_mnt/f1 $recv_mnt/f1 +log_must diff $send_mnt/f2 $recv_mnt/f2 +log_must eval "get_diff $send_mnt/f3 $recv_mnt/f3 >$tmpdir/get_diff.out" +range=$(cat $tmpdir/get_diff.out) +[[ "$RANGE10" = "$range" ]] || log_fail "Unexpected range: $range" + +# Test recv -A works properly +log_mustnot zfs recv -A $recvfs +log_must zfs destroy -R $recvfs +log_mustnot zfs recv -A $recvfs +log_must eval "zfs send 
--redact book1 $sendfs@snap >$stream" +dd if=$stream bs=64k count=1 | log_mustnot zfs receive -s $recvfs +[[ "-" = $(get_prop receive_resume_token $recvfs) ]] && \ + log_fail "Receive token not found." +log_must zfs recv -A $recvfs +log_must datasetnonexists $recvfs + +log_pass "Resumable send works correctly with redacted streams." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_size.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_size.ksh new file mode 100755 index 000000000000..81e7fe31d163 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_size.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that send size estimates of redacted sends work correctly +# +# Strategy: +# 1. Perform a redacted send with -nv and without, and verify the +# size estimate is the same as the size of the actual send. +# 2. Receive an incremental send from the redaction bookmark with +# -nv and without, and verify the size estimate is the same as +# the size of the actual send. 
+#
+
+typeset ds_name="sizes"
+typeset sendfs="$POOL/$ds_name"
+typeset clone="$POOL/${ds_name}_clone2"
+setup_dataset $ds_name "-o compress=lz4"
+typeset tmpdir="$(get_prop mountpoint $POOL)/tmp"
+typeset size=$(mktemp $tmpdir/size.XXXX)
+typeset size2=$(mktemp $tmpdir/size.XXXX)
+
+log_onexit redacted_cleanup $sendfs $clone
+log_must zfs clone $sendfs@snap $clone
+typeset clone_mnt="$(get_prop mountpoint $clone)"
+log_must rm -rf $clone_mnt/*
+log_must zfs snapshot $clone@snap
+log_must zfs redact $sendfs@snap book $clone@snap
+log_must eval "zfs send -nvP --redact book $sendfs@snap | \
+	grep '^size' | awk '{print \$2}' >$size"
+log_must eval "zfs send --redact book $sendfs@snap | wc -c \
+	>$size2"
+bytes1=$(cat $size | tr -d '[[:space:]]')
+bytes2=$(cat $size2 | tr -d '[[:space:]]')
+[[ "$bytes1" -eq "$bytes2" ]] || \
+	log_fail "Full sizes differ: estimate $bytes1 and actual $bytes2"
+
+log_must zfs snapshot $sendfs@snap2
+log_must eval "zfs send -nvP -i $sendfs#book $sendfs@snap2 | \
+	grep '^size' | awk '{print \$2}' >$size"
+log_must eval "zfs send -i $sendfs#book $sendfs@snap2 | wc -c >$size2"
+bytes1=$(cat $size | tr -d '[[:space:]]')
+bytes2=$(cat $size2 | tr -d '[[:space:]]')
+[[ "$bytes1" -eq "$bytes2" ]] || \
+	log_fail "Incremental sizes differ: estimate $bytes1 and actual $bytes2"
+
+log_pass "Size estimates of redacted sends estimate accurately."
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_volume.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_volume.ksh
new file mode 100755
index 000000000000..90f3890f241a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_volume.ksh
@@ -0,0 +1,105 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that redacted send works on volumes. +# +# Strategy: +# 1. Write to a volume, then make a clone of that volume. +# 2. Receive a redacted stream that sends all blocks. +# 3. Receive a redacted stream that redacts the first half of the written area. +# + +typeset ds_name="volume" +typeset sendvol="$POOL/$ds_name" +typeset recvvol="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +typeset send_file="/dev/zvol/$sendvol" +typeset recv_file="/dev/zvol/$recvvol" +typeset clone_file="/dev/zvol/$clone" + +log_onexit redacted_cleanup $sendvol $recvvol + +log_must zfs create -b 8k -V 1g $sendvol +sleep 10 +log_must zpool export $POOL +log_must zpool import $POOL +udevadm settle +if [[ ! -b $send_file ]]; then + udevadm settle + for t in 10 5 3 2 1; do + log_note "Polling $t seconds for device file." + udevadm settle + sleep $t + [[ -b $send_file ]] && break + done +fi +log_must dd if=/dev/urandom of=$send_file bs=8k count=64 +log_must zfs snapshot $sendvol@snap +log_must zfs clone $sendvol@snap $clone +log_must zfs snapshot $clone@snap + +echo "1" > /sys/module/zfs/parameters/zfs_allow_redacted_dataset_mount +log_must zfs redact $sendvol@snap book1 $clone@snap +log_must eval "zfs send --redact book1 $sendvol@snap >$stream" +log_must eval "zfs recv $recvvol <$stream" +sleep 10 +log_must zpool export $POOL2 +log_must zpool import $POOL2 +udevadm settle +if [[ ! -b $recv_file ]]; then + udevadm settle + for t in 10 5 3 2 1; do + log_note "Polling $t seconds for device file." 
+ udevadm settle + sleep $t + [[ -b $recv_file ]] && break + done +fi +log_must dd if=$send_file of=$tmpdir/send.dd bs=8k count=64 +log_must dd if=$recv_file of=$tmpdir/recv.dd bs=8k count=64 +log_must diff $tmpdir/send.dd $tmpdir/recv.dd +log_must zfs destroy -R $recvvol + +log_must dd if=/dev/urandom of=$clone_file bs=8k count=32 +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendvol@snap book2 $clone@snap1 +log_must eval "zfs send --redact book2 $sendvol@snap >$stream" +log_must eval "zfs recv $recvvol <$stream" +sleep 10 +log_must zpool export $POOL2 +log_must zpool import $POOL2 +udevadm settle +if [[ ! -b $recv_file ]]; then + udevadm settle + for t in 10 5 3 2 1; do + log_note "Polling $t seconds for device file." + udevadm settle + sleep $t + [[ -b $recv_file ]] && break + done +fi +log_must dd if=$send_file of=$tmpdir/send.dd bs=8k count=32 skip=32 +log_must dd if=$recv_file of=$tmpdir/recv.dd bs=8k count=32 skip=32 +log_must diff $tmpdir/send.dd $tmpdir/recv.dd + +log_pass "Redacted send works correctly with volumes." diff --git a/tests/zfs-tests/tests/functional/redacted_send/setup.ksh b/tests/zfs-tests/tests/functional/redacted_send/setup.ksh new file mode 100755 index 000000000000..3f537f813db0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/setup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +verify_disk_count "$DISKS" 2 + +create_pool $POOL $DISK1 +log_must zfs snapshot $POOL@init +create_pool $POOL2 $DISK2 +log_must zfs snapshot $POOL2@init +log_must zfs create $POOL/tmp +log_pass diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib index 72d2eb93d442..1d2ef8c3ed83 100644 --- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib +++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib @@ -25,7 +25,7 @@ # # -# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2013, 2018 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -525,7 +525,7 @@ function resume_test stream_num=$((stream_num+1)) token=$(zfs get -Hp -o value receive_resume_token $recvfs) - log_must eval "zfs send -v -t $token >/$streamfs/$stream_num" + log_must eval "zfs send -t $token >/$streamfs/$stream_num" [[ -f /$streamfs/$stream_num ]] || \ log_fail "NO FILE /$streamfs/$stream_num" done diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh new file mode 100644 index 000000000000..4610802e9984 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh @@ -0,0 +1,33 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014, 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that error conditions don't cause panics in zfs send
+#
+# Strategy:
+# 1. Perform a zfs incremental send from a bookmark that doesn't exist
+#
+
+verify_runnable "both"
+
+log_neg eval "zfs send -i \#bla $POOL/$FS@final > /dev/null"
+
+log_pass "Ensure that error conditions cause appropriate failures."