diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index 8be08558b36d..730e6e1a040b 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, rto_opts.rto_ashift, ncols+1, ncols, - fn+1, rto_opts.rto_expand_offset); + fn+1, rto_opts.rto_expand_offset, + 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, fn+1); @@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, BENCH_ASHIFT, ncols+1, ncols, - PARITY_PQR, rto_opts.rto_expand_offset); + PARITY_PQR, + rto_opts.rto_expand_offset, 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, PARITY_PQR); diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 195026d3a7ab..6a018ecf0737 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -327,14 +327,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) if (opts->rto_expand) { opts->rm_golden = - vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, - opts->zio_golden->io_size, opts->zio_golden->io_offset, + vdev_raidz_map_alloc_expanded(opts->zio_golden, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); - rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, - zio_test->io_size, zio_test->io_offset, + parity, opts->rto_expand_offset, 0, B_FALSE); + rm_test = vdev_raidz_map_alloc_expanded(zio_test, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, opts->rto_ashift, total_ncols, 
parity); @@ -361,187 +359,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) return (err); } -/* - * If reflow is not in progress, reflow_offset should be UINT64_MAX. - * For each row, if the row is entirely before reflow_offset, it will - * come from the new location. Otherwise this row will come from the - * old location. Therefore, rows that straddle the reflow_offset will - * come from the old location. - * - * NOTE: Until raidz expansion is implemented this function is only - * needed by raidz_test.c to the multi-row raid_map_t functionality. - */ -raidz_map_t * -vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, - uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, - uint64_t nparity, uint64_t reflow_offset) -{ - /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = size >> ashift; - uint64_t q, r, bc, devidx, asize = 0, tot; - - /* - * "Quotient": The number of data sectors for this stripe on all but - * the "big column" child vdevs that also contain "remainder" data. - * AKA "full rows" - */ - q = s / (logical_cols - nparity); - - /* - * "Remainder": The number of partial stripe data sectors in this I/O. - * This will add a sector to some, but not all, child vdevs. - */ - r = s - q * (logical_cols - nparity); - - /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); - - /* - * The total number of data and parity sectors associated with - * this I/O. - */ - tot = s + nparity * (q + (r == 0 ? 
0 : 1)); - - /* How many rows contain data (not skip) */ - uint64_t rows = howmany(tot, logical_cols); - int cols = MIN(tot, logical_cols); - - raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), - KM_SLEEP); - rm->rm_nrows = rows; - - for (uint64_t row = 0; row < rows; row++) { - raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, - rr_col[cols]), KM_SLEEP); - rm->rm_row[row] = rr; - - /* The starting RAIDZ (parent) vdev sector of the row. */ - uint64_t b = (offset >> ashift) + row * logical_cols; - - /* - * If we are in the middle of a reflow, and any part of this - * row has not been copied, then use the old location of - * this row. - */ - int row_phys_cols = physical_cols; - if (b + (logical_cols - nparity) > reflow_offset >> ashift) - row_phys_cols--; - - /* starting child of this row */ - uint64_t child_id = b % row_phys_cols; - /* The starting byte offset on each child vdev. */ - uint64_t child_offset = (b / row_phys_cols) << ashift; - - /* - * We set cols to the entire width of the block, even - * if this row is shorter. This is needed because parity - * generation (for Q and R) needs to know the entire width, - * because it treats the short row as though it was - * full-width (and the "phantom" sectors were zero-filled). - * - * Another approach to this would be to set cols shorter - * (to just the number of columns that we might do i/o to) - * and have another mechanism to tell the parity generation - * about the "entire width". Reconstruction (at least - * vdev_raidz_reconstruct_general()) would also need to - * know about the "entire width". 
- */ - rr->rr_cols = cols; - rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; - rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; - - for (int c = 0; c < rr->rr_cols; c++, child_id++) { - if (child_id >= row_phys_cols) { - child_id -= row_phys_cols; - child_offset += 1ULL << ashift; - } - rr->rr_col[c].rc_devidx = child_id; - rr->rr_col[c].rc_offset = child_offset; - rr->rr_col[c].rc_orig_data = NULL; - rr->rr_col[c].rc_error = 0; - rr->rr_col[c].rc_tried = 0; - rr->rr_col[c].rc_skipped = 0; - rr->rr_col[c].rc_need_orig_restore = B_FALSE; - - uint64_t dc = c - rr->rr_firstdatacol; - if (c < rr->rr_firstdatacol) { - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = - abd_alloc_linear(rr->rr_col[c].rc_size, - B_TRUE); - } else if (row == rows - 1 && bc != 0 && c >= bc) { - /* - * Past the end, this for parity generation. - */ - rr->rr_col[c].rc_size = 0; - rr->rr_col[c].rc_abd = NULL; - } else { - /* - * "data column" (col excluding parity) - * Add an ASCII art diagram here - */ - uint64_t off; - - if (c < bc || r == 0) { - off = dc * rows + row; - } else { - off = r * rows + - (dc - r) * (rows - 1) + row; - } - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = abd_get_offset_struct( - &rr->rr_col[c].rc_abdstruct, - abd, off << ashift, 1 << ashift); - } - - asize += rr->rr_col[c].rc_size; - } - /* - * If all data stored spans all columns, there's a danger that - * parity will always be on the same device and, since parity - * isn't read during normal operation, that that device's I/O - * bandwidth won't be used effectively. We therefore switch - * the parity every 1MB. - * - * ...at least that was, ostensibly, the theory. As a practical - * matter unless we juggle the parity between all devices - * evenly, we won't see any benefit. 
Further, occasional writes - * that aren't a multiple of the LCM of the number of children - * and the minimum stripe width are sufficient to avoid pessimal - * behavior. Unfortunately, this decision created an implicit - * on-disk format requirement that we need to support for all - * eternity, but only for single-parity RAID-Z. - * - * If we intend to skip a sector in the zeroth column for - * padding we must make sure to note this swap. We will never - * intend to skip the first column since at least one data and - * one parity column must appear in each row. - */ - if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && - (offset & (1ULL << 20))) { - ASSERT(rr->rr_cols >= 2); - ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - devidx = rr->rr_col[0].rc_devidx; - uint64_t o = rr->rr_col[0].rc_offset; - rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; - rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; - rr->rr_col[1].rc_devidx = devidx; - rr->rr_col[1].rc_offset = o; - } - - } - ASSERT3U(asize, ==, tot << ashift); - - /* init RAIDZ parity ops */ - rm->rm_ops = vdev_raidz_math_get_ops(); - - return (rm); -} - static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { @@ -561,10 +378,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) init_zio_abd(*zio); if (opts->rto_expand) { - rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, - (*zio)->io_size, (*zio)->io_offset, + rm = vdev_raidz_map_alloc_expanded(*zio, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index 163929defc73..f912e281f6f3 100644 --- a/cmd/raidz_test/raidz_test.h +++ b/cmd/raidz_test/raidz_test.h @@ -119,7 +119,4 @@ void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); -struct raidz_map 
*vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); - #endif /* RAIDZ_TEST_H */ diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 9568d2bbfe38..94f3637ab689 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -4133,6 +4133,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); + + (void) printf("\traidz_reflow state=%u off=%llu\n", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + (void) printf("%s", footer ? footer : ""); } diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 10a3b5b14fc9..915e1bd7c710 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6623,9 +6623,17 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); - if (ret == 0 && wait) - ret = zpool_wait(zhp, - replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); + if (ret == 0 && wait) { + zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER; + char raidz_prefix[] = "raidz"; + if (replacing) { + activity = ZPOOL_WAIT_REPLACE; + } else if (strncmp(old_disk, + raidz_prefix, strlen(raidz_prefix)) == 0) { + activity = ZPOOL_WAIT_RAIDZ_EXPAND; + } + ret = zpool_wait(zhp, activity); + } nvlist_free(props); nvlist_free(nvroot); @@ -8162,6 +8170,98 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) } } +/* + * Print out detailed raidz expansion status. + */ +static void +print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres) +{ + char copied_buf[7]; + + if (pres == NULL || pres->pres_state == DSS_NONE) + return; + + /* + * Determine name of vdev. 
+ */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + nvlist_t **child; + uint_t children; + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + assert(pres->pres_expanding_vdev < children); + + printf_color(ANSI_BOLD, gettext("raidz expand: ")); + + time_t start = pres->pres_start_time; + time_t end = pres->pres_end_time; + zfs_nicenum(pres->pres_reflowed, copied_buf, sizeof (copied_buf)); + + /* + * Expansion is finished or canceled. + */ + if (pres->pres_state == DSS_FINISHED) { + uint64_t minutes_taken = (end - start) / 60; + + (void) printf(gettext("Expansion of vdev %u copied %s " + "in %lluh%um, completed on %s"), + (int)pres->pres_expanding_vdev, + copied_buf, + (u_longlong_t)(minutes_taken / 60), + (uint_t)(minutes_taken % 60), + ctime((time_t *)&end)); + } else { + char examined_buf[7], total_buf[7], rate_buf[7]; + uint64_t copied, total, elapsed, secs_left; + double fraction_done; + uint_t rate; + + assert(pres->pres_state == DSS_SCANNING); + + /* + * Expansion is in progress. + */ + (void) printf(gettext( + "Expansion of vdev %u in progress since %s"), + (int)pres->pres_expanding_vdev, ctime(&start)); + + copied = pres->pres_reflowed > 0 ? pres->pres_reflowed : 1; + total = pres->pres_to_reflow; + fraction_done = (double)copied / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - pres->pres_start_time; + elapsed = elapsed > 0 ? elapsed : 1; + rate = copied / elapsed; + rate = rate > 0 ? 
rate : 1; + secs_left = (total - copied) / rate; + + zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than + * 30 days + */ + (void) printf(gettext(" %s copied out of %s at %s/s, " + "%.2f%% done"), + examined_buf, total_buf, rate_buf, 100 * fraction_done); + if (pres->pres_waiting_for_resilver) { + (void) printf(gettext(", paused due to io errors, " + "waiting for resilver or clear\n")); + } else if (secs_left < (30 * 24 * 3600)) { + char time_buf[32]; + secs_to_dhms(secs_left, time_buf); + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext( + ", (copy is slow, no estimated time)\n")); + } + } +} static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { @@ -8739,19 +8839,24 @@ status_callback(zpool_handle_t *zhp, void *data) uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - pool_checkpoint_stat_t *pcs = NULL; - pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); + pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); + pool_checkpoint_stat_t *pcs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); + pool_raidz_expand_stat_t *pres = NULL; + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + print_raidz_expand_status(zhp, pres); + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) @@ -10705,8 +10810,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; + pool_raidz_expand_stat_t *pres = NULL; const 
char *const headers[] = {"DISCARD", "FREE", "INITIALIZE", - "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"}; + "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ @@ -10765,6 +10871,13 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) vdev_activity_top_remaining(nvroot); } + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + if (pres != NULL && pres->pres_state == DSS_SCANNING) { + int64_t rem = pres->pres_to_reflow - pres->pres_reflowed; + bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem; + } + bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = @@ -10794,11 +10907,12 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) if (!wd->wd_enabled[i]) continue; - if (wd->wd_exact) + if (wd->wd_exact) { (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); - else + } else { zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); + } if (wd->wd_scripted) (void) printf(i == 0 ? 
"%s" : "\t%s", buf); @@ -10904,7 +11018,8 @@ zpool_do_wait(int argc, char **argv) for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "discard", "free", "initialize", "replace", - "remove", "resilver", "scrub", "trim" }; + "remove", "resilver", "scrub", "trim", + "raidz_expand" }; for (i = 0; i < ARRAY_SIZE(col_opts); ++i) if (strcmp(tok, col_opts[i]) == 0) { diff --git a/cmd/ztest.c b/cmd/ztest.c index b6b99bfff6db..feb6ac3766c8 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -151,6 +151,7 @@ typedef struct ztest_shared_hdr { uint64_t zh_stats_count; uint64_t zh_ds_size; uint64_t zh_ds_count; + uint64_t zh_scratch_state_size; } ztest_shared_hdr_t; static ztest_shared_hdr_t *ztest_shared_hdr; @@ -174,6 +175,7 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; + int zo_raid_do_expand; int zo_raid_children; int zo_raid_parity; char zo_raid_type[8]; @@ -188,6 +190,7 @@ typedef struct ztest_shared_opts { uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; + uint64_t zo_raidz_expand_test; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; @@ -249,6 +252,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, .zo_gvars_count = 0, + .zo_raidz_expand_test = 0, }; extern uint64_t metaslab_force_ganging; @@ -261,6 +265,7 @@ extern uint_t dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; +extern uint64_t raidz_expand_max_offset_pause; static ztest_shared_opts_t *ztest_shared_opts; @@ -274,6 +279,12 @@ typedef struct ztest_shared_ds { static ztest_shared_ds_t *ztest_shared_ds; #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) +typedef struct ztest_scratch_state { + uint64_t zs_raidz_scratch_verify_pause; +} 
ztest_shared_scratch_state_t; + +static ztest_shared_scratch_state_t *ztest_scratch_state; + #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) @@ -311,9 +322,9 @@ typedef struct bufwad { * still need to map from object ID to rangelock_t. */ typedef enum { - RL_READER, - RL_WRITER, - RL_APPEND + ZTRL_READER, + ZTRL_WRITER, + ZTRL_APPEND } rl_type_t; typedef struct rll { @@ -408,6 +419,7 @@ ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; +ztest_func_t ztest_vdev_raidz_attach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_class_add; @@ -465,6 +477,7 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), + ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), @@ -481,6 +494,34 @@ static ztest_info_t ztest_info[] = { #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) +ztest_info_t raidz_expand_info[] = { +/* XXX - does this list of activities need further pruning? 
*/ + ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), + ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), + ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), + ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), + ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), + ZTI_INIT(ztest_zap, 30, &zopt_always), + ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), + ZTI_INIT(ztest_split_pool, 1, &zopt_always), + ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), + ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), + ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), + ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), + ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), + ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), +#if 0 + ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), +#endif + ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), + ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), + ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), + ZTI_INIT(ztest_trim, 1, &zopt_sometimes), + ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), +}; + +#define RAIDZ_EXPAND_FUNCS (sizeof (raidz_expand_info) / sizeof (ztest_info_t)) + /* * The following struct is used to hold a list of uncalled commit callbacks. * The callbacks are ordered by txg number. 
@@ -745,7 +786,7 @@ static ztest_option_t option_table[] = { DEFAULT_RAID_CHILDREN, NULL}, { 'R', "raid-parity", "INTEGER", "Raid parity", DEFAULT_RAID_PARITY, NULL}, - { 'K', "raid-kind", "raidz|draid|random", "Raid kind", + { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", NO_DEFAULT, "random"}, { 'D', "draid-data", "INTEGER", "Number of draid data drives", DEFAULT_DRAID_DATA, NULL}, @@ -781,6 +822,9 @@ static ztest_option_t option_table[] = { NO_DEFAULT, NULL}, { 'C', "vdev-class-state", "on|off|random", "vdev class state", NO_DEFAULT, "random"}, + { 'X', "raidz-expand-max-offset", "OFFSET", + "raidz_expand test, killing at off bytes into reflow", + NO_DEFAULT, NULL}, { 'o', "option", "\"OPTION=INTEGER\"", "Set global variable to an unsigned 32-bit integer value", NO_DEFAULT, NULL}, @@ -959,6 +1003,7 @@ process_options(int argc, char **argv) case 'T': case 'P': case 'F': + case 'X': value = nicenumtoull(optarg); } switch (opt) { @@ -1027,6 +1072,9 @@ process_options(int argc, char **argv) case 'V': zo->zo_verbose++; break; + case 'X': + zo->zo_raidz_expand_test = value; + break; case 'E': zo->zo_init = 0; break; @@ -1078,9 +1126,18 @@ process_options(int argc, char **argv) fini_options(); - /* When raid choice is 'random' add a draid pool 50% of the time */ if (strcmp(raid_kind, "random") == 0) { - raid_kind = (ztest_random(2) == 0) ? 
"draid" : "raidz"; + switch (ztest_random(3)) { + case 0: + raid_kind = "raidz"; + break; + case 1: + raid_kind = "eraidz"; + break; + case 2: + raid_kind = "draid"; + break; + } if (ztest_opts.zo_verbose >= 3) (void) printf("choosing RAID type '%s'\n", raid_kind); @@ -1119,6 +1176,16 @@ process_options(int argc, char **argv) (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, sizeof (zo->zo_raid_type)); + } else if (strcmp(raid_kind, "eraidz") == 0) { + /* using eraidz (expandable raidz) */ + zo->zo_raid_do_expand = B_TRUE; + + /* No top-level mirrors with raidz expansion for now */ + zo->zo_mirrors = 0; + + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } else /* using raidz */ { ASSERT0(strcmp(raid_kind, "raidz")); @@ -1166,9 +1233,29 @@ ztest_kill(ztest_shared_t *zs) * Before we kill ourselves, make sure that the config is updated. * See comment above spa_write_cachefile(). */ - mutex_enter(&spa_namespace_lock); - spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); - mutex_exit(&spa_namespace_lock); + if (raidz_expand_max_offset_pause) { + if (mutex_tryenter(&spa_namespace_lock)) { + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, + B_FALSE); + mutex_exit(&spa_namespace_lock); + + ztest_scratch_state->zs_raidz_scratch_verify_pause = + raidz_expand_max_offset_pause; + } else { + /* + * Do not verify scratch object in case if + * spa_namespace_lock cannot be acquired, + * it can cause deadlock in spa_config_update(). 
+ */ + raidz_expand_max_offset_pause = 0; + + return; + } + } else { + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); + mutex_exit(&spa_namespace_lock); + } (void) raise(SIGKILL); } @@ -1615,7 +1702,7 @@ ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); - if (type == RL_READER) { + if (type == ZTRL_READER) { while (rll->rll_writer != NULL) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; @@ -2071,7 +2158,7 @@ ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT3U(object, !=, 0); - ztest_object_lock(zd, object, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_WRITER); VERIFY0(dmu_object_info(os, object, &doi)); @@ -2141,8 +2228,8 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) if (bt->bt_magic != BT_MAGIC) bt = NULL; - ztest_object_lock(zd, lr->lr_foid, RL_READER); - rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2245,9 +2332,9 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_READER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, - RL_WRITER); + ZTRL_WRITER); tx = dmu_tx_create(os); @@ -2287,7 +2374,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2415,7 +2502,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, ASSERT3P(zio, !=, NULL); ASSERT3U(size, 
!=, 0); - ztest_object_lock(zd, object, RL_READER); + ztest_object_lock(zd, object, ZTRL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); @@ -2440,7 +2527,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, if (buf != NULL) { /* immediate write */ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, ZTRL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -2455,7 +2542,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, } zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, ZTRL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); @@ -2532,7 +2619,7 @@ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) ASSERT3U(od->od_object, !=, 0); ASSERT0(missing); /* there should be no gaps */ - ztest_object_lock(zd, od->od_object, RL_READER); + ztest_object_lock(zd, od->od_object, ZTRL_READER); VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); @@ -2705,8 +2792,8 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) txg_wait_synced(dmu_objset_pool(os), 0); - ztest_object_lock(zd, object, RL_READER); - rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_READER); + rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); tx = dmu_tx_create(os); @@ -3034,13 +3121,32 @@ ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) spa_config_exit(spa, SCL_CONFIG, FTAG); } +static int +ztest_get_raidz_children(spa_t *spa) +{ + (void) spa; + vdev_t *raidvd; + + ASSERT(MUTEX_HELD(&ztest_vdev_lock)); + + if (ztest_opts.zo_raid_do_expand) { + raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; + + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + return (raidvd->vdev_children); + 
} + + return (ztest_opts.zo_raid_children); +} + void ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; - uint64_t version, newversion; + uint64_t raidz_children, version, newversion; nvlist_t *nvroot, *props; char *name; @@ -3059,8 +3165,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) */ (void) spa_destroy(name); + raidz_children = ztest_get_raidz_children(ztest_spa); + nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); + NULL, raidz_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the @@ -3126,6 +3234,7 @@ ztest_spa_checkpoint(spa_t *spa) case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: + case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: break; case ENOSPC: ztest_record_enospc(FTAG); @@ -3206,6 +3315,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; + uint64_t raidz_children; + nvlist_t *nvroot; int error; @@ -3213,8 +3324,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) return; mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -3268,7 +3379,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 
- "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, + "log" : NULL, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); @@ -3296,6 +3407,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; nvlist_t *nvroot; + uint64_t raidz_children; const char *class = (ztest_random(2) == 0) ? VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; @@ -3323,15 +3435,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) return; } - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); + class, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); @@ -3593,6 +3705,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; + uint64_t raidz_children; char *oldpath, *newpath; int replacing; int oldvd_has_siblings = B_FALSE; @@ -3609,7 +3722,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3624,6 +3738,14 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) goto out; } + /* + * Does not work with expandable raidz, bp corruptions detected. 
+ */ + if (ztest_opts.zo_raid_do_expand) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + /* * Decide whether to do an attach or a replace. */ @@ -3648,7 +3770,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (zs->zs_mirrors >= 1) { ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf / raidz_children]; } /* pick a child out of the raidz group */ @@ -3657,8 +3779,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); - ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf % raidz_children]; } /* @@ -3826,6 +3947,177 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) umem_free(newpath, MAXPATHLEN); } +static void +raidz_scratch_verify(void) +{ + spa_t *spa; + uint64_t pause, offset; + raidz_reflow_scratch_state_t state; + + ASSERT(raidz_expand_max_offset_pause == 0); + + if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) + return; + + kernel_init(SPA_MODE_READ); + + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(ztest_opts.zo_pool); + ASSERT(spa); + spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; + mutex_exit(&spa_namespace_lock); + + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + + ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); + + pause = ztest_scratch_state->zs_raidz_scratch_verify_pause; + offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + state = RRSS_GET_STATE(&spa->spa_uberblock); + + if (pause < RAIDZ_EXPAND_PAUSE_SCRATCH_VALID) { + ASSERT3U(offset, ==, 0); + ASSERT3U(state, ==, RRSS_SCRATCH_NOT_IN_USE); + } else if (pause >= RAIDZ_EXPAND_PAUSE_SCRATCH_VALID && + pause <= RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED) { + ASSERT3U(offset, >=, pause); + 
ASSERT3U(state, ==, RRSS_SCRATCH_VALID); + } else { + ASSERT(pause <= RAIDZ_EXPAND_PAUSE_SCRATCH_NOT_IN_USE); + ASSERT3U(offset, >, pause); + ASSERT3U(state, ==, RRSS_SCRATCH_NOT_IN_USE); + } + + ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; + + spa_close(spa, FTAG); + kernel_fini(); +} + +static void +ztest_scratch_thread(void *arg) +{ + (void) arg; + + for (int t = 100; t > 0; t -= 1) { + if (raidz_expand_max_offset_pause == 0) + thread_exit(); + + (void) poll(NULL, 0, 100); + } + + ztest_kill(ztest_shared); +} + +/* + * Verify that we can attach raidz device. + */ +void +ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); + kthread_t *scratch_thread = NULL; + vdev_t *newvd, *pvd; + nvlist_t *root; + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + int error, expected_error = 0; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + + if (!ztest_opts.zo_raid_do_expand) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (ztest_opts.zo_mmp_test) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (ztest_device_removal_active) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + pvd = vdev_lookup_top(spa, 0); + + ASSERT(pvd->vdev_ops == &vdev_raidz_ops); + + /* + * Get size of a child of the raidz group, + * make sure device is a bit bigger + */ + newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; + newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); + + /* + * Get next attached leaf id + */ + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; + zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; + + if (spa->spa_raidz_expand) + expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; + + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * 
Path to vdev to be attached + */ + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); + + /* + * Build the nvlist describing newpath. + */ + root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, + 0, 0, 1); + + if (ztest_random(2) == 0 && expected_error == 0) { + raidz_expand_max_offset_pause = + ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_NOT_IN_USE) + 1; + scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, + ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + } + + error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); + + nvlist_free(root); + + if (error == EOVERFLOW || + error == ZFS_ERR_CHECKPOINT_EXISTS || + error == ZFS_ERR_DISCARDING_CHECKPOINT) + expected_error = error; + + if (error != 0 && error != expected_error) { + fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", + newpath, newsize, error, expected_error); + } + + if (raidz_expand_max_offset_pause) { + if (error != 0) { + /* + * Do not verify scratch object in case of error + * returned by vdev attaching. + */ + raidz_expand_max_offset_pause = 0; + } + + VERIFY0(thread_join(scratch_thread)); + } + +out: + mutex_exit(&ztest_vdev_lock); + + umem_free(newpath, MAXPATHLEN); +} + void ztest_device_removal(ztest_ds_t *zd, uint64_t id) { @@ -4032,6 +4324,18 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) return; } + /* + * If we are under raidz expansion, the test can fail because the + * metaslab count will not increase immediately after the vdevs grow. + * It will happen only after raidz expansion completion. 
+ */ + if (spa->spa_raidz_expand) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; @@ -5816,7 +6120,7 @@ ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) dmu_object_info_t doi; dmu_buf_t *db; - ztest_object_lock(zd, obj, RL_READER); + ztest_object_lock(zd, obj, ZTRL_READER); if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { ztest_object_unlock(zd, obj); continue; @@ -6039,6 +6343,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeull; uint64_t top, leaf; + uint64_t raidz_children; char *path0; char *pathrand; size_t fsize; @@ -6049,6 +6354,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; + boolean_t injected = B_FALSE; path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); @@ -6061,15 +6367,23 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * strategy for damaging blocks does not take in to account evacuated * blocks which may have already been damaged. */ - if (ztest_device_removal_active) { - mutex_exit(&ztest_vdev_lock); + if (ztest_device_removal_active) + goto out; + + /* + * The fault injection strategy for damaging blocks cannot be used + * if raidz expansion is in progress. The leaves value + * (attached raidz children) is variable and strategy for damaging + * blocks will corrupt same data blocks on different child vdevs + * because of reflow process. 
+ */ + if (spa->spa_raidz_expand != NULL) goto out; - } maxfaults = MAXFAULTS(zs); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; mirror_save = zs->zs_mirrors; - mutex_exit(&ztest_vdev_lock); ASSERT3U(leaves, >=, 1); @@ -6210,13 +6524,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * call vdev_[on|off]line without holding locks * to force unpredictable failures but the side * effects of vdev_[on|off]line prevent us from - * doing so. We grab the ztest_vdev_lock here to - * prevent a race between injection testing and - * aux_vdev removal. + * doing so. */ - mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); - mutex_exit(&ztest_vdev_lock); } } @@ -6290,9 +6600,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; - mutex_enter(&ztest_vdev_lock); if (mirror_save != zs->zs_mirrors) { - mutex_exit(&ztest_vdev_lock); (void) close(fd); goto out; } @@ -6302,15 +6610,25 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) "can't inject bad word at 0x%"PRIx64" in %s", offset, pathrand); - mutex_exit(&ztest_vdev_lock); - if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%"PRIx64"\n", pathrand, offset); + + injected = B_TRUE; } (void) close(fd); out: + mutex_exit(&ztest_vdev_lock); + + if (injected && ztest_opts.zo_raid_do_expand) { + int error = spa_scan(spa, POOL_SCAN_SCRUB); + if (error == 0) { + while (dsl_scan_scrubbing(spa_get_dsl(spa))) + txg_wait_synced(spa_get_dsl(spa), 0); + } + } + umem_free(path0, MAXPATHLEN); umem_free(pathrand, MAXPATHLEN); } @@ -7193,6 +7511,38 @@ ztest_execute(int test, ztest_info_t *zi, uint64_t id) (double)functime / NANOSEC, zi->zi_funcname); } +static __attribute__((noreturn)) void +ztest_rzx_thread(void *arg) +{ + int rand; + uint64_t id = (uintptr_t)arg; + ztest_shared_t *zs = ztest_shared; + 
uint64_t call_next; + hrtime_t now; + ztest_info_t *zi; + ztest_shared_callstate_t *zc; + + while ((now = gethrtime()) < zs->zs_thread_stop) { + /* + * Pick a random function to execute. + * XXX - better to pick a specific set of functions here? + * i.e. a deterministic set of operations to generate pool data. + */ + rand = ztest_random(RAIDZ_EXPAND_FUNCS); + zi = &raidz_expand_info[rand]; + zc = ZTEST_GET_SHARED_CALLSTATE(rand); + call_next = zc->zc_next; + + if (now >= call_next && + atomic_cas_64(&zc->zc_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { + ztest_execute(rand, zi, id); + } + } + + thread_exit(); +} + static __attribute__((noreturn)) void ztest_thread(void *arg) { @@ -7208,7 +7558,7 @@ ztest_thread(void *arg) /* * See if it's time to force a crash. */ - if (now > zs->zs_thread_kill) + if (now > zs->zs_thread_kill && !raidz_expand_max_offset_pause) ztest_kill(zs); /* @@ -7399,9 +7749,13 @@ ztest_freeze(void) spa_t *spa; int numloops = 0; + if (ztest_opts.zo_raid_do_expand) + return; + if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY0(ztest_dataset_open(0)); @@ -7469,6 +7823,7 @@ ztest_freeze(void) /* * Open and close the pool and dataset to induce log replay. 
*/ + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); @@ -7518,6 +7873,7 @@ ztest_import(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); ztest_import_impl(); @@ -7540,6 +7896,363 @@ ztest_import(ztest_shared_t *zs) mutex_destroy(&ztest_checkpoint_lock); } +#define RAIDZ_EXPAND_KILLED UINT64_MAX +#define RAIDZ_EXPAND_CHECKED (UINT64_MAX - 1) + +/* + * Start a raidz expansion test. We run some I/O on the pool for a while + * to get some data in the pool. Then we grow the raidz and + * kill the test at the requested offset into the reflow, verifying that + * doing such does not lead to pool corruption. + */ +static void +ztest_raidz_expand_run(ztest_shared_t *zs) +{ + spa_t *spa; + objset_t *os; + kthread_t *resume_thread, *deadman_thread; + kthread_t **run_threads; + uint64_t object; + uint64_t ashift = ztest_get_ashift(); + int error; + int i, t, d; + vdev_t *rzvd, *cvd; + uint64_t csize, desreflow; + nvlist_t *root; + char *newpath; + pool_raidz_expand_stat_t rzx_stats; + pool_raidz_expand_stat_t *pres = &rzx_stats; + + newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + ztest_exiting = B_FALSE; + + /* + * Initialize parent/child shared state. 
+ */ + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + zs->zs_thread_start = gethrtime(); + zs->zs_thread_stop = + zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; + zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); + zs->zs_thread_kill = zs->zs_thread_stop; + if (ztest_random(100) < ztest_opts.zo_killrate) { + zs->zs_thread_kill -= + ztest_random(ztest_opts.zo_passtime * NANOSEC); + } + + mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); + + list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), + offsetof(ztest_cb_data_t, zcd_node)); + + /* + * Open our pool. It may need to be imported first depending on + * what tests were running when the previous pass was terminated. + */ + raidz_scratch_verify(); + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + error = spa_open(ztest_opts.zo_pool, &spa, FTAG); + if (error) { + VERIFY3S(error, ==, ENOENT); + ztest_import_impl(); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + } + + metaslab_preload_limit = ztest_random(20) + 1; + ztest_spa = spa; + + VERIFY0(vdev_raidz_impl_set("cycle")); + + dmu_objset_stats_t dds; + VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, + DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + zs->zs_guid = dds.dds_guid; + dmu_objset_disown(os, B_TRUE, FTAG); + + /* + * Create a thread to periodically resume suspended I/O. + */ + resume_thread = thread_create(NULL, 0, ztest_resume_thread, + spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + /* + * Create a deadman thread and set to panic if we hang. 
+ */ + deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, + zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; + + /* + * Verify that we can safely inquire about any object, + * whether it's allocated or not. To make it interesting, + * we probe a 5-wide window around each power of two. + * This hits all edge cases, including zero and the max. + */ + for (t = 0; t < 64; t++) { + for (d = -5; d <= 5; d++) { + error = dmu_object_info(spa->spa_meta_objset, + (1ULL << t) + d, NULL); + ASSERT(error == 0 || error == ENOENT || + error == EINVAL); + } + } + + /* + * We should not get any ENOSPC errors in this test + */ + if (zs->zs_enospc_count != 0) { + fatal(0, "raidz expand: ENOSPC errors?"); + } + + run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), + UMEM_NOFAIL); + + if (ztest_opts.zo_verbose >= 4) + (void) printf("starting main threads...\n"); + + /* + * Replay all logs of all datasets in the pool. This is primarily for + * temporary datasets which wouldn't otherwise get replayed, which + * can trigger failures when attempting to offline a SLOG in + * ztest_fault_inject(). + */ + (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, + NULL, DS_FIND_CHILDREN); + + if (ztest_opts.zo_raidz_expand_test != 0 && + ztest_opts.zo_raidz_expand_test < RAIDZ_EXPAND_KILLED) { + desreflow = ztest_opts.zo_raidz_expand_test; + /* + * Set the reflow to pause at the desired offset + */ + raidz_expand_max_offset_pause = desreflow; + /* + * In here on first pass of test only. 
+ */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("running raidz expansion test," + " killing when offset %llu of reflow reached\n", + (u_longlong_t)desreflow); + if (ztest_opts.zo_verbose > 1) { + /* XXX - pause to allow debugger attach */ + (void) printf( + "our pid is %d, pausing for 10 seconds\n", + getpid()); + sleep(10); + } + } + /* + * Put some data in the pool and then attach a vdev to initiate + * reflow. + */ + /* + * Kick off all the I/O generators that run in parallel. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets && + ztest_dataset_open(t) != 0) { + umem_free(run_threads, ztest_opts.zo_threads * + sizeof (kthread_t *)); + return; + } + + run_threads[t] = thread_create(NULL, 0, + ztest_rzx_thread, + (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); + } + + /* + * Wait a while for I/O to put some data in the pool + * XXX- add an option to specify if we wait for I/O to quiesce + */ + for (i = 0; i < 60; i++) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 1000); + } + + rzvd = spa->spa_root_vdev->vdev_child[0]; + ASSERT(rzvd->vdev_ops == &vdev_raidz_ops); + /* + * get size of a child of the raidz group + */ + cvd = rzvd->vdev_child[0]; + + csize = vdev_get_min_asize(cvd); + csize += csize / 10; /* make sure device is a bit bigger */ + /* + * Path to vdev to be attached + */ + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); + /* + * Build the nvlist describing newpath. 
+ */ + root = make_vdev_root(newpath, NULL, NULL, csize, ashift, NULL, + 0, 0, 1); + /* + * Now attach the vdev to the raidz so it will expand + */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("expanding raidz\n"); + } + error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, + B_FALSE); + nvlist_free(root); + if (error != 0) { + fatal(0, "raidz expand: attach (%s %llu) returned %d", + newpath, (long long)csize, error); + } + + /* + * Wait for desired reflow offset to be reached and kill the + * test + */ + /* + * Wait for reflow to begin + */ + while (spa->spa_raidz_expand == NULL) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + } + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + while (pres->pres_state != DSS_SCANNING) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + ASSERT3U(pres->pres_state, ==, DSS_SCANNING); + ASSERT3U(pres->pres_to_reflow, !=, 0); + /* + * Set so when we are killed we go to raidz checking rather than + * restarting test. + */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expandsion reflow started," + " waiting for offset %llu to be reched\n", + (u_longlong_t)desreflow); + } + + while (pres->pres_reflowed < desreflow) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + /* + * XXX - should we clear the reflow pause here? 
+ */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf( + "killing raidz expandsion test offset at %llu\n", + (u_longlong_t)pres->pres_reflowed); + } + /* + * Kill ourself, this simulates a panic during a reflow. Our + * parent will restart the test and the changed flag value + * will drive the test through the scrub/check code to + * verify the pool is not corrupted. + */ + ztest_kill(zs); + } else { /* check the pool is healthy */ + /* + * Set pool check done flag, main program will run a zdb check + * of the pool when we exit. + */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; + /* XXX - wait for reflow done? */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("\nverifying raidz expansion\n"); + if (ztest_opts.zo_verbose > 1) { + /* XXX - pause to allow debugger attach */ + (void) printf( + "our pid is %d, pausing for 10 seconds\n", + getpid()); + sleep(10); + } + } + VERIFY0(ztest_scrub_impl(spa)); + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expansion scrub check complete\n"); + } + } + + + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); + + /* Kill the resume and deadman threads */ + ztest_exiting = B_TRUE; + VERIFY0(thread_join(resume_thread)); + VERIFY0(thread_join(deadman_thread)); + ztest_resume(spa); + + /* + * Right before closing the pool, kick off a bunch of async I/O; + * spa_close() should wait for it to complete. + */ + for (object = 1; object < 50; object++) { + dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, + ZIO_PRIORITY_SYNC_READ); + } + + /* Verify that at least one commit cb was called in a timely fashion */ + if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) + VERIFY0(zc_min_txg_delay); + + spa_close(spa, FTAG); + + /* + * Verify that we can loop over all pools. 
+ */ + mutex_enter(&spa_namespace_lock); + for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) + if (ztest_opts.zo_verbose > 3) + (void) printf("spa_next: found %s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * Verify that we can export the pool and reimport it under a + * different name. + */ + if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + (void) snprintf(name, sizeof (name), "%s_import", + ztest_opts.zo_pool); + ztest_spa_import_export(ztest_opts.zo_pool, name); + ztest_spa_import_export(name, ztest_opts.zo_pool); + } + + kernel_fini(); + + list_destroy(&zcl.zcl_callbacks); + mutex_destroy(&zcl.zcl_callbacks_lock); + (void) pthread_rwlock_destroy(&ztest_name_lock); + mutex_destroy(&ztest_vdev_lock); + mutex_destroy(&ztest_checkpoint_lock); +} + /* * Kick off threads to run tests on all datasets in parallel. */ @@ -7582,6 +8295,7 @@ ztest_run(ztest_shared_t *zs) * Open our pool. It may need to be imported first depending on * what tests were running when the previous pass was terminated. 
*/ + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); error = spa_open(ztest_opts.zo_pool, &spa, FTAG); if (error) { @@ -7595,7 +8309,10 @@ ztest_run(ztest_shared_t *zs) metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; - VERIFY0(vdev_raidz_impl_set("cycle")); + /* + * BUGBUG raidz expansion do not run this for now + * VERIFY0(vdev_raidz_impl_set("cycle")); + */ dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, @@ -7825,6 +8542,7 @@ ztest_init(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* @@ -7909,6 +8627,7 @@ shared_data_size(ztest_shared_hdr_t *hdr) size += hdr->zh_size; size += hdr->zh_stats_size * hdr->zh_stats_count; size += hdr->zh_ds_size * hdr->zh_ds_count; + size += hdr->zh_scratch_state_size; return (size); } @@ -7932,6 +8651,7 @@ setup_hdr(void) hdr->zh_stats_count = ZTEST_FUNCS; hdr->zh_ds_size = sizeof (ztest_shared_ds_t); hdr->zh_ds_count = ztest_opts.zo_datasets; + hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); size = shared_data_size(hdr); VERIFY0(ftruncate(ztest_fd_data, size)); @@ -7966,6 +8686,8 @@ setup_data(void) ztest_shared_callstate = (void *)&buf[offset]; offset += hdr->zh_stats_size * hdr->zh_stats_count; ztest_shared_ds = (void *)&buf[offset]; + offset += hdr->zh_ds_size * hdr->zh_ds_count; + ztest_scratch_state = (void *)&buf[offset]; } static boolean_t @@ -8176,10 +8898,14 @@ main(int argc, char **argv) metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; - if (zs->zs_do_init) + if (zs->zs_do_init) { ztest_run_init(); - else - ztest_run(zs); + } else { + if (ztest_opts.zo_raidz_expand_test) + ztest_raidz_expand_run(zs); + else + ztest_run(zs); + } exit(0); } @@ -8305,6 +9031,9 @@ main(int argc, char **argv) if (!ztest_opts.zo_mmp_test) ztest_run_zdb(ztest_opts.zo_pool); + if 
(ztest_shared_opts->zo_raidz_expand_test == + RAIDZ_EXPAND_CHECKED) + break; /* raidz expand test complete */ } if (ztest_opts.zo_verbose >= 1) { @@ -8318,6 +9047,8 @@ main(int argc, char **argv) kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } + dump_debug_buffer(); + umem_free(cmd, MAXNAMELEN); return (0); diff --git a/contrib/pyzfs/libzfs_core/_constants.py b/contrib/pyzfs/libzfs_core/_constants.py index 5ee422dfa803..9c40ece1a7df 100644 --- a/contrib/pyzfs/libzfs_core/_constants.py +++ b/contrib/pyzfs/libzfs_core/_constants.py @@ -103,6 +103,7 @@ def enum(*sequential, **named): 'ZFS_ERR_NOT_USER_NAMESPACE', 'ZFS_ERR_RESUME_EXISTS', 'ZFS_ERR_CRYPTO_NOTSUP', + 'ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS', ], {} ) @@ -115,5 +116,6 @@ def enum(*sequential, **named): ZFS_ERR_VDEV_TOO_BIG = zfs_errno.ZFS_ERR_VDEV_TOO_BIG ZFS_ERR_WRONG_PARENT = zfs_errno.ZFS_ERR_WRONG_PARENT ZFS_ERR_VDEV_NOTSUP = zfs_errno.ZFS_ERR_VDEV_NOTSUP +ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS = zfs_errno.ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/_error_translation.py b/contrib/pyzfs/libzfs_core/_error_translation.py index 26676db398c5..3d1a2d573e39 100644 --- a/contrib/pyzfs/libzfs_core/_error_translation.py +++ b/contrib/pyzfs/libzfs_core/_error_translation.py @@ -43,6 +43,7 @@ ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_WRONG_PARENT, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, zfs_errno ) @@ -596,6 +597,8 @@ def lzc_pool_checkpoint_translate_error(ret, name, discard=False): raise lzc_exc.DeviceRemovalRunning() if ret == ZFS_ERR_VDEV_TOO_BIG: raise lzc_exc.DeviceTooBig() + if ret == ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: + raise lzc_exc.RaidzExpansionRunning() if discard: raise _generic_exception( ret, name, "Failed to discard pool checkpoint") diff --git a/contrib/pyzfs/libzfs_core/exceptions.py b/contrib/pyzfs/libzfs_core/exceptions.py index e484b07b6450..ba8f7e49093c 100644 --- 
a/contrib/pyzfs/libzfs_core/exceptions.py +++ b/contrib/pyzfs/libzfs_core/exceptions.py @@ -30,6 +30,7 @@ ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_WRONG_PARENT, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, zfs_errno ) @@ -598,4 +599,9 @@ class DeviceTooBig(ZFSError): message = "One or more top-level vdevs exceed the maximum vdev size" +class RaidzExpansionRunning(ZFSError): + errno = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS + message = "A raidz device is currently expanding" + + # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/include/libzfs.h b/include/libzfs.h index a7037e3e6266..f4c2d2753557 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -156,6 +156,7 @@ typedef enum zfs_error { EZFS_NOT_USER_NAMESPACE, /* a file is not a user namespace */ EZFS_CKSUM, /* insufficient replicas */ EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ + EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ EZFS_UNKNOWN } zfs_error_t; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index bc940e8a7929..bc329398e986 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -716,6 +716,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ +#define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ @@ -781,6 +782,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_RAIDZ_EXPANDING "raidz_expanding" +#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME 
"hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" @@ -899,6 +902,15 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE \ + "org.freebsd:raidz_expand_state" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME \ + "org.freebsd:raidz_expand_start_time" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME \ + "org.freebsd:raidz_expand_end_time" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \ + "org.freebsd:raidz_expand_bytes_copied" + /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" @@ -1130,6 +1142,16 @@ typedef struct pool_removal_stat { uint64_t prs_mapping_memory; } pool_removal_stat_t; +typedef struct pool_raidz_expand_stat { + uint64_t pres_state; /* dsl_scan_state_t */ + uint64_t pres_expanding_vdev; + uint64_t pres_start_time; + uint64_t pres_end_time; + uint64_t pres_to_reflow; /* bytes that need to be moved */ + uint64_t pres_reflowed; /* bytes moved so far */ + uint64_t pres_waiting_for_resilver; +} pool_raidz_expand_stat_t; + typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, @@ -1569,6 +1591,7 @@ typedef enum { ZFS_ERR_NOT_USER_NAMESPACE, ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, } zfs_errno_t; /* @@ -1593,6 +1616,7 @@ typedef enum { ZPOOL_WAIT_RESILVER, ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, + ZPOOL_WAIT_RAIDZ_EXPAND, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 44afa763283a..802dc0516f07 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -321,6 +322,9 @@ struct spa { spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. 
*/ + vdev_raidz_expand_t *spa_raidz_expand; + zthr_t *spa_raidz_expand_zthr; + uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 03bcfa8f4dd1..e57185a9494d 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -75,6 +75,27 @@ extern "C" { #define MMP_FAIL_INT_SET(fail) \ (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT) +typedef enum raidz_reflow_scratch_state { + RRSS_SCRATCH_NOT_IN_USE = 0, + RRSS_SCRATCH_VALID, +} raidz_reflow_scratch_state_t; + +#define RRSS_GET_OFFSET(ub) \ + BF64_GET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0) +#define RRSS_SET_OFFSET(ub, x) \ + BF64_SET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0, x) + +#define RRSS_GET_STATE(ub) \ + BF64_GET_SB((ub)->ub_raidz_reflow_info, 55, 9, 0, 0) +#define RRSS_SET_STATE(ub, x) \ + BF64_SET_SB((ub)->ub_raidz_reflow_info, 55, 9, 0, 0, x) + +#define RAIDZ_REFLOW_SET(ub, state, offset) do { \ + (ub)->ub_raidz_reflow_info = 0; \ + RRSS_SET_OFFSET(ub, offset); \ + RRSS_SET_STATE(ub, state); \ +} while (0) + struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ uint64_t ub_version; /* SPA_VERSION */ @@ -136,6 +157,8 @@ struct uberblock { * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. 
*/ uint64_t ub_checkpoint_txg; + + uint64_t ub_raidz_reflow_info; }; #ifdef __cplusplus diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 03e1f438aaf9..c2ad8c36dfcf 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -132,15 +132,19 @@ extern void vdev_space_update(vdev_t *vd, extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); +extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, + uint64_t txg); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); /* - * Return the amount of space allocated for a gang block header. + * Return the amount of space allocated for a gang block header. Note that + * since the physical birth txg is not provided, this must be constant for + * a given vdev. (e.g. raidz expansion can't change this) */ static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { - return (vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); + return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); @@ -204,6 +208,7 @@ extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); +extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 2b22b973ba49..75ae6775b581 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -72,7 +72,7 @@ typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); -typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg); 
typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); @@ -279,6 +279,7 @@ struct vdev { uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ uint64_t vdev_failfast; /* device failfast setting */ + boolean_t vdev_rz_expanding; /* raidz is being expanded? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ @@ -533,6 +534,7 @@ typedef struct vdev_label { /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. + * On RAIDZ, this space is overwritten during RAIDZ expansion. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ @@ -605,7 +607,7 @@ extern vdev_ops_t vdev_indirect_ops; */ extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs); -extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index e34b6e4b158e..6a7961428e63 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -26,6 +26,7 @@ #define _SYS_VDEV_RAIDZ_H #include +#include #ifdef __cplusplus extern "C" { @@ -35,6 +36,8 @@ struct zio; struct raidz_col; struct raidz_row; struct raidz_map; +struct vdev_raidz; +struct uberblock; #if !defined(_KERNEL) struct kernel_param {}; #endif @@ -44,13 +47,19 @@ struct kernel_param {}; */ struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); +struct raidz_map *vdev_raidz_map_alloc_expanded(struct zio *, + uint64_t, uint64_t, uint64_t, uint64_t, 
uint64_t, uint64_t, boolean_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_free(struct vdev_raidz *); void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *); void vdev_raidz_generate_parity(struct raidz_map *); void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); void vdev_raidz_child_done(zio_t *); void vdev_raidz_io_done(zio_t *); void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *); +struct raidz_row *vdev_raidz_row_alloc(int); +void vdev_raidz_reflow_copy_scratch(spa_t *); +void raidz_dtl_reassessed(vdev_t *); extern const zio_vsd_ops_t vdev_raidz_vsd_ops; @@ -65,11 +74,97 @@ int vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *, const int *, const int *, const int); int vdev_raidz_impl_set(const char *); +typedef struct vdev_raidz_expand { + uint64_t vre_vdev_id; + + kmutex_t vre_lock; + kcondvar_t vre_cv; + + /* + * How much i/o is outstanding (issued and not completed). + */ + uint64_t vre_outstanding_bytes; + + /* + * Next offset to issue i/o for. + */ + uint64_t vre_offset; + + /* + * Lowest offset of a failed expansion i/o. The expansion will retry + * from here. Once the expansion thread notices the failure and exits, + * vre_failed_offset is reset back to UINT64_MAX, and + * vre_waiting_for_resilver will be set. + */ + uint64_t vre_failed_offset; + boolean_t vre_waiting_for_resilver; + + /* + * Offset that is completing each txg + */ + uint64_t vre_offset_pertxg[TXG_SIZE]; + + /* + * Bytes copied in each txg. + */ + uint64_t vre_bytes_copied_pertxg[TXG_SIZE]; + + /* + * The rangelock prevents normal read/write zio's from happening while + * there are expansion (reflow) i/os in progress to the same offsets. 
+ */ + zfs_rangelock_t vre_rangelock; + + /* + * These fields are stored on-disk in the vdev_top_zap: + */ + dsl_scan_state_t vre_state; + uint64_t vre_start_time; + uint64_t vre_end_time; + uint64_t vre_bytes_copied; +} vdev_raidz_expand_t; + typedef struct vdev_raidz { - int vd_logical_width; + /* + * Number of child vdevs when this raidz vdev was created (i.e. before + * any raidz expansions). + */ + int vd_original_width; + + /* + * The current number of child vdevs, which may be more than the + * original width if an expansion is in progress or has completed. + */ + int vd_physical_width; + int vd_nparity; + + /* + * Tree of reflow_node_t's. The lock protects the avl tree only. + * The reflow_node_t's describe completed expansions, and are used + * to determine the logical width given a block's birth time. + */ + avl_tree_t vd_expand_txgs; + kmutex_t vd_expand_lock; + + /* + * If this vdev is being expanded, spa_raidz_expand is set to this + */ + vdev_raidz_expand_t vn_vre; } vdev_raidz_t; +extern int vdev_raidz_attach_check(vdev_t *); +extern void vdev_raidz_attach_sync(void *, dmu_tx_t *); +extern void spa_start_raidz_expansion_thread(spa_t *); +extern int spa_raidz_expand_get_stats(spa_t *, pool_raidz_expand_stat_t *); +extern int vdev_raidz_load(vdev_t *); + +/* Some of raidz scratch area states */ +#define RAIDZ_EXPAND_PAUSE_NONE 0 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_VALID 4 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED 5 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_NOT_IN_USE 11 + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index c1037fa12e30..fae03f8f53da 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -30,6 +30,8 @@ #include #include #include +#include +#include #ifdef __cplusplus extern "C" { @@ -102,28 +104,32 @@ typedef struct raidz_impl_ops { char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */ } raidz_impl_ops_t; + typedef struct raidz_col { - uint64_t 
rc_devidx; /* child device index for I/O */ + int rc_devidx; /* child device index for I/O */ + uint32_t rc_size; /* I/O size */ uint64_t rc_offset; /* device offset */ - uint64_t rc_size; /* I/O size */ abd_t rc_abdstruct; /* rc_abd probably points here */ abd_t *rc_abd; /* I/O data */ abd_t *rc_orig_data; /* pre-reconstruction */ int rc_error; /* I/O error for this device */ - uint8_t rc_tried; /* Did we attempt this I/O column? */ - uint8_t rc_skipped; /* Did we skip this I/O column? */ - uint8_t rc_need_orig_restore; /* need to restore from orig_data? */ - uint8_t rc_force_repair; /* Write good data to this column */ - uint8_t rc_allow_repair; /* Allow repair I/O to this column */ + uint8_t rc_tried:1; /* Did we attempt this I/O column? */ + uint8_t rc_skipped:1; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */ + uint8_t rc_force_repair:1; /* Write good data to this column */ + uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + int rc_shadow_devidx; /* for double write during expansion */ + int rc_shadow_error; /* for double write during expansion */ + uint64_t rc_shadow_offset; /* for double write during expansion */ } raidz_col_t; typedef struct raidz_row { - uint64_t rr_cols; /* Regular column count */ - uint64_t rr_scols; /* Count including skipped columns */ - uint64_t rr_bigcols; /* Remainder data column count */ - uint64_t rr_missingdata; /* Count of missing data devices */ - uint64_t rr_missingparity; /* Count of missing parity devices */ - uint64_t rr_firstdatacol; /* First data column/parity count */ + int rr_cols; /* Regular column count */ + int rr_scols; /* Count including skipped columns */ + int rr_bigcols; /* Remainder data column count */ + int rr_missingdata; /* Count of missing data devices */ + int rr_missingparity; /* Count of missing parity devices */ + int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int 
rr_nempty; /* empty sectors included in parity */ #ifdef ZFS_DEBUG @@ -138,10 +144,25 @@ typedef struct raidz_map { int rm_nrows; /* Regular row count */ int rm_nskip; /* RAIDZ sectors skipped for padding */ int rm_skipstart; /* Column index of padding start */ + int rm_original_width; /* pre-expansion width of raidz vdev */ + int rm_nphys_cols; /* num entries in rm_phys_col[] */ + zfs_locked_range_t *rm_lr; const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + raidz_col_t *rm_phys_col; /* if non-NULL, read i/o aggregation */ raidz_row_t *rm_row[0]; /* flexible array of rows */ } raidz_map_t; +/* + * Nodes in vdev_raidz_t:vd_expand_txgs. + * Blocks with physical birth time of re_txg or later have the specified + * logical width (until the next node). + */ +typedef struct reflow_node { + uint64_t re_txg; + uint64_t re_logical_width; + avl_node_t re_link; +} reflow_node_t; + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 7066c699e203..41989a946acc 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -80,6 +80,7 @@ typedef enum spa_feature { SPA_FEATURE_BLAKE3, SPA_FEATURE_BLOCK_CLONING, SPA_FEATURE_AVZ_V2, + SPA_FEATURE_RAIDZ_EXPANSION, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index d4af31c50cf8..0375ff4d29c2 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3507,9 +3507,8 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, break; case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " - "or device removal is in progress"), - new_disk); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "%s is busy"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index b94abea3d581..d2cc0449fbbb 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -317,6 +317,8 @@ 
libzfs_error_description(libzfs_handle_t *hdl) case EZFS_RESUME_EXISTS: return (dgettext(TEXT_DOMAIN, "Resuming recv on existing " "dataset without force")); + case EZFS_RAIDZ_EXPAND_IN_PROGRESS: + return (dgettext(TEXT_DOMAIN, "raidz expansion in progress")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -763,6 +765,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; + case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: + zfs_verror(hdl, EZFS_RAIDZ_EXPAND_IN_PROGRESS, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/man/man1/ztest.1 b/man/man1/ztest.1 index 64514b317275..5910c95ffa47 100644 --- a/man/man1/ztest.1 +++ b/man/man1/ztest.1 @@ -122,11 +122,11 @@ Number of mirror copies. Number of raidz/draid disks. .It Fl R , -raid-parity Ns = (default: Sy 1 ) Raid parity (raidz & draid). -.It Fl K , -raid-kind Ns = Ns Sy raidz Ns | Ns Sy draid Ns | Ns Sy random No (default : Sy random ) +.It Fl K , -raid-kind Ns = Ns Sy raidz Ns | Ns Sy eraidz Ns | Ns Sy draid Ns | Ns Sy random No (default: Sy random ) The kind of RAID config to use. With .Sy random -the kind alternates between raidz and draid. +the kind alternates between raidz, eraidz (expandable raidz) and draid. .It Fl D , -draid-data Ns = (default: Sy 4 ) Number of data disks in a dRAID redundancy group. .It Fl S , -draid-spares Ns = (default: Sy 1 ) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 271b02b6ee42..d130b4ef9632 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -419,6 +419,19 @@ TXGs must pass before unloading will occur. .It Sy reference_history Ns = Ns Sy 3 Pq uint Maximum reference holders being tracked when reference_tracking_enable is active. +.It Sy raidz_expand_max_copy_bytes Ns = Ns Sy 160MB Pq ulong +Max amount of memory to use for RAID-Z expansion I/O. 
+This limits how much I/O can be outstanding at once. +. +.It Sy raidz_expand_max_offset_pause Ns = Ns Sy 0 Pq ulong +For testing, pause RAID-Z expansion at this offset. +. +.It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong +For expanded RAID-Z, aggregate reads that have more rows than this. +. +.It Sy reference_history Ns = Ns Sy 3 Pq int +Maximum reference holders being tracked when reference_tracking_enable is +active. . .It Sy reference_tracking_enable Ns = Ns Sy 0 Ns | Ns 1 Pq int Track reference holders to diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b901ce6c2935..f114e8a2bcf9 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -947,6 +947,14 @@ once all filesystems that have ever had their property set to .Sy zstd are destroyed. +. +.feature org.openzfs raidz_expansion no none +This feature enables the +.Nm zpool Cm attach +subcommand to attach a new device to a RAID-Z group, expanding the total +amount usable space in the pool. +See +.Xr zpool-attach 8 . .El . .Sh SEE ALSO diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8 index 73535cbdf108..8d98e1c664ea 100644 --- a/man/man8/zpool-attach.8 +++ b/man/man8/zpool-attach.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 15, 2020 +.Dd June 28, 2023 .Dt ZPOOL-ATTACH 8 .Os . @@ -45,7 +45,15 @@ Attaches .Ar new_device to the existing .Ar device . -The existing device cannot be part of a raidz configuration. +The behavior differs depending on if the existing +.Ar device +is a RAID-Z device, or a mirror/plain device. +.Pp +If the existing device is a mirror or plain device +.Pq e.g. specified as Qo Li sda Qc or Qq Li mirror-7 , +the new device will be mirrored with the existing device, a resilver will be +initiated, and the new device will contribute to additional redundancy once the +resilver completes. 
If .Ar device is not currently part of a mirrored configuration, @@ -62,6 +70,39 @@ creates a three-way mirror, and so on. In either case, .Ar new_device begins to resilver immediately and any running scrub is cancelled. +.Pp +If the existing device is a RAID-Z device +.Pq e.g. specified as Qq Ar raidz2-0 , +the new device will become part of that RAID-Z group. +A "raidz expansion" will be initiated, and once the expansion completes, +the new device will contribute additional space to the RAID-Z group. +The expansion entails reading all allocated space from existing disks in the +RAID-Z group, and rewriting it to the new disks in the RAID-Z group (including +the newly added +.Ar device ) . +Its progress can be monitored with +.Nm zpool Cm status . +.Pp +Data redundancy is maintained during and after the expansion. +If a disk fails while the expansion is in progress, the expansion pauses until +the health of the RAID-Z vdev is restored (e.g. by replacing the failed disk +and waiting for reconstruction to complete). +Expansion does not change the number of failures that can be tolerated +without data loss (e.g. a RAID-Z2 is still a RAID-Z2 even after expansion). +A RAID-Z vdev can be expanded multiple times. +.Pp +After the expansion completes, old blocks retain their old data-to-parity +ratio +.Pq e.g. 5-wide RAID-Z2 has 3 data and 2 parity +but distributed among the larger set of disks. +New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide +RAID-Z2 which has been expanded once to 6-wide, has 4 data and 2 parity). +However, the vdev's assumed parity ratio does not change, so slightly less +space than is expected may be reported for newly-written blocks, according to +.Nm zfs Cm list , +.Nm df , +.Nm ls Fl s , +and similar tools. .Bl -tag -width Ds .It Fl f Forces use of @@ -76,16 +117,15 @@ manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . 
.It Fl s -The +When attaching to a mirror or plain device, the .Ar new_device is reconstructed sequentially to restore redundancy as quickly as possible. Checksums are not verified during sequential reconstruction so a scrub is started when the resilver completes. -Sequential reconstruction is not supported for raidz configurations. .It Fl w Waits until .Ar new_device -has finished resilvering before returning. +has finished resilvering or expanding before returning. .El . .Sh SEE ALSO diff --git a/man/man8/zpool-wait.8 b/man/man8/zpool-wait.8 index 683b0141425c..0e4c9bfdebee 100644 --- a/man/man8/zpool-wait.8 +++ b/man/man8/zpool-wait.8 @@ -20,7 +20,7 @@ .\" .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. -.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. 
@@ -76,6 +76,8 @@ Resilver to cease Scrub to cease .It Sy trim Manual trim to cease +.It Sy raidz_expand +Attaching to a RAID-Z vdev to complete .El .Pp If an diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c index b090ec684e05..f707959c9445 100644 --- a/module/os/linux/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -175,7 +175,8 @@ __dprintf(boolean_t dprint, const char *file, const char *func, newfile = file; } - i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func); + i = snprintf(buf, size, "%px %s%s:%d:%s(): ", + curthread, prefix, newfile, line, func); if (i < size) { va_start(adx, fmt); diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 4c9b7ed72a0f..509563b3d6f1 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -737,6 +737,11 @@ zpool_feature_init(void) ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfeature_register(SPA_FEATURE_RAIDZ_EXPANSION, + "org.openzfs:raidz_expansion", "raidz_expansion", + "Support for raidz expansion", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 7023f448182a..13f59f845e47 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4735,7 +4735,7 @@ arc_evict_cb(void *arg, zthr_t *zthr) * broadcast will wake any remaining arc evict waiters. 
*/ mutex_enter(&arc_evict_lock); - arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && + arc_evict_needed = !zthr_iscancelled(zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 50428bff3ef4..3958ebea0a51 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -3055,7 +3055,6 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 176247d63b76..50f5d5257ccb 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -4296,7 +4296,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || + vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } @@ -5231,7 +5232,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, ASSERT(mg->mg_class == mc); - uint64_t asize = vdev_psize_to_asize(vd, psize); + uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 88ee4ea9f458..48900d14e923 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -1646,6 +1647,10 @@ spa_destroy_aux_threads(spa_t *spa) zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } + if (spa->spa_raidz_expand_zthr != NULL) { + zthr_destroy(spa->spa_raidz_expand_zthr); + spa->spa_raidz_expand_zthr = NULL; + } } /* @@ -1804,6 
+1809,8 @@ spa_unload(spa_t *spa) spa->spa_compatibility = NULL; } + spa->spa_raidz_expand = NULL; + spa_config_exit(spa, SCL_ALL, spa); } @@ -2942,6 +2949,7 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); @@ -3696,6 +3704,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != 0) { + spa_load_note(spa, "uberblock raidz_reflow_info: " + "state=%u offset=%llu", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + } /* @@ -5016,6 +5030,15 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); + /* + * Before we do any zio_write's, complete the raidz expansion + * scratch space copying, if necessary. + */ + if (RRSS_GET_STATE(&spa->spa_uberblock) != + RRSS_SCRATCH_NOT_IN_USE) { + vdev_raidz_reflow_copy_scratch(spa); + } + /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. 
@@ -6855,8 +6878,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; - int newvd_isspare; + int newvd_isspare = B_FALSE; int error; + boolean_t raidz = B_FALSE; ASSERT(spa_writeable(spa)); @@ -6886,16 +6910,31 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, ZFS_ERR_REBUILD_IN_PROGRESS)); } - if (spa->spa_vdev_removal != NULL) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (spa->spa_vdev_removal != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_DEVRM_IN_PROGRESS)); + } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (!oldvd->vdev_ops->vdev_op_leaf) + if (oldvd->vdev_ops == &vdev_raidz_ops) { + raidz = B_TRUE; + /* + * Can't expand a raidz while prior expand is in progress. + */ + if (spa->spa_raidz_expand != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + } + } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } - pvd = oldvd->vdev_parent; + if (raidz) + pvd = oldvd; + else + pvd = oldvd->vdev_parent; if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH) != 0) @@ -6951,6 +6990,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_raidz_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -6990,7 +7030,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. */ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + vdev_t *min_vdev = raidz ? 
oldvd->vdev_child[0] : oldvd; + if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -7000,32 +7041,57 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* + * RAIDZ-expansion-specific checks. + */ + if (raidz && vdev_raidz_attach_check(newvd) != 0) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + + if (raidz) { + /* + * Note: oldvdpath is freed by spa_strfree(), but + * kmem_asprintf() is freed by kmem_strfree(), so we have to + * move it to a spa_strdup-ed string. + */ + char *tmp = kmem_asprintf("raidz%u-%u", + (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); + oldvdpath = spa_strdup(tmp); + kmem_strfree(tmp); + } else { + oldvdpath = spa_strdup(oldvd->vdev_path); + } + newvdpath = spa_strdup(newvd->vdev_path); + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); - (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, - "%s/%s", newvd->vdev_path, "old"); + (void) sprintf(oldvd->vdev_path, "%s/old", + newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } + spa_strfree(oldvdpath); + oldvdpath = spa_strdup(oldvd->vdev_path); } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. 
*/ - if (pvd->vdev_ops != pvops) + if (!raidz && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + } ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. @@ -7053,41 +7119,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, dtl_max_txg - TXG_INITIAL); + if (raidz) { + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } + vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_wait(tvd); - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; + dtl_max_txg = spa_vdev_config_enter(spa); - /* - * Mark newvd's DTL dirty in this txg. - */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); + tvd->vdev_rz_expanding = B_TRUE; - /* - * Schedule the resilver or rebuild to restart in the future. We do - * this to ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. 
- */ - if (rebuild) { - newvd->vdev_rebuild_txg = txg; + vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); + vdev_config_dirty(tvd); - vdev_rebuild(tvd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + dtl_max_txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, + newvd, tx); + dmu_tx_commit(tx); } else { - newvd->vdev_resilver_txg = txg; + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { - vdev_defer_resilver(newvd); + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } + + newvd_isspare = newvd->vdev_isspare; + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); + + /* + * Schedule the resilver or rebuild to restart in the future. + * We do this to ensure that dmu_sync-ed blocks have been + * stitched into the respective datasets. + */ + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); } else { - dsl_scan_restart_resilver(spa->spa_dsl_pool, - dtl_max_txg); + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, + SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } } } @@ -7412,7 +7503,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || - vd->vdev_top->vdev_removing)) { + vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && @@ -7534,7 +7625,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, * which has completed but the thread is not exited. 
*/ if (cmd_type == POOL_TRIM_START && - (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { + (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && @@ -8437,6 +8529,10 @@ spa_async_suspend(spa_t *spa) if (condense_thread != NULL) zthr_cancel(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_cancel(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); @@ -8463,6 +8559,10 @@ spa_async_resume(spa_t *spa) if (condense_thread != NULL) zthr_resume(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_resume(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); @@ -9268,6 +9368,28 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) != NULL) vdev_sync(vd, txg); + if (pass == 1) { + /* + * dsl_pool_sync() -> dp_sync_tasks may have dirtied + * the config. If that happens, we don't want this + * txg to be able to be a no-op, so be sure to sync + * the config to the MOS before checking for no-op + * txg below. + * + * Note that when the config is dirty, it will + * be written to the MOS (i.e. the MOS will be + * dirtied) every time we call spa_sync_config_object() + * in this txg. Therefore we can't call this after + * dsl_pool_sync() every pass, because it would + * prevent us from converging, since we'd dirty + * the MOS every pass. + * + * Sync tasks can only be processed in pass 1, so + * there's no need to do this in later passes. 
+ */ + spa_sync_config_object(spa, tx); + } + /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock @@ -9927,7 +10049,8 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: - if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + *in_progress = vdev_rebuild_active(spa->spa_root_vdev); + if (*in_progress) break; zfs_fallthrough; case ZPOOL_WAIT_SCRUB: @@ -9942,6 +10065,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } + case ZPOOL_WAIT_RAIDZ_EXPAND: + { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); + break; + } default: panic("unrecognized value for activity %d", activity); } diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index b588f7041e5c..1efff47f87a0 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -465,6 +465,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + if (spa->spa_raidz_expand != NULL) + return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 30551feb6322..f688ee103db1 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include "zfs_prop.h" @@ -305,13 +306,13 @@ vdev_derive_alloc_bias(const char *bias) * all children. This is what's used by anything other than RAID-Z. 
*/ uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) +vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } @@ -924,6 +925,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); + vd->vdev_rz_expanding = nvlist_exists(nv, + ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } @@ -1659,6 +1662,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; + vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", + vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { @@ -1880,17 +1885,19 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) } /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. + * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) + * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE + * changed, this algorithm can not change, otherwise it would inconsistently + * account for existing bp's. We also hard-code txg 0 for the same reason + * (expanded RAIDZ vdevs can use different asize for different birth txg's). 
*/ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> + SPA_MINBLOCKSHIFT); } } @@ -3196,32 +3203,43 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; + } else { + mutex_enter(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) { + /* leaf vdevs only */ + continue; + } + if (t == DTL_PARTIAL) { + /* i.e. non-zero */ + minref = 1; + } else if (vdev_get_nparity(vd) != 0) { + /* RAIDZ, DRAID */ + minref = vdev_get_nparity(vd) + 1; + } else { + /* any kind of mirror */ + minref = vd->vdev_children; + } + space_reftree_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_reftree_generate_map(&reftree, + vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); + } + mutex_exit(&vd->vdev_dtl_lock); } - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. 
non-zero */ - else if (vdev_get_nparity(vd) != 0) - minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { + raidz_dtl_reassessed(vd); } - mutex_exit(&vd->vdev_dtl_lock); } /* @@ -3596,6 +3614,12 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + if (vd->vdev_ops == &vdev_raidz_ops) { + error = vdev_raidz_load(vd); + if (error != 0) + return (error); + } + /* * On spa_load path, grab the allocation bias from our zap */ @@ -3973,10 +3997,22 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Return the amount of space that should be (or was) allocated for the given + * psize (compressed block size) in the given TXG. Note that for expanded + * RAIDZ vdevs, the size allocated for older BP's may be larger. See + * vdev_raidz_asize(). 
+ */ +uint64_t +vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); +} + uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { - return (vd->vdev_ops->vdev_op_asize(vd, psize)); + return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* @@ -4142,9 +4178,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; @@ -5424,7 +5457,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + if ((vd->vdev_spa->spa_raidz_expand == NULL || + vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && + (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 307e2353d020..eebbf440fdb4 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -577,8 +577,9 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc, * i.e. vdev_draid_psize_to_asize(). 
*/ static uint64_t -vdev_draid_asize(vdev_t *vd, uint64_t psize) +vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { + (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_ashift; @@ -960,7 +961,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t io_size = abd_size; - uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t io_asize = vdev_draid_asize(vd, io_size, 0); uint64_t group = vdev_draid_offset_to_group(vd, io_offset); uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); @@ -1025,15 +1026,10 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, ASSERT3U(vdc->vdc_nparity, >, 0); - raidz_row_t *rr; - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); - rr->rr_cols = groupwidth; + raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth); rr->rr_scols = groupwidth; rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = vdc->vdc_nparity; - rr->rr_abd_empty = NULL; #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; @@ -1053,14 +1049,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); rc->rc_offset = physical_offset; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (q == 0 && i >= bc) rc->rc_size = 0; @@ -1129,7 +1117,7 @@ vdev_draid_map_alloc(zio_t *zio) if (size < abd_size) { vdev_t *vd = zio->io_vd; - io_offset += vdev_draid_asize(vd, size); + io_offset += vdev_draid_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; @@ -1151,7 +1139,6 @@ vdev_draid_map_alloc(zio_t *zio) rm->rm_row[0] = rr[0]; if (nrows == 2) rm->rm_row[1] = rr[1]; - 
return (rm); } @@ -1783,7 +1770,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = vdev_draid_asize(vd, psize); + uint64_t asize = vdev_draid_asize(vd, psize, 0); if (phys_birth == TXG_UNKNOWN) { /* @@ -1840,7 +1827,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_draid_asize(vd, rr->rr_size); + vdev_draid_asize(vd, rr->rr_size, 0); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; @@ -2038,6 +2025,8 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) } } +extern const zio_vsd_ops_t vdev_raidz_vsd_ops; + /* * Start an IO operation to a dRAID vdev. */ diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index ffdcef1972c3..5aaef1a69986 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -48,7 +48,8 @@ static boolean_t vdev_initialize_should_stop(vdev_t *vd) { return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } static void @@ -67,7 +68,8 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; @@ -631,6 +633,7 @@ vdev_initialize(vdev_t *vd) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); 
vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, @@ -791,13 +794,14 @@ vdev_initialize_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_action_time = timestamp; - if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_initialize_load(vd)); } else if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_initialize_thread == NULL) { vdev_initialize(vd); } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index a5c76808f2d2..db38001817ae 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -142,6 +142,7 @@ #include #include #include +#include #include #include #include @@ -423,6 +424,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } + + pool_raidz_expand_stat_t pres; + if (spa_raidz_expand_get_stats(spa, &pres) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, + sizeof (pres) / sizeof (uint64_t)); + } } static void @@ -1494,7 +1502,8 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) } struct ubl_cbdata { - uberblock_t *ubl_ubbest; /* Best uberblock */ + uberblock_t ubl_latest; /* Most recent uberblock */ + uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */ vdev_t *ubl_vd; /* vdev associated with the above */ }; @@ -1511,6 +1520,9 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); + if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) { + cbp->ubl_latest = *ub; + } if (ub->ub_txg <= 
spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* @@ -1568,10 +1580,10 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) ASSERT(config); memset(ub, 0, sizeof (uberblock_t)); + memset(&cb, 0, sizeof (cb)); *config = NULL; cb.ubl_ubbest = ub; - cb.ubl_vd = NULL; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); @@ -1588,6 +1600,22 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != + cb.ubl_latest.ub_raidz_reflow_info) { + vdev_dbgmsg(cb.ubl_vd, + "spa=%s best uberblock (txg=%llu info=0x%llx) " + "has different raidz_reflow_info than latest " + "uberblock (txg=%llu info=0x%llx)", + spa->spa_name, + (u_longlong_t)ub->ub_txg, + (u_longlong_t)ub->ub_raidz_reflow_info, + (u_longlong_t)cb.ubl_latest.ub_txg, + (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); + memset(ub, 0, sizeof (uberblock_t)); + spa_config_exit(spa, SCL_ALL, FTAG); + return; + } + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " @@ -1709,8 +1737,23 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, vd->vdev_copy_uberblocks = B_FALSE; } + /* + * We chose a slot based on the txg. If this uberblock has a special + * RAIDZ expansion state, then it is essentially an update of the + * current uberblock (it has the same txg). However, the current + * state is committed, so we want to write it to a different slot. If + * we overwrote the same slot, and we lose power during the uberblock + * write, and the disk does not do single-sector overwrites + * atomically (even though it is required to - i.e. we should see + * either the old or the new uberblock), then we could lose this + * txg's uberblock. 
Rewinding to the previous txg's uberblock may not + * be possible because RAIDZ expansion may have already overwritten + * some of the data, so we need the progress indicator in the + * uberblock. + */ int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0; - int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); + int n = (ub->ub_txg - RRSS_GET_STATE(ub)) % + (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1727,7 +1770,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, } /* Sync the uberblocks to all vdevs in svd[] */ -static int +int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 14b98a76b84f..c3db114d6b62 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -27,15 +27,21 @@ #include #include +#include +#include #include +#include #include #include +#include #include +#include #include #include #include #include #include +#include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -135,6 +141,26 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } +/* + * For testing only: logical offset at which to pause the raidz expansion. + * (accessed by ZTS and ztest) + */ +#ifdef _KERNEL +static +#endif /* _KERNEL */ +unsigned long raidz_expand_max_offset_pause = 0; + +/* + * Maximum amount of copy io's outstanding at once. + */ +static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; + +/* + * Apply raidz map abds aggregation if the number of rows in the map is equal + * or greater than the value below. 
+ */ +static unsigned long raidz_io_aggregate_rows = 4; + static void vdev_raidz_row_free(raidz_row_t *rr) { @@ -159,6 +185,17 @@ vdev_raidz_map_free(raidz_map_t *rm) for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); + if (rm->rm_nphys_cols) { + for (int i = 0; i < rm->rm_nphys_cols; i++) { + if (rm->rm_phys_col[i].rc_abd != NULL) + abd_free(rm->rm_phys_col[i].rc_abd); + } + + kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * + rm->rm_nphys_cols); + } + + ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } @@ -170,10 +207,37 @@ vdev_raidz_map_free_vsd(zio_t *zio) vdev_raidz_map_free(rm); } +static int +vdev_raidz_reflow_compare(const void *x1, const void *x2) +{ + const reflow_node_t *l = x1; + const reflow_node_t *r = x2; + + return (TREE_CMP(l->re_txg, r->re_txg)); +} + const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; +raidz_row_t * +vdev_raidz_row_alloc(int cols) +{ + raidz_row_t *rr = + kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); + + rr->rr_cols = cols; + rr->rr_scols = cols; + + for (int c = 0; c < cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_shadow_devidx = INT_MAX; + rc->rc_shadow_offset = UINT64_MAX; + rc->rc_allow_repair = 1; + } + return (rr); +} + static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { @@ -343,18 +407,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } ASSERT3U(acols, <=, scols); - - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rr = vdev_raidz_row_alloc(scols); rm->rm_row[0] = rr; - rr->rr_cols = acols; - rr->rr_scols = scols; rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; @@ -372,18 +429,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t 
dcols, } rc->rc_devidx = col; rc->rc_offset = coff; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; - if (c >= acols) - rc->rc_size = 0; - else if (c < bc) + if (c < bc) rc->rc_size = (q + 1) << ashift; else rc->rc_size = q << ashift; @@ -425,7 +472,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; - if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } @@ -435,7 +481,379 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } else { vdev_raidz_map_alloc_read(zio, rm); } + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + +/* + * Everything before reflow_offset_synced should have been moved to the new + * location (read and write completed). However, this may not yet be reflected + * in the on-disk format (e.g. raidz_reflow_sync() has been called but the + * uberblock has not yet been written). If reflow is not in progress, + * reflow_offset_synced should be UINT64_MAX. For each row, if the row is + * entirely before reflow_offset_synced, it will come from the new location. + * Otherwise this row will come from the old location. Therefore, rows that + * straddle the reflow_offset_synced will come from the old location. + * + * For writes, reflow_offset_next is the next offset to copy. If a sector has + * been copied, but not yet reflected in the on-disk progress + * (reflow_offset_synced), it will also be written to the new (already copied) + * offset. 
+ */ +noinline raidz_map_t * +vdev_raidz_map_alloc_expanded(zio_t *zio, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset_synced, + uint64_t reflow_offset_next, boolean_t use_scratch) +{ + abd_t *abd = zio->io_abd; + uint64_t offset = zio->io_offset; + uint64_t size = zio->io_size; + + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + uint64_t q, r, bc, asize, tot; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + rm->rm_skipstart = bc; + asize = 0; + +#if 1 + zfs_dbgmsg("rm=%px s=%d q=%d r=%d bc=%d nrows=%d cols=%d rfo=%llx", + rm, (int)s, (int)q, (int)r, (int)bc, (int)rows, (int)cols, + (long long)reflow_offset_synced); +#endif + + for (uint64_t row = 0; row < rows; row++) { + boolean_t row_use_scratch = B_FALSE; + raidz_row_t *rr = vdev_raidz_row_alloc(cols); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. 
*/ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and the copying has + * not yet completed for any part of this row, then use the + * old location of this row. Note that reflow_offset_synced + * reflects the i/o that's been completed, because it's + * updated by a synctask, after zio_wait(spa_txg_zio[]). + * This is sufficient for our check, even if that progress + * has not yet been recorded to disk (reflected in + * spa_ubsync). Also note that we consider the last row to + * be "full width" (`cols`-wide rather than `bc`-wide) for + * this calculation. This causes a tiny bit of unnecessary + * double-writes but is safe and simpler to calculate. + */ + int row_phys_cols = physical_cols; + if (b + cols > reflow_offset_synced >> ashift) + row_phys_cols--; + else if (use_scratch) + row_use_scratch = B_TRUE; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * Note, rr_cols is the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". 
+ */ + rr->rr_firstdatacol = nparity; +#ifdef ZFS_DEBUG + /* + * note: rr_size is PSIZE, not ASIZE + */ + rr->rr_offset = b << ashift; + rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; +#endif + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_devidx = child_id; + rc->rc_offset = child_offset; + + /* + * Get this from the scratch space if appropriate. + * This only happens if we crashed in the middle of + * raidz_reflow_scratch_sync() (while it's running, + * the rangelock prevents us from doing concurrent + * io), and even then only during zpool import or + * when the pool is imported readonly. + */ + if (row_use_scratch) + rc->rc_offset -= VDEV_BOOT_SIZE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rc->rc_size = 1ULL << ashift; + + /* + * Parity sectors' rc_abd's are set + * below after determining if this is an + * aggregation. + */ + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end of the block (even including + * skip sectors). This sector is part of the + * map so that we have full rows for p/q parity + * generation. 
+ */ + rc->rc_size = 0; + rc->rc_abd = NULL; + } else { + /* XXX ASCII art diagram here */ + /* "data column" (col excluding parity) */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } +#if 1 + zfs_dbgmsg("rm=%px row=%d c=%d dc=%d off=%u " + "devidx=%u offset=%llu rpc=%u", + rm, (int)row, (int)c, (int)dc, (int)off, + (int)child_id, (long long)child_offset, + (int)row_phys_cols); +#endif + rc->rc_size = 1ULL << ashift; + rc->rc_abd = abd_get_offset_struct( + &rc->rc_abdstruct, abd, off << ashift, + rc->rc_size); + } + + if (rc->rc_size == 0) + continue; + + /* + * If any part of this row is in both old and new + * locations, the primary location is the old + * location. If this sector was already copied to the + * new location, we need to also write to the new, + * "shadow" location. + * + * Note, `row_phys_cols != physical_cols` indicates + * that the primary location is the old location. + * `b+c < reflow_offset_next` indicates that the copy + * to the new location has been initiated. We know + * that the copy has completed because we have the + * rangelock, which is held exclusively while the + * copy is in progress. + */ + if (row_use_scratch || + (row_phys_cols != physical_cols && + b + c < reflow_offset_next >> ashift)) { + rc->rc_shadow_devidx = (b + c) % physical_cols; + rc->rc_shadow_offset = + ((b + c) / physical_cols) << ashift; + if (row_use_scratch) + rc->rc_shadow_offset -= VDEV_BOOT_SIZE; + + zfs_dbgmsg("rm=%px row=%d b+c=%llu " + "shadow_devidx=%u shadow_offset=%llu", + rm, (int)row, (long long)(b + c), + (int)rc->rc_shadow_devidx, + (long long)rc->rc_shadow_offset); + } + + asize += rc->rc_size; + } + + /* + * If all data stored spans all columns, there's a danger that + * parity will always be on the same device and, since parity + * isn't read during normal operation, that that device's I/O + * bandwidth won't be used effectively. 
We therefore switch the + * parity every 1MB. + * + * ... at least that was, ostensibly, the theory. As a + * practical matter unless we juggle the parity between all + * devices evenly, we won't see any benefit. Further, + * occasional writes that aren't a multiple of the LCM of the + * number of children and the minimum stripe width are + * sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk + * format requirement that we need to support for all eternity, + * but only for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for + * padding we must make sure to note this swap. We will never + * intend to skip the first column since at least one data and + * one parity column must appear in each row. + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + + int devidx0 = rr->rr_col[0].rc_devidx; + uint64_t offset0 = rr->rr_col[0].rc_offset; + int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; + uint64_t shadow_offset0 = + rr->rr_col[0].rc_shadow_offset; + + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[0].rc_shadow_devidx = + rr->rr_col[1].rc_shadow_devidx; + rr->rr_col[0].rc_shadow_offset = + rr->rr_col[1].rc_shadow_offset; + + rr->rr_col[1].rc_devidx = devidx0; + rr->rr_col[1].rc_offset = offset0; + rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; + rr->rr_col[1].rc_shadow_offset = shadow_offset0; + } + } + ASSERT3U(asize, ==, tot << ashift); + + /* + * Determine if the block is contiguous, in which case we can use + * an aggregation. + */ + if (rows >= raidz_io_aggregate_rows) { + rm->rm_nphys_cols = physical_cols; + rm->rm_phys_col = + kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, + KM_SLEEP); + + /* + * Determine the aggregate io's offset and size, and check + * that the io is contiguous. 
+ */ + for (int i = 0; + i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + + if (rc->rc_size == 0) + continue; + + if (prc->rc_size == 0) { + ASSERT0(prc->rc_offset); + prc->rc_offset = rc->rc_offset; + } else if (prc->rc_offset + prc->rc_size != + rc->rc_offset) { + /* + * This block is not contiguous and + * therefore can't be aggregated. + * This is expected to be rare, so + * the cost of allocating and then + * freeing rm_phys_col is not + * significant. + */ + kmem_free(rm->rm_phys_col, + sizeof (raidz_col_t) * + rm->rm_nphys_cols); + rm->rm_phys_col = NULL; + rm->rm_nphys_cols = 0; + break; + } + prc->rc_size += rc->rc_size; + } + } + } + if (rm->rm_phys_col != NULL) { + /* + * Allocate aggregate ABD's. + */ + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + + prc->rc_devidx = i; + + if (prc->rc_size == 0) + continue; + + prc->rc_abd = + abd_alloc_linear(rm->rm_phys_col[i].rc_size, + B_FALSE); + } + /* + * Point the parity abd's into the aggregate abd's. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_abd = + abd_get_offset_struct(&rc->rc_abdstruct, + prc->rc_abd, + rc->rc_offset - prc->rc_offset, + rc->rc_size); + } + } + } else { + /* + * Allocate new abd's for the parity sectors. 
+ */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = + abd_alloc_linear(rc->rc_size, + B_TRUE); + } + } + } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -618,7 +1036,15 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { - ASSERT3U(rr->rr_cols, !=, 0); + if (rr->rr_cols == 0) { + /* + * We are handling this block one row at a time (because + * this block has a different logical vs physical width, + * due to RAIDZ expansion), and this is a pad-only row, + * which has no parity. + */ + return; + } /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) @@ -770,6 +1196,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) int x = tgts[0]; abd_t *dst, *src; + zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", + rr, x); + ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); @@ -802,6 +1231,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) int c, exp; abd_t *dst, *src; + zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", + rr, x); + ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); @@ -848,6 +1280,9 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) int y = tgts[1]; abd_t *xd, *yd; + zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", + rr, x, y); + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); @@ -1287,6 +1722,8 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; + zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", + rr, ntgts); uint8_t *p, *pp; size_t psize; uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; @@ -1427,10 +1864,20 @@ 
vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; + zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", + rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, + (int)rr->rr_missingparity); + nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { + zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " + "offset=%llx error=%u)", + rr, c, + (int)rr->rr_col[c].rc_devidx, + (long long)rr->rr_col[c].rc_offset, + (int)rr->rr_col[c].rc_error); if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; @@ -1537,8 +1984,15 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *physical_ashift, cvd->vdev_physical_ashift); } - *asize *= vd->vdev_children; - *max_asize *= vd->vdev_children; + if (vd->vdev_rz_expanding) { + *asize *= vd->vdev_children - 1; + *max_asize *= vd->vdev_children - 1; + + vd->vdev_min_asize = *asize; + } else { + *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; + } if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; @@ -1557,19 +2011,71 @@ vdev_raidz_close(vdev_t *vd) } } +/* + * Return the logical width to use, given the txg in which the allocation + * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the + * BP was allocated. Remapped BP's (that were relocated due to device + * removal, see remap_blkptr_cb()), will have a more recent + * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can + * ignore these because they can't be on RAIDZ (device removal doesn't + * support RAIDZ). 
+ */ +static uint64_t +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) +{ + reflow_node_t lookup = { + .re_txg = txg, + }; + avl_index_t where; + + uint64_t width; + mutex_enter(&vdrz->vd_expand_lock); + reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); + if (re != NULL) { + width = re->re_logical_width; + } else { + re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); + if (re != NULL) + width = re->re_logical_width; + else + width = vdrz->vd_original_width; + } + mutex_exit(&vdrz->vd_expand_lock); + return (width); +} + +/* + * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated + * more space due to the lower data-to-parity ratio. In this case it's + * important to pass in the correct txg. Note that vdev_gang_header_asize() + * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, + * regardless of txg. This is assured because for a single data sector, we + * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 
+ */ static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) +vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_logical_width; + uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; + cols = vdev_raidz_get_logical_width(vdrz, txg); + asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; +#ifdef ZFS_DEBUG + uint64_t asize_new = ((psize - 1) >> ashift) + 1; + uint64_t ncols_new = vdrz->vd_physical_width; + asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / + (ncols_new - nparity)); + asize_new = roundup(asize_new, nparity + 1) << ashift; + VERIFY3U(asize_new, <=, asize); +#endif + return (asize); } @@ -1596,21 +2102,37 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +vdev_raidz_shadow_child_done(zio_t *zio) { -#ifdef ZFS_DEBUG - vdev_t *tvd = vd->vdev_top; + raidz_col_t *rc = zio->io_private; + rc->rc_shadow_error = zio->io_error; +} + +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) +{ + (void) rm; +#ifdef ZFS_DEBUG range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(vd, rr->rr_size); + vdev_raidz_asize(zio->io_vd, rr->rr_size, + BP_PHYSICAL_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; - vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); + if (vdev_xlate_is_empty(&physical_rs)) { + /* + * If we are in the middle of expansion, the + * physical->logical mapping is changing so vdev_xlate() + * can't give us a reliable answer. 
+ */ + return; + } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1621,7 +2143,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + - rc->rc_size + (1 << tvd->vdev_ashift)); + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } @@ -1629,7 +2151,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) } static void -vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; @@ -1641,31 +2163,67 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ - vdev_raidz_io_verify(vd, rr, c); + vdev_raidz_io_verify(zio, rm, rr, c); - if (rc->rc_size > 0) { - ASSERT3P(rc->rc_abd, !=, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, - abd_get_size(rc->rc_abd), zio->io_type, - zio->io_priority, 0, vdev_raidz_child_done, rc)); - } else { - /* - * Generate optional write for skip sector to improve - * aggregation contiguity. 
- */ - ASSERT3P(rc->rc_abd, ==, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, NULL, 1ULL << ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, - NULL)); + if (rc->rc_size == 0) + continue; + + ASSERT(rc->rc_offset + rc->rc_size < + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + ASSERT3P(rc->rc_abd, !=, NULL); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), zio->io_type, + zio->io_priority, 0, vdev_raidz_child_done, rc)); + + if (rc->rc_shadow_devidx != INT_MAX) { + vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + + ASSERT(rc->rc_shadow_offset + abd_get_size(rc->rc_abd) < + cvd2->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, + rc->rc_shadow_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), + zio->io_type, zio->io_priority, 0, + vdev_raidz_shadow_child_done, rc)); } } } +/* + * Generate optional I/Os for skip sectors to improve aggregation contiguity. + * This only works for vdev_raidz_map_alloc() (not _expanded()). 
+ */ +static void +raidz_start_skip_writes(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + raidz_map_t *rm = zio->io_vsd; + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + for (int c = 0; c < rr->rr_scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (rc->rc_size != 0) + continue; + ASSERT3P(rc->rc_abd, ==, NULL); + + ASSERT(rc->rc_offset + rc->rc_size < + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } +} + + static void -vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; @@ -1697,7 +2255,8 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_skipped = 1; continue; } - if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + if (forceparity || + c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -1707,47 +2266,163 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) } } -/* - * Start an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Generate the parity data - * 2. Create child zio write operations to each column's vdev, for both - * data and parity. - * 3. If the column skips any sectors for padding, create optional dummy - * write zio children for those areas to improve aggregation continuity. - * - For read operations: - * 1. Create child zio read operations to each data column's vdev to read - * the range of data required for zio. - * 2. 
If this is a scrub or resilver operation, or if any of the data - * vdevs have had errors, then create zio read operations to the parity - * columns' VDevs as well. - */ static void -vdev_raidz_io_start(zio_t *zio) +vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) { vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; - vdev_raidz_t *vdrz = vd->vdev_tsd; - raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, - vdrz->vd_logical_width, vdrz->vd_nparity); - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + if (prc->rc_size == 0) + continue; + ASSERT3U(prc->rc_devidx, ==, i); + vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { + prc->rc_error = SET_ERROR(ENXIO); + prc->rc_tried = 1; /* don't even try */ + prc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + prc->rc_error = SET_ERROR(ESTALE); + prc->rc_skipped = 1; + continue; + } + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + prc->rc_offset, prc->rc_abd, prc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, prc)); + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) +{ /* - * Until raidz expansion is implemented all maps for a raidz vdev - * contain a single row. + * If there are multiple rows, we will be hitting + * all disks, so go ahead and read the parity so + * that we are reading in decent size chunks. */ - ASSERT3U(rm->rm_nrows, ==, 1); - raidz_row_t *rr = rm->rm_row[0]; + boolean_t forceparity = rm->rm_nrows > 1; + + if (rm->rm_phys_col) { + vdev_raidz_io_start_read_phys_cols(zio, rm); + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_start_read_row(zio, rr, forceparity); + } + } +} + +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. 
Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ +static void +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, + BP_PHYSICAL_BIRTH(zio->io_bp)); + zfs_dbgmsg("zio=%px bm=%llu/%llu/%llu/%llu phys_birth=%llu " + "logical_width=%llu", + zio, + (long long)zio->io_bookmark.zb_objset, + (long long)zio->io_bookmark.zb_object, + (long long)zio->io_bookmark.zb_level, + (long long)zio->io_bookmark.zb_blkid, + (long long)BP_PHYSICAL_BIRTH(zio->io_bp), + (long long)logical_width); + if (logical_width != vdrz->vd_physical_width) { + zfs_locked_range_t *lr = NULL; + uint64_t synced_offset = UINT64_MAX; + uint64_t next_offset = UINT64_MAX; + boolean_t use_scratch = B_FALSE; + /* + * Note: when the expansion is completing, we set + * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) + * in a later txg than when we last update spa_ubsync's state + * (see the end of spa_raidz_expand_cb()). Therefore we may + * see vre_state!=SCANNING before + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected + * on disk, but the copying progress has been synced to disk + * (and reflected in spa_ubsync). In this case it's fine to + * treat the expansion as completed, since if we crash there's + * no additional copying to do. 
+ */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, + &vdrz->vn_vre); + lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, + zio->io_offset, zio->io_size, RL_READER); + use_scratch = + (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == + RRSS_SCRATCH_VALID); + synced_offset = + RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); + next_offset = vdrz->vn_vre.vre_offset; + /* + * If we haven't resumed expanding since importing the + * pool, vre_offset won't have been set yet. In + * this case the next offset to be copied is the same + * as what was synced. + */ + if (next_offset == UINT64_MAX) { + next_offset = synced_offset; + } + } + zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=%lld " + "next_offset=%lld use_scratch=%u", + zio, + zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", + (long long)zio->io_offset, + (long long)synced_offset, + (long long)next_offset, + use_scratch); + + rm = vdev_raidz_map_alloc_expanded(zio, + tvd->vdev_ashift, vdrz->vd_physical_width, + logical_width, vdrz->vd_nparity, + synced_offset, next_offset, use_scratch); + rm->rm_lr = lr; + } else { + rm = vdev_raidz_map_alloc(zio, + tvd->vdev_ashift, logical_width, vdrz->vd_nparity); + } + rm->rm_original_width = vdrz->vd_original_width; + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_write(zio, rm->rm_row[i]); + } + + if (logical_width == vdrz->vd_physical_width) { + raidz_start_skip_writes(zio); + } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); - vdev_raidz_io_start_read(zio, rr); + vdev_raidz_io_start_read(zio, rm); } zio_execute(zio); @@ -1847,6 +2522,9 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { + zfs_dbgmsg("raidz_parity_verify found error on " + "col=%u devidx=%u", + c, (int)rc->rc_devidx); 
vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1862,8 +2540,10 @@ vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rr->rr_cols; c++) + for (int c = 0; c < rr->rr_cols; c++) { error = zio_worst_error(error, rr->rr_col[c].rc_error); + error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); + } return (error); } @@ -1905,6 +2585,13 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) * Note that we also regenerate parity when resilvering so we * can write it out to failed devices later. */ +#if 1 + zfs_dbgmsg("parity_errors=%u parity_untried=%u data_errors=%u " + "verifying=%s", + parity_errors, parity_untried, data_errors, + (parity_errors + parity_untried < + rr->rr_firstdatacol - data_errors) ? "yes" : "no"); +#endif if (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors || (zio->io_flags & ZIO_FLAG_RESILVER)) { @@ -1929,6 +2616,12 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) continue; } + zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " + "offset=%llx", + zio, c, + rc->rc_devidx, + (long long)rc->rc_offset); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, @@ -1938,6 +2631,48 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + /* + * Scrub or resilver i/o's: overwrite any shadow locations with the + * good data. This ensures that if we've already copied this sector, + * it will be corrected if it was damaged. This writes more than is + * necessary, but since expansion is paused during scrub/resilver, at + * most a single row will have a shadow location. 
+ */ + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + + if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; + + zfs_dbgmsg("zio=%px overwriting c=%u shadow_devidx=%u " + "shadow_offset=%llx", + zio, c, + rc->rc_shadow_devidx, + (long long)rc->rc_shadow_offset); + + /* + * Note: We don't want to update the repair stats + * because that would incorrectly indicate that there + * was bad data to repair, which we aren't sure about. + * By clearing the SCAN_THREAD flag, we prevent this + * from happening, despite having the REPAIR flag set. + * We need to set SELF_HEAL so that this i/o can't be + * bypassed by zio_vdev_io_start(). + */ + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, + rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL); + cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; + zio_nowait(cio); + } + } } static void @@ -1956,6 +2691,51 @@ raidz_restore_orig_data(raidz_map_t *rm) } } +/* + * Treating logical child i as failed, return TRUE if the given column should + * be treated as failed. The idea of logical children allows us to imagine + * that a disk silently failed before a RAIDZ expansion (reads from this disk + * succeed but return the wrong data). Since the expansion doesn't verify + * checksums, the incorrect data will be moved to new locations spread among + * the children (going diagonally across them). + * + * Higher "logical child failures" (values of `i`) indicate these + * "pre-expansion failures". 
The first physical_width values imagine that a + * current child failed; the next physical_width-1 values imagine that a + * child failed before the most recent expansion; the next physical_width-2 + * values imagine a child failed in the expansion before that, etc. + */ +static boolean_t +raidz_simulate_failure(int physical_width, int original_width, int ashift, + int i, raidz_col_t *rc) +{ + uint64_t sector_id = + physical_width * (rc->rc_offset >> ashift) + + rc->rc_devidx; + +#if 1 + zfs_dbgmsg("raidz_simulate_failure(pw=%u lw=%u ashift=%u i=%u " + "rc_offset=%llx rc_devidx=%u sector_id=%llu", + physical_width, + original_width, + ashift, + i, + (long long)rc->rc_offset, + (int)rc->rc_devidx, + (long long)sector_id); +#endif + + for (int w = physical_width; w >= original_width; w--) { + if (i < w) { + return (sector_id % w == i); + } else { + i -= w; + } + } + ASSERT(!"invalid logical child id"); + return (B_FALSE); +} + /* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed @@ -1965,6 +2745,13 @@ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? 
+ rm->rm_original_width : physical_width; + + zfs_dbgmsg( + "raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u ntgts=%u", + zio, ltgts[0], ltgts[1], ltgts[2], ntgts); /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { @@ -1974,6 +2761,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) int dead = 0; int dead_data = 0; + zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", + r); + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); @@ -1986,7 +2776,10 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { - if (rc->rc_devidx == ltgts[lt]) { + if (raidz_simulate_failure(physical_width, + original_width, + zio->io_vd->vdev_top->vdev_ashift, + ltgts[lt], rc)) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = abd_alloc_linear( @@ -1999,13 +2792,33 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) dead++; if (c >= nparity) dead_data++; - my_tgts[t++] = c; + /* + * Note: simulating failure of a + * pre-expansion device can hit more + * than one column, in which case we + * might try to simulate more + * failures than can be + * reconstructed, which is also more + * than the size of my_tgts. This + * check prevents accessing past the + * end of my_tgts. The "dead > + * nparity" check below will fail + * this reconstruction attempt. 
+ */ + if (t < VDEV_RAIDZ_MAXPARITY) { + my_tgts[t++] = c; + zfs_dbgmsg("simulating failure " + "of col %u devidx %u", + c, (int)rc->rc_devidx); + } break; } } } if (dead > nparity) { /* reconstruction not possible */ + zfs_dbgmsg("reconstruction not possible; " + "too many failures"); raidz_restore_orig_data(rm); return (EINVAL); } @@ -2049,11 +2862,14 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) zio_checksum_verified(zio); + zfs_dbgmsg("reconstruction successful (checksum verified)"); return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum failed", + zio); return (ECKSUM); } @@ -2068,7 +2884,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) - * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - Try to increment this slot (tgts[i] += 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * @@ -2099,18 +2915,22 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * * This strategy works for dRAID but is less efficient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, - * since the raidz_map_t rows likely do not overlap reconstruction would be + * since the raidz_map_t rows likely do not overlap, reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. + * + * Returns 0 on success, ECKSUM on failure. 
*/ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? + rm->rm_original_width : physical_width; - /* Check if there's enough data to attempt reconstrution. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; @@ -2128,8 +2948,16 @@ vdev_raidz_combrec(zio_t *zio) int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ - /* Determine number of logical children, n */ - int n = zio->io_vd->vdev_children; + + /* + * Determine number of logical children, n. See comment + * above raidz_simulate_failure(). + */ + int n = 0; + for (int w = physical_width; + w >= original_width; w--) { + n += w; + } ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); @@ -2160,6 +2988,10 @@ vdev_raidz_combrec(zio_t *zio) if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); + zfs_dbgmsg("reconstruction failed " + "for num_failures=%u; tried all " + "combinations", + num_failures); break; } @@ -2171,7 +3003,7 @@ vdev_raidz_combrec(zio_t *zio) * Try the next combination. 
*/ if (ltgts[t] != ltgts[t + 1]) - break; + break; // found next combination /* * Otherwise, reset this tgt to the minimum, @@ -2186,7 +3018,7 @@ vdev_raidz_combrec(zio_t *zio) break; } } - + zfs_dbgmsg("reconstruction failed for all num_failures"); return (ECKSUM); } @@ -2211,7 +3043,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { - int total_errors = 0; + int normal_errors = 0; + int shadow_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); @@ -2220,24 +3053,31 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { + if (rc->rc_error != 0) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - total_errors++; + normal_errors++; + } + if (rc->rc_shadow_error != 0) { + ASSERT(rc->rc_shadow_error != ECKSUM); + shadow_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough - * columns to reconstruct the data, the I/O failed. Otherwise, - * good enough. + * columns to reconstruct the data, the I/O failed. Otherwise, good + * enough. Note that in the case of a shadow write (during raidz + * expansion), depending on if we crash, either the normal (old) or + * shadow (new) location may become the "real" version of the block, + * so both locations must have sufficient redundancy. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. 
*/ - if (total_errors > rr->rr_firstdatacol) { + if (normal_errors > rr->rr_firstdatacol || + shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } @@ -2254,7 +3094,6 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -2337,7 +3176,7 @@ vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. */ - if (rr->rr_nempty && rr->rr_abd_empty == NULL) + if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { @@ -2395,11 +3234,50 @@ vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; + zfs_dbgmsg("vdev_raidz_io_done(%px)", zio); + + ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { + if (rm->rm_phys_col) { + /* + * This is an aggregated read. Copy the data and status + * from the aggregate abd's to the individual rows. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_error = prc->rc_error; + rc->rc_tried = prc->rc_tried; + rc->rc_skipped = prc->rc_skipped; + if (c >= rr->rr_firstdatacol) { + /* + * Note: this is slightly faster + * than using abd_copy_off(). 
+ */ + char *physbuf = abd_to_buf( + prc->rc_abd); + void *physloc = physbuf + + rc->rc_offset - + prc->rc_offset; + + abd_copy_from_buf(rc->rc_abd, + physloc, rc->rc_size); + } + } + } + } + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, @@ -2446,7 +3324,54 @@ vdev_raidz_io_done(zio_t *zio) zio_vdev_io_redone(zio); return; } - + /* + * It would be too expensive to try every possible + * combination of failed sectors in every row, so + * instead we try every combination of failed current or + * past physical disk. This means that if the incorrect + * sectors were all on Nparity disks at any point in the + * past, we will find the correct data. I think that + * the only case where this is less durable than + * a non-expanded RAIDZ, is if we have a silent + * failure during expansion. In that case, one block + * could be partially in the old format and partially + * in the new format, so we'd lost some sectors + * from the old format and some from the new format. + * + * e.g. logical_width=4 physical_width=6 + * the 15 (6+5+4) possible failed disks are: + * width=6 child=0 + * width=6 child=1 + * width=6 child=2 + * width=6 child=3 + * width=6 child=4 + * width=6 child=5 + * width=5 child=0 + * width=5 child=1 + * width=5 child=2 + * width=5 child=3 + * width=5 child=4 + * width=4 child=0 + * width=4 child=1 + * width=4 child=2 + * width=4 child=3 + * And we will try every combination of Nparity of these + * failing. + * + * As a first pass, we can generate every combo, + * and try reconstructing, ignoring any known + * failures. If any row has too many known + simulated + * failures, then we bail on reconstructing with this + * number of simulated failures. As an improvement, + * we could detect the number of whole known failures + * (i.e. we have known failures on these disks for + * every row; the disks never succeeded), and + * subtract that from the max # failures to simulate. 
+ * We could go even further like the current + * combrec code, but that doesn't seem like it + * gains us very much. If we simulate a failure + * that is also a known failure, that's fine. + */ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2454,6 +3379,10 @@ vdev_raidz_io_done(zio_t *zio) } } } + if (rm->rm_lr != NULL) { + zfs_rangelock_exit(rm->rm_lr); + rm->rm_lr = NULL; + } } static void @@ -2480,6 +3409,14 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * If we're in the middle of a RAIDZ expansion, this block may be in + * the old and/or new location. For simplicity, always resilver it. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) + return (B_TRUE); + uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; @@ -2524,7 +3461,24 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); - uint64_t width = raidvd->vdev_children; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + /* + * We're in the middle of expansion, in which case the + * translation is in flux. Any answer we give may be wrong + * by the time we return, so it isn't safe for the caller to + * act on it. Therefore we say that this range isn't present + * on any children. The only consumers of this are "zpool + * initialize" and trimming, both of which are "best effort" + * anyway. 
+ */ + physical_rs->rs_start = physical_rs->rs_end = 0; + remain_rs->rs_start = remain_rs->rs_end = 0; + return; + } + + uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; @@ -2550,15 +3504,1036 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, logical_rs->rs_end - logical_rs->rs_start); } +static void +raidz_reflow_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + /* + * Ensure there are no i/os to the range that is being committed. + */ + uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); + + mutex_enter(&vre->vre_lock); + uint64_t new_offset = + MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); + /* + * We should not have committed anything that failed. + */ + VERIFY3U(vre->vre_failed_offset, >=, old_offset); + mutex_exit(&vre->vre_lock); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + old_offset, new_offset - old_offset, + RL_WRITER); + + /* + * Update the uberblock that will be written when this txg completes. 
+ */ + zfs_dbgmsg("reflow syncing txg=%llu off_pertxg=%llu failed_off=%llu", + (long long)dmu_tx_get_txg(tx), + (long long)vre->vre_offset_pertxg[txgoff], + (long long)vre->vre_failed_offset); + RAIDZ_REFLOW_SET(&spa->spa_uberblock, + RRSS_SCRATCH_NOT_IN_USE, new_offset); + vre->vre_offset_pertxg[txgoff] = 0; + zfs_rangelock_exit(lr); + + mutex_enter(&vre->vre_lock); + vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; + vre->vre_bytes_copied_pertxg[txgoff] = 0; + mutex_exit(&vre->vre_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); +} + +static void +raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + for (int i = 0; i < TXG_SIZE; i++) + VERIFY0(vre->vre_offset_pertxg[i]); + + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; + re->re_logical_width = vdrz->vd_physical_width; + mutex_enter(&vdrz->vd_expand_lock); + avl_add(&vdrz->vd_expand_txgs, re); + mutex_exit(&vdrz->vd_expand_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + + /* + * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS + * will get written (based on vd_expand_txgs). + */ + vdev_config_dirty(vd); + + /* + * Before we change vre_state, the on-disk state must reflect that we + * have completed all copying, so that vdev_raidz_io_start() can use + * vre_state to determine if the reflow is in progress. See also the + * end of spa_raidz_expand_cb(). 
+ */ + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, + raidvd->vdev_ms_count << raidvd->vdev_ms_shift); + + vre->vre_end_time = gethrestime_sec(); + vre->vre_state = DSS_FINISHED; + + uint64_t state = vre->vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t end_time = vre->vre_end_time; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time, tx)); + + spa->spa_uberblock.ub_raidz_reflow_info = 0; + + spa_history_log_internal(spa, "raidz vdev expansion completed", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)vd->vdev_id, + (unsigned long long)vd->vdev_children); + + spa->spa_raidz_expand = NULL; + raidvd->vdev_rz_expanding = B_FALSE; + + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + + spa_notify_waiters(spa); +} + +/* + * Struct for one copy zio. + */ +typedef struct raidz_reflow_arg { + vdev_raidz_expand_t *rra_vre; + zfs_locked_range_t *rra_lr; + uint64_t rra_txg; +} raidz_reflow_arg_t; + +/* + * The write of the new location is done. 
+ */ +static void +raidz_reflow_write_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + abd_free(zio->io_abd); + + zfs_dbgmsg("completed reflow offset=%llu size=%llu txg=%llu err=%u", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_lr->lr_length, + (long long)rra->rra_txg, + zio->io_error); + + mutex_enter(&vre->vre_lock); + if (zio->io_error != 0) { + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + } + ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); + vre->vre_outstanding_bytes -= zio->io_size; + if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < + vre->vre_failed_offset) { + vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += + zio->io_size; + } + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + zfs_rangelock_exit(rra->rra_lr); + + kmem_free(rra, sizeof (*rra)); + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ +static void +raidz_reflow_read_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + /* + * If the read failed, or if it was done on a vdev that is not fully + * healthy (e.g. a child that has a resilver in progress), we may not + * have the correct data. Note that it's OK if the write proceeds. + * It may write garbage but the location is otherwise unused and we + * will retry later due to vre_failed_offset. 
+ */ + if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { + zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu \ +err=%u partial_dtl_empty=%u missing_dtl_empty=%u", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_lr->lr_length, + (long long)rra->rra_txg, + zio->io_error, + vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), + vdev_dtl_empty(zio->io_vd, DTL_MISSING)); + mutex_enter(&vre->vre_lock); + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + mutex_exit(&vre->vre_lock); + } + + zio_nowait(zio_unique_parent(zio)); +} + +static void +raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, + dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (offset == 0) + return; + + mutex_enter(&vre->vre_lock); + ASSERT3U(vre->vre_offset, <=, offset); + vre->vre_offset = offset; + mutex_exit(&vre->vre_lock); + + if (vre->vre_offset_pertxg[txgoff] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, + spa, tx); + } + vre->vre_offset_pertxg[txgoff] = offset; +} + +static boolean_t +raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, + dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + int ashift = vd->vdev_top->vdev_ashift; + uint64_t offset, size; + + if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, + &offset, &size)) { + return (B_FALSE); + } + ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); + ASSERT3U(size, >=, 1 << ashift); + uint64_t length = 1 << ashift; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + uint64_t blkid = offset >> ashift; + + int old_children = vd->vdev_children - 1; + + /* + * We can only progress to the point that writes will not overlap + * with blocks whose progress has not yet been recorded on disk. + * Since partially-copied rows are still read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent + * row-wise overlap. 
+ * + * Note that even if we are skipping over a large unallocated region, + * we can't move the on-disk progress to `offset`, because concurrent + * writes/allocations could still use the currently-unallocated + * region. + */ + uint64_t ubsync_blkid = + RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; + uint64_t next_overwrite_blkid = ubsync_blkid + + ubsync_blkid / old_children - old_children; + VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); + + if (blkid >= next_overwrite_blkid) { + raidz_reflow_record_progress(vre, + next_overwrite_blkid << ashift, tx); + + zfs_dbgmsg("copying offset %llu, ubsync offset = %llu, " + "max_overwrite = %llu wait for txg %llu to sync", + (long long)offset, + (long long)ubsync_blkid << ashift, + (long long)next_overwrite_blkid << ashift, + (long long)dmu_tx_get_txg(tx)); + return (B_TRUE); + } + + range_tree_remove(rt, offset, length); + + raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); + rra->rra_vre = vre; + rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, + offset, length, RL_WRITER); + rra->rra_txg = dmu_tx_get_txg(tx); + + zfs_dbgmsg("initiating reflow write offset=%llu length=%llu", + (long long)offset, (long long)length); + + raidz_reflow_record_progress(vre, offset + length, tx); + + mutex_enter(&vre->vre_lock); + vre->vre_outstanding_bytes += length; + mutex_exit(&vre->vre_lock); + + /* + * SCL_STATE will be released when the read and write are done, + * by raidz_reflow_write_done(). 
+ */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + zio_t *pio = spa->spa_txg_zio[txgoff]; + abd_t *abd = abd_alloc_for_io(length, B_FALSE); + zio_t *write_zio = zio_vdev_child_io(pio, NULL, + vd->vdev_child[blkid % vd->vdev_children], + (blkid / vd->vdev_children) << ashift, + abd, length, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_write_done, rra); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + vd->vdev_child[blkid % old_children], + (blkid / old_children) << ashift, + abd, length, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_read_done, rra)); + + return (B_FALSE); +} + +/* + * For testing. + */ +static void +raidz_expand_pause(uint64_t progress) +{ + while (raidz_expand_max_offset_pause != 0 && + raidz_expand_max_offset_pause <= progress) + delay(hz); +} + +static void +raidz_scratch_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +static void +raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) +{ + vdev_raidz_expand_t *vre = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zio_t *pio; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + int ashift = raidvd->vdev_ashift; + uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift); + uint64_t logical_size = write_size * raidvd->vdev_children; + uint64_t read_size = + P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), + 1 << ashift); + + /* + * The scratch space must be large enough to get us to the point + * that one row does not overlap itself when moved. This is checked + * by vdev_raidz_attach_check(). 
+ */ + VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); + VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); + VERIFY3U(write_size, <=, read_size); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + 0, logical_size, RL_WRITER); + + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(read_size, B_FALSE); + } + + raidz_expand_pause(1); + + /* + * Read from original location. + */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children - 1; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], read_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, raidz_scratch_child_done, pio)); + } + zio_wait(pio); + + /* + * Reflow in memory. + */ + raidz_expand_pause(2); + uint64_t logical_sectors = logical_size >> ashift; + for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { + int oldchild = i % (raidvd->vdev_children - 1); + uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; + + int newchild = i % raidvd->vdev_children; + uint64_t newoff = (i / raidvd->vdev_children) << ashift; + + /* a single sector should not be copying over itself */ + ASSERT(!(newchild == oldchild && newoff == oldoff)); + + abd_copy_off(abds[newchild], abds[oldchild], + newoff, oldoff, 1 << ashift); + } + + /* + * Verify that we filled in everything we intended to (write_size on + * each child). + */ + VERIFY0(logical_sectors % raidvd->vdev_children); + VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, + write_size); + + /* + * Write to scratch location (boot area). + */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. 
+ */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", + (long long)logical_size); + + raidz_expand_pause(3); + + /* + * Update uberblock to indicate that scratch space is valid. This is + * needed because after this point, the real location may be + * overwritten. If we crash, we need to get the data from the + * scratch space, rather than the real location. + * + * Note: ub_timestamp is bumped so that vdev_uberblock_compare() + * will prefer this uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_VALID, logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); + + /* + * Overwrite with reflow'ed data. 
+ */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", + (long long)logical_size); + + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); + + /* + * Update uberblock to indicate that the initial part has been + * reflow'ed. This is needed because after this point (when we exit + * the rangelock), we allow regular writes to this region, which will + * be written to the new location only (because reflow_offset_next == + * reflow_offset_synced). If we crashed and re-copied from the + * scratch space, we would lose the regular writes. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_NOT_IN_USE, logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(6); + + /* + * Update progress. 
+ */ + vre->vre_offset = logical_size; + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + raidz_reflow_sync(spa, tx); + + raidz_expand_pause(7); +} + +/* + * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work + * here. No other i/o can be in progress, so we don't need the + * vre_rangelock. + */ +void +vdev_raidz_reflow_copy_scratch(spa_t *spa) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + ASSERT0(logical_size % raidvd->vdev_children); + uint64_t write_size = logical_size / raidvd->vdev_children; + + zio_t *pio; + + /* + * Read from scratch space. + */ + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(write_size, B_FALSE); + } + + raidz_expand_pause(8); + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + raidz_expand_pause(9); + + /* + * Overwrite real location with reflow'ed data. 
+ */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " + "to real location", (long long)logical_size); + + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + raidz_expand_pause(10); + + /* + * Update uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_NOT_IN_USE, logical_size); + spa->spa_ubsync.ub_timestamp++; + VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow recovery: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + spa_first_txg(spa)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset = logical_size; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + raidz_reflow_sync(spa, tx); + + dmu_tx_commit(tx); + + spa_config_exit(spa, SCL_STATE, FTAG); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_NOT_IN_USE); +} + +static boolean_t +spa_raidz_expand_cb_check(void *arg, zthr_t *zthr) +{ + (void) zthr; + spa_t *spa = arg; + + return (spa->spa_raidz_expand != NULL && + !spa->spa_raidz_expand->vre_waiting_for_resilver); +} + +static void +spa_raidz_expand_cb(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + 
ASSERT(vre->vre_offset == UINT64_MAX || + vre->vre_offset == RRSS_GET_OFFSET(&spa->spa_ubsync)); + vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); + + if (vre->vre_offset == 0) { + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, raidz_reflow_scratch_sync, + vre, 0, ZFS_SPACE_CHECK_NONE)); + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + uint64_t guid = raidvd->vdev_guid; + + for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; + i < raidvd->vdev_ms_count && + !zthr_iscancelled(zthr) && + vre->vre_failed_offset == UINT64_MAX; i++) { + metaslab_t *msp = raidvd->vdev_ms[i]; + + metaslab_disable(msp); + mutex_enter(&msp->ms_lock); + + /* + * The metaslab may be newly created (for the expanded + * space), in which case its trees won't exist yet, + * so we need to bail out early. + */ + if (msp->ms_new) { + mutex_exit(&msp->ms_lock); + metaslab_enable(msp, B_FALSE, B_FALSE); + continue; + } + + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) + * space. Note that there may be a little bit more free + * space (e.g. in ms_defer), and it's fine to copy that too. + */ + range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, + NULL, 0, 0); + range_tree_add(rt, msp->ms_start, msp->ms_size); + range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /* + * Force the last sector of each metaslab to be copied. This + * ensures that we advance the on-disk progress to the end of + * this metaslab while the metaslab is disabled. Otherwise, we + * could move past this metaslab without advancing the on-disk + * progress, and then an allocation to this metaslab would not + * be copied. 
+ */ + int sectorsz = 1 << raidvd->vdev_ashift; + uint64_t ms_last_offset = msp->ms_start + + msp->ms_size - sectorsz; + if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { + range_tree_add(rt, ms_last_offset, sectorsz); + } + + /* + * When we are resuming from a paused expansion (i.e. + * when importing a pool with an expansion in progress), + * discard any state that we have already processed. + */ + range_tree_clear(rt, 0, vre->vre_offset); + + while (!zthr_iscancelled(zthr) && + !range_tree_is_empty(rt) && + vre->vre_failed_offset == UINT64_MAX) { + + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * This delay will pause the expansion around the point + * specified by raidz_expand_max_offset_pause. We do this + * solely from the test suite or during debugging. + */ + while (raidz_expand_max_offset_pause != 0 && + raidz_expand_max_offset_pause <= vre->vre_offset && + !zthr_iscancelled(zthr)) + delay(hz); + + mutex_enter(&vre->vre_lock); + while (vre->vre_outstanding_bytes > + raidz_expand_max_copy_bytes) { + cv_wait(&vre->vre_cv, &vre->vre_lock); + } + mutex_exit(&vre->vre_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. Theoretically, the + * vdev_t that we're expanding may have changed. 
+ */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + boolean_t needsync = + raidz_reflow_impl(raidvd, vre, rt, tx); + + dmu_tx_commit(tx); + + if (needsync) { + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(spa->spa_dsl_pool, txg); + spa_config_enter(spa, SCL_CONFIG, FTAG, + RW_READER); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + metaslab_enable(msp, B_FALSE, B_FALSE); + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * The txg_wait_synced() here ensures that all reflow zio's have + * completed, and vre_failed_offset has been set if necessary. It + * also ensures that the progress of the last raidz_reflow_sync() is + * written to disk before raidz_reflow_complete_sync() changes the + * in-memory vre_state. vdev_raidz_io_start() uses vre_state to + * determine if a reflow is in progress, in which case we may need to + * write to both old and new locations. Therefore we can only change + * vre_state once this is not necessary, which is once the on-disk + * progress (in spa_ubsync) has been set past any possible writes (to + * the end of the last metaslab). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + if (!zthr_iscancelled(zthr) && + vre->vre_failed_offset == UINT64_MAX) { + /* + * We are not being canceled, so the reflow must be + * complete. In that case also mark it as completed on disk. + */ + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + raidz_reflow_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + } else { + /* + * Wait for all copy zio's to complete and for all the + * raidz_reflow_sync() synctasks to be run. 
+ */ + spa_history_log_internal(spa, "reflow pause", + NULL, "offset=%llu failed_offset=%lld", + (long long)vre->vre_offset, + (long long)vre->vre_failed_offset); + mutex_enter(&vre->vre_lock); + if (vre->vre_failed_offset != UINT64_MAX) { + /* + * Reset progress so that we will retry everything + * after the point that something failed. + */ + vre->vre_offset = vre->vre_failed_offset; + vre->vre_failed_offset = UINT64_MAX; + vre->vre_waiting_for_resilver = B_TRUE; + } + mutex_exit(&vre->vre_lock); + } +} + +void +spa_start_raidz_expansion_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); + spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", + spa_raidz_expand_cb_check, spa_raidz_expand_cb, spa, defclsyspri); +} + +void +raidz_dtl_reassessed(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + if (spa->spa_raidz_expand != NULL) { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + if (vd->vdev_top->vdev_id == vre->vre_vdev_id) { + mutex_enter(&vre->vre_lock); + if (vre->vre_waiting_for_resilver) { + vdev_dbgmsg(vd, "DTL reassessed, " + "continuing raidz expansion"); + vre->vre_waiting_for_resilver = B_FALSE; + zthr_wakeup(spa->spa_raidz_expand_zthr); + } + mutex_exit(&vre->vre_lock); + } + } +} + +int +vdev_raidz_attach_check(vdev_t *new_child) +{ + vdev_t *raidvd = new_child->vdev_parent; + uint64_t new_children = raidvd->vdev_children; + + /* + * We use the "boot" space as scratch space to handle overwriting the + * initial part of the vdev. If it is too small, then this expansion + * is not allowed. This would be very unusual (e.g. ashift > 13 and + * >200 children). 
+ */ + if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { + return (EINVAL); + } + return (0); +} + +void +vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *new_child = arg; + spa_t *spa = new_child->vdev_spa; + vdev_t *raidvd = new_child->vdev_parent; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(raidvd->vdev_top, ==, raidvd); + ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); + ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, + new_child); + + spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); + + vdrz->vd_physical_width++; + + VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); + vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; + vdrz->vn_vre.vre_offset = 0; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + spa->spa_raidz_expand = &vdrz->vn_vre; + zthr_wakeup(spa->spa_raidz_expand_zthr); + + /* + * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get + * written to the config. 
+ */ + vdev_config_dirty(raidvd); + + vdrz->vn_vre.vre_start_time = gethrestime_sec(); + vdrz->vn_vre.vre_end_time = 0; + vdrz->vn_vre.vre_state = DSS_SCANNING; + vdrz->vn_vre.vre_bytes_copied = 0; + + uint64_t state = vdrz->vn_vre.vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t start_time = vdrz->vn_vre.vre_start_time; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time, tx)); + + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); + + spa_history_log_internal(spa, "raidz vdev expansion started", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)raidvd->vdev_id, + (unsigned long long)raidvd->vdev_children); +} + +int +vdev_raidz_load(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + int err; + + uint64_t state = DSS_NONE; + uint64_t start_time = 0; + uint64_t end_time = 0; + uint64_t bytes_copied = 0; + + if (vd->vdev_top_zap != 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (bytes_copied), 1, &bytes_copied); + if (err != 0 && err != 
ENOENT) + return (err); + } + + /* + * If we are in the middle of expansion, vre_state should have + * already been set by vdev_raidz_init(). + */ + EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); + vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; + vdrz->vn_vre.vre_start_time = start_time; + vdrz->vn_vre.vre_end_time = end_time; + vdrz->vn_vre.vre_bytes_copied = bytes_copied; + + return (0); +} + +int +spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (vre == NULL) { + /* no expansion in progress; find most recent completed */ + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *vdrz = vd->vdev_tsd; + + if (vdrz->vn_vre.vre_end_time != 0 && + (vre == NULL || + vdrz->vn_vre.vre_end_time > + vre->vre_end_time)) { + vre = &vdrz->vn_vre; + } + } + } + } + + if (vre == NULL) { + return (SET_ERROR(ENOENT)); + } + + pres->pres_state = vre->vre_state; + pres->pres_expanding_vdev = vre->vre_vdev_id; + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + pres->pres_to_reflow = vd->vdev_stat.vs_alloc; + + mutex_enter(&vre->vre_lock); + pres->pres_reflowed = vre->vre_bytes_copied; + for (int i = 0; i < TXG_SIZE; i++) + pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; + mutex_exit(&vre->vre_lock); + + pres->pres_start_time = vre->vre_start_time; + pres->pres_end_time = vre->vre_end_time; + pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; + + return (0); +} + /* * Initialize private RAIDZ specific fields from the nvlist. 
*/ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { - vdev_raidz_t *vdrz; - uint64_t nparity; - uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, @@ -2566,6 +4541,7 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) if (error != 0) return (SET_ERROR(EINVAL)); + uint64_t nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); @@ -2592,10 +4568,54 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) nparity = 1; } - vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); - vdrz->vd_logical_width = children; + vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vn_vre.vre_vdev_id = -1; + vdrz->vn_vre.vre_offset = UINT64_MAX; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); + zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); + mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, + sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); + + vdrz->vd_physical_width = children; vdrz->vd_nparity = nparity; + /* note, the ID does not exist when creating a pool */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &vdrz->vn_vre.vre_vdev_id); + + boolean_t reflow_in_progress = + nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + zfs_dbgmsg("reflow_in_progress=%u", (int)reflow_in_progress); + if (reflow_in_progress) { + spa->spa_raidz_expand = &vdrz->vn_vre; + vdrz->vn_vre.vre_state = DSS_SCANNING; + } + + vdrz->vd_original_width = children; + uint64_t *txgs; + unsigned int txgs_size; + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + &txgs, &txgs_size); + if (error == 0) { + for (int i = 0; i < txgs_size; i++) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = 
txgs[txgs_size - i - 1]; + re->re_logical_width = vdrz->vd_physical_width - i; + + if (reflow_in_progress) + re->re_logical_width--; + + avl_add(&vdrz->vd_expand_txgs, re); + } + + vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; + } + if (reflow_in_progress) + vdrz->vd_original_width--; + *tsd = vdrz; return (0); @@ -2604,7 +4624,20 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) static void vdev_raidz_fini(vdev_t *vd) { - kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) + vd->vdev_spa->spa_raidz_expand = NULL; + reflow_node_t *re; + void *cookie = NULL; + avl_tree_t *tree = &vdrz->vd_expand_txgs; + while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) + kmem_free(re, sizeof (*re)); + avl_destroy(&vdrz->vd_expand_txgs); + mutex_destroy(&vdrz->vd_expand_lock); + mutex_destroy(&vdrz->vn_vre.vre_lock); + cv_destroy(&vdrz->vn_vre.vre_cv); + zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); + kmem_free(vdrz, sizeof (*vdrz)); } /* @@ -2632,6 +4665,29 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) * it. 
*/ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + } + + mutex_enter(&vdrz->vd_expand_lock); + if (!avl_is_empty(&vdrz->vd_expand_txgs)) { + uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); + uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + uint64_t i = 0; + + for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); + re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { + txgs[i++] = re->re_txg; + } + + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + txgs, count); + + kmem_free(txgs, sizeof (uint64_t) * count); + } + mutex_exit(&vdrz->vd_expand_lock); } static uint64_t @@ -2671,3 +4727,10 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; + +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_offset_pause, ULONG, ZMOD_RW, + "For testing, pause RAIDZ expansion at this offset"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, + "Max amount of concurrent i/o for RAIDZ expansion"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, + "For expanded RAIDZ, aggregate reads that have more rows than this"); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 0d71b9434342..f6f14b241fd7 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -168,7 +168,8 @@ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } /* @@ -179,6 +180,7 @@ vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || + tvd->vdev_rz_expanding || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } @@ 
-221,7 +223,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; @@ -995,6 +998,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_rz_expanding); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, @@ -1152,12 +1156,13 @@ vdev_trim_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; - if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, vd->vdev_trim_rate, @@ -1482,7 +1487,8 @@ vdev_autotrim(spa_t *spa) mutex_enter(&tvd->vdev_autotrim_lock); if (vdev_writeable(tvd) && !tvd->vdev_removing && - tvd->vdev_autotrim_thread == NULL) { + tvd->vdev_autotrim_thread == NULL && + !tvd->vdev_rz_expanding) { ASSERT3P(tvd->vdev_top, ==, tvd); tvd->vdev_autotrim_thread = thread_create(NULL, 0, @@ -1707,6 +1713,7 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); ta.trim_vdev = vd; ta.trim_tree = 
range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 342f56d50d04..1b5e100a4aba 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -769,8 +769,11 @@ tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', tags = ['functional', 'redacted_send'] [tests/functional/raidz] -tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos'] +tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos', + 'raidz_expand_001_pos', 'raidz_expand_002_pos', 'raidz_expand_003_neg', + 'raidz_expand_003_pos', 'raidz_expand_004_pos', 'raidz_expand_005_pos'] tags = ['functional', 'raidz'] +timeout = 1200 [tests/functional/redundancy] tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 80e7bcb3bd09..24c21eea796b 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -34,6 +34,7 @@ DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync +EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export @@ -62,6 +63,7 @@ MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_inter MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable +RAIDZ_EXPAND_MAX_OFFSET_PAUSE vdev.expand_max_offset_pause 
raidz_expand_max_offset_pause REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index ff65dc1ac2b0..ec4a8a41ed6e 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1648,6 +1648,12 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/raidz/raidz_002_pos.ksh \ functional/raidz/raidz_003_pos.ksh \ functional/raidz/raidz_004_pos.ksh \ + functional/raidz/raidz_expand_001_pos.ksh \ + functional/raidz/raidz_expand_002_pos.ksh \ + functional/raidz/raidz_expand_003_neg.ksh \ + functional/raidz/raidz_expand_003_pos.ksh \ + functional/raidz/raidz_expand_004_pos.ksh \ + functional/raidz/raidz_expand_005_pos.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 160a0ca2e6db..ef32f736310c 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -105,5 +105,6 @@ if is_linux || is_freebsd; then "feature@blake3" "feature@block_cloning" "feature@vdev_zaps_v2" + "feature@raidz_expansion" ) fi diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh index ce44906d5a4f..39e474793d50 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh @@ -36,6 +36,6 @@ # runtime might be longer. # -log_must raidz_test -S -e -t 60 +log_must raidz_test -S -e -t 300 log_pass "raidz_test parameter sweep test with expanded map succeeded." 
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh index 0e3affd5143c..bd0cc9cf4145 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh @@ -36,6 +36,6 @@ # runtime might be longer. # -log_must raidz_test -S -e -r 0 -t 60 +log_must raidz_test -S -e -r 0 -t 300 log_pass "raidz_test parameter sweep test with expanded map succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh new file mode 100755 index 000000000000..c96202abae9f --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -0,0 +1,220 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should attach new device to the pool. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2.
For each parity value [1..3] +# - create raidz pool +# - fill it with some directories/files +# - attach device to the raidz pool +# - verify that device attached and the raidz pool size increase +# - verify resilver by replacing parity devices +# - verify resilver by replacing data devices +# - verify scrub by zeroing parity devices +# - verify scrub by zeroing data devices +# - verify the raidz pool +# - destroy the raidz pool + +typeset -r devs=6 +typeset -r dev_size_mb=128 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) +max_offset=$(get_tunable RAIDZ_EXPAND_MAX_OFFSET_PAUSE) + +function cleanup +{ + log_must zpool status $TESTPOOL + + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $max_offset +} + +function wait_expand_paused +{ + oldcopied='0' + newcopied='1' + while [[ $oldcopied != $newcopied ]]; do + oldcopied=$newcopied + sleep 2 + newcopied=$(zpool status $TESTPOOL | \ + grep 'copied out of' | \ + awk '{print $1}') + log_note "newcopied=$newcopied" + done + log_note "paused at $newcopied" +} + +function test_resilver # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool replace -f $pool $dir/dev-$i + done + + while !
is_pool_resilvered $pool; do + sleep 1 + done + + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool replace -f $pool $dir/dev-$i + done + + while ! is_pool_resilvered $pool; do + sleep 1 + done + + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + +function test_scrub # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + typeset combrec=$4 + + randbyte=$(( ((RANDOM<<15) + RANDOM) % \ + (dev_size_mb * (devs-1) * 1024 * 1024) )) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $randbyte + log_must zpool attach $TESTPOOL ${raid}-0 $dir/dev-$devs + wait_expand_paused + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + log_must zpool clear $pool + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $max_offset + log_must zpool wait -t raidz_expand $TESTPOOL +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs - 1))}; do + device=$TEST_BASE_DIR/dev-$i + 
log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +# Disk file which will be attached +log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs + +for nparity in 1 2 3; do + raid=raidz$nparity + dir=$TEST_BASE_DIR + + log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]} + log_must zfs set primarycache=metadata $TESTPOOL + + log_must zfs create $TESTPOOL/fs + log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + + log_must zfs create -o compress=on $TESTPOOL/fs2 + log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 + log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + test_scrub $TESTPOOL $nparity $dir + # XXX - why is test_resilver commented out? + #test_resilver $TESTPOOL $nparity $dir + + zpool destroy "$TESTPOOL" +done + +log_pass "raidz expansion test succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh new file mode 100755 index 000000000000..23e050a9b15e --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -0,0 +1,116 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should attach new device to the pool. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - for each free test block device +# - attach to the pool +# - verify the raidz pool +# - destroy the raidz pool + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + dir=$TEST_BASE_DIR + pool=$TESTPOOL + opts="-o cachefile=none" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + log_must zfs set primarycache=metadata $pool + + log_must zfs create $pool/fs + log_must fill_fs /$pool/fs 1 512 100 1024 R + + log_must zfs create -o compress=on $pool/fs2 + log_must fill_fs /$pool/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $pool/fs3 + log_must fill_fs /$pool/fs3 1 512 100 1024 R + + typeset pool_size=$(get_pool_prop size $pool) + + for disk in ${disks[$(($nparity+2))..$devs]}; do + log_must dd if=/dev/urandom
of=/${pool}/FILE-$RANDOM bs=1M \ + count=64 + + log_must zpool attach -w $pool ${raid}-0 $disk + + # Wait some time for pool size increase + sleep 5 + + # Confirm that disk was attached to the pool + log_must zpool get -H path $TESTPOOL $disk + + typeset expand_size=$(get_pool_prop size $pool) + if [[ "$expand_size" -le "$pool_size" ]]; then + log_fail "pool $pool not expanded" + fi + + verify_pool $pool + + pool_size=$expand_size + done + + zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh new file mode 100755 index 000000000000..5e5ff6717083 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should reject device attach if pool +# is in checkpointed state. If checkpoint creation requested on +# expanding pool, the request should be rejected. + +# +# STRATEGY: +# 1. 
Create block device files for the test raidz pool. +# 2. Create pool and checkpoint it. +# 3. Try to expand raidz, ensure that request rejected. +# 4. Recreate the pool. +# 5. Apply raidz expansion. +# 6. Ensure, that checkpoint cannot be created. + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +nparity=1 +raid=raidz$nparity +pool=$TESTPOOL +opts="-o cachefile=none" + +# case 1: checkpoint exist, try to expand +log_must zpool create -f $opts $pool $raid ${disks[1..$(($devs-1))]} +log_must zfs set primarycache=metadata $pool +log_must zpool checkpoint $pool +log_mustnot zpool attach $pool ${raid}-0 ${disks[$devs]} +log_must zpool destroy $pool + +# case 2: expansion in progress, try to checkpoint +log_must zpool create -f $opts $pool $raid ${disks[1..$(($devs-1))]} +log_must zfs set primarycache=metadata $pool +log_must zfs create $pool/fs +log_must fill_fs /$pool/fs 1 512 100 1024 R +log_must zpool attach $pool ${raid}-0 ${disks[$devs]} +log_mustnot zpool checkpoint $pool +log_must zpool destroy $pool + +log_pass "raidz expansion test succeeded." 
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh new file mode 100755 index 000000000000..b06ccfeb14ab --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh @@ -0,0 +1,135 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib + +# +# DESCRIPTION: +# Check raidz expansion is able to work correctly under i/o load. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. 
For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - create couple of datasets with different recordsize and fill it +# - set raidz expand offset pause +# - start randwritecomp on one of the datasets files +# - attach new device to the pool +# - wait reflow offset become equal to raidz expand pause offset +# - kill randwritecomp +# - verify pool +# - set raidz expand offset to max value to complete raidz expansion + +typeset -r devs=10 +typeset -r dev_size_mb=128 + +typeset -a disks + +embedded_slog_min_ms=$(get_tunable EMBEDDED_SLOG_MIN_MS) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 EMBEDDED_SLOG_MIN_MS $embedded_slog_min_ms +} + +function wait_expand_paused +{ + oldcopied='0' + newcopied='1' + while [[ $oldcopied != $newcopied ]]; do + oldcopied=$newcopied + sleep 1 + newcopied=$(zpool status $TESTPOOL | \ + grep 'copied out of' | \ + awk '{print $1}') + done +} + +log_onexit cleanup + +log_must set_tunable32 EMBEDDED_SLOG_MIN_MS 99999 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + pool=$TESTPOOL + opts="-o cachefile=none" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + + log_must zfs create -o recordsize=8k $pool/fs + log_must fill_fs /$pool/fs 1 128 100 1024 R + + log_must zfs create -o recordsize=128k $pool/fs2 + log_must fill_fs /$pool/fs2 1 128 100 1024 R + + for disk in ${disks[$(($nparity+2))..$devs]}; do + pool_size=$(get_pool_prop size $pool) + pause=$((((RANDOM << 15) + RANDOM) % pool_size)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + log_bkgrnd randwritecomp /$pool/fs/file + pid0=$! 
+ + log_bkgrnd randwritecomp /$pool/fs2/file + pid1=$! + + log_must zpool attach $pool ${raid}-0 $disk + wait_expand_paused + + kill_if_running $pid0 + kill_if_running $pid1 + + log_must zpool scrub -w $pool + + log_must check_pool_status $pool "errors" "No known data errors" + log_must check_pool_status $pool "scan" "with 0 errors" + log_must check_pool_status $pool "scan" "repaired 0B" + + pause=$((devs*dev_size_mb*1024*1024)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + log_must zpool wait -t raidz_expand $pool + done + + log_must zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." + diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh new file mode 100755 index 000000000000..585a293ae90b --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh @@ -0,0 +1,119 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Check device replacement during raidz expansion. +# +# STRATEGY: +# 1. 
Create block device files for the test raidz pool +# 2. For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - create couple of datasets with different recordsize and fill it +# - attach new device to the pool +# - offline and zero vdevs allowed by parity +# - wait some time and start offlined vdevs replacement +# - wait replacement completion and verify pool status + +typeset -r devs=10 +typeset -r dev_size_mb=128 + +typeset -a disks + +embedded_slog_min_ms=$(get_tunable EMBEDDED_SLOG_MIN_MS) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 EMBEDDED_SLOG_MIN_MS $embedded_slog_min_ms +} + +log_onexit cleanup + +log_must set_tunable32 EMBEDDED_SLOG_MIN_MS 99999 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + pool=$TESTPOOL + opts="-o cachefile=none" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + + log_must zfs create -o recordsize=8k $pool/fs + log_must fill_fs /$pool/fs 1 128 100 1024 R + + log_must zfs create -o recordsize=128k $pool/fs2 + log_must fill_fs /$pool/fs2 1 128 100 1024 R + + for disk in ${disks[$(($nparity+2))..$devs]}; do + log_must zpool attach $pool ${raid}-0 $disk + + sleep 10 + + for (( i=1; i<=$nparity; i=i+1 )); do + log_must zpool offline $pool ${disks[$i]} + log_must dd if=/dev/zero of=${disks[$i]} \ + bs=1024k count=$dev_size_mb conv=notrunc + done + + sleep 3 + + for (( i=1; i<=$nparity; i=i+1 )); do + log_must zpool replace $pool ${disks[$i]} + done + + log_must zpool wait -t replace $pool + log_must check_pool_status $pool "scan" "with 0 errors" + + log_must zpool wait -t raidz_expand $pool + + log_must zpool clear $pool + log_must zpool
scrub -w $pool + + # XXX step sometimes FAILED + log_must zpool status -v + # log_must check_pool_status $pool "scan" "repaired 0B" + done + + log_must zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." + diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh new file mode 100755 index 000000000000..954ee8a82d79 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh @@ -0,0 +1,170 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Check device replacement during raidz expansion using expansion pausing. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. 
For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - create couple of datasets with different recordsize and fill it +# - set raidz expand offset pause +# - attach new device to the pool +# - wait reflow offset become equal to raidz expand pause offset +# - offline and zero vdevs allowed by parity +# - wait some time and start offlined vdevs replacement +# - wait replacement completion and verify pool status +# - loop thru vdevs replacing and raidz expand pause offset increasing +# - verify pool +# - set raidz expand offset to max value to complete raidz expansion + +typeset -r devs=10 +typeset -r dev_size_mb=128 + +typeset -a disks + +embedded_slog_min_ms=$(get_tunable EMBEDDED_SLOG_MIN_MS) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 EMBEDDED_SLOG_MIN_MS $embedded_slog_min_ms +} + +function wait_expand_paused +{ + oldcopied='0' + newcopied='1' + while [[ $oldcopied != $newcopied ]]; do + oldcopied=$newcopied + sleep 1 + newcopied=$(zpool status $TESTPOOL | \ + grep 'copied out of' | \ + awk '{print $1}') + done +} + +log_onexit cleanup + +function test_replace # +{ + pool=${1} + devices=${2} + nparity=${3} + device_count=0 + + log_must echo "devices=$devices" + + for dev in ${devices}; do + device_count=$((device_count+1)) + done + + index=$((RANDOM%(device_count-nparity))) + for (( j=1; j<=$nparity; j=j+1 )); do + log_must zpool offline $pool ${disks[$((index+j))]} + log_must dd if=/dev/zero of=${disks[$((index+j))]} \ + bs=1024k count=$dev_size_mb conv=notrunc + done + + for (( j=1; j<=$nparity; j=j+1 )); do + log_must zpool replace $pool ${disks[$((index+j))]} + done + + log_must zpool wait -t replace $pool + log_must check_pool_status $pool "scan" "with 0 errors" + + log_must zpool clear $pool + log_must zpool scrub -w $pool + + # XXX step sometimes FAILED + log_must zpool
status -v + # log_must check_pool_status $pool "scan" "repaired 0B" +} + +log_must set_tunable32 EMBEDDED_SLOG_MIN_MS 99999 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + pool=$TESTPOOL + opts="-o cachefile=none" + devices="" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + devices="${disks[1..$(($nparity+1))]}" + + log_must zfs create -o recordsize=8k $pool/fs + log_must fill_fs /$pool/fs 1 128 100 1024 R + + log_must zfs create -o recordsize=128k $pool/fs2 + log_must fill_fs /$pool/fs2 1 128 100 1024 R + + for disk in ${disks[$(($nparity+2))..$devs]}; do + pool_size=$(get_pool_prop size $pool) + pause=$((((RANDOM << 15) + RANDOM) % pool_size / 2)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + log_must zpool attach $pool ${raid}-0 $disk + devices="$devices $disk" + + wait_expand_paused + + for (( i=0; i<2; i++ )); do + test_replace $pool "$devices" $nparity + + pause=$((pause + (((RANDOM << 15) + RANDOM) % \ + pool_size) / 4)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + wait_expand_paused + done + + pause=$((devs*dev_size_mb*1024*1024)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + log_must zpool wait -t raidz_expand $pool + done + + log_must zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." +