diff --git a/cmd/ztest.c b/cmd/ztest.c index e5aba5507404..d655a9f28eea 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -275,7 +275,8 @@ extern uint_t dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; -extern uint64_t raidz_expand_max_offset_pause; +extern unsigned long raidz_expand_max_reflow_bytes; +extern uint_t raidz_expand_pause_point; static ztest_shared_opts_t *ztest_shared_opts; @@ -1226,21 +1227,21 @@ ztest_kill(ztest_shared_t *zs) * Before we kill ourselves, make sure that the config is updated. * See comment above spa_write_cachefile(). */ - if (raidz_expand_max_offset_pause) { + if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { if (mutex_tryenter(&spa_namespace_lock)) { spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); mutex_exit(&spa_namespace_lock); ztest_scratch_state->zs_raidz_scratch_verify_pause = - raidz_expand_max_offset_pause; + raidz_expand_pause_point; } else { /* * Do not verify scratch object in case if * spa_namespace_lock cannot be acquired, * it can cause deadlock in spa_config_update(). 
*/ - raidz_expand_max_offset_pause = 0; + raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; return; } @@ -3949,7 +3950,7 @@ raidz_scratch_verify(void) vdev_raidz_expand_t *vre; vdev_t *raidvd; - ASSERT(raidz_expand_max_offset_pause == 0); + ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) return; @@ -4031,7 +4032,7 @@ ztest_scratch_thread(void *arg) /* wait up to 10 seconds */ for (int t = 100; t > 0; t -= 1) { - if (raidz_expand_max_offset_pause == 0) + if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) thread_exit(); (void) poll(NULL, 0, 100); @@ -4113,14 +4114,14 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) 0, 0, 1); /* - * 50% of the time, set raidz_expand_max_offset_pause to cause + * 50% of the time, set raidz_expand_pause_point to cause * raidz_reflow_scratch_sync() and vdev_raidz_reflow_copy_scratch() * to pause at a certain point and then kill the test after 10 * seconds so raidz_scratch_verify() can confirm consistency when * the pool is imported. */ if (ztest_random(2) == 0 && expected_error == 0) { - raidz_expand_max_offset_pause = + raidz_expand_pause_point = ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_NOT_IN_USE) + 1; scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); @@ -4140,13 +4141,13 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) newpath, newsize, error, expected_error); } - if (raidz_expand_max_offset_pause) { + if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { if (error != 0) { /* * Do not verify scratch object in case of error * returned by vdev attaching. */ - raidz_expand_max_offset_pause = 0; + raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; } VERIFY0(thread_join(scratch_thread)); @@ -7635,8 +7636,10 @@ ztest_thread(void *arg) /* * See if it's time to force a crash. 
*/ - if (now > zs->zs_thread_kill && !raidz_expand_max_offset_pause) + if (now > zs->zs_thread_kill && + raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { ztest_kill(zs); + } /* * If we're getting ENOSPC with some regularity, stop. @@ -8110,7 +8113,7 @@ ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) /* Set our reflow target to 25%, 50% or 75% of allocated size */ uint_t multiple = ztest_random(3) + 1; uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; - raidz_expand_max_offset_pause = reflow_max; + raidz_expand_max_reflow_bytes = reflow_max; if (ztest_opts.zo_verbose >= 1) { (void) printf("running raidz expansion test, killing when " @@ -8194,7 +8197,7 @@ ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) } /* Reset the reflow pause before killing */ - raidz_expand_max_offset_pause = 0; + raidz_expand_max_reflow_bytes = 0; if (ztest_opts.zo_verbose >= 1) { (void) printf("killing raidz expansion test after reflow " diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 6a7961428e63..69e2ca19ec1f 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -159,10 +159,18 @@ extern void spa_start_raidz_expansion_thread(spa_t *); extern int spa_raidz_expand_get_stats(spa_t *, pool_raidz_expand_stat_t *); extern int vdev_raidz_load(vdev_t *); -/* Some of raidz scratch area states */ +/* RAIDZ scratch area pause points (for testing) */ #define RAIDZ_EXPAND_PAUSE_NONE 0 +#define RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1 1 +#define RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2 2 +#define RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3 3 #define RAIDZ_EXPAND_PAUSE_SCRATCH_VALID 4 #define RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED 5 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1 6 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2 7 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_CRASH_COPY_1 8 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_CRASH_COPY_2 9 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_CRASH_COPY_3 10 #define RAIDZ_EXPAND_PAUSE_SCRATCH_NOT_IN_USE 11 #ifdef __cplusplus diff 
--git a/man/man4/zfs.4 b/man/man4/zfs.4 index 56cab2406f70..db700d1e74ec 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -428,8 +428,8 @@ active. Max amount of memory to use for RAID-Z expansion I/O. This limits how much I/O can be outstanding at once. . -.It Sy raidz_expand_max_offset_pause Ns = Ns Sy 0 Pq ulong -For testing, pause RAID-Z expansion at this offset. +.It Sy raidz_expand_max_reflow_bytes Ns = Ns Sy 0 Pq ulong +For testing, pause RAID-Z expansion when reflow amount reaches this value. . .It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong For expanded RAID-Z, aggregate reads that have more rows than this. diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 77cbcd8a8e93..8612c9951003 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -142,13 +142,18 @@ } /* - * For testing only: logical offset at which to pause the raidz expansion. + * For testing only: pause the raidz expansion after reflowing this amount. * (accessed by ZTS and ztest) */ #ifdef _KERNEL static #endif /* _KERNEL */ -unsigned long raidz_expand_max_offset_pause = 0; +unsigned long raidz_expand_max_reflow_bytes = 0; + +/* + * For testing only: pause the raidz expansion at a certain point. + */ +uint_t raidz_expand_pause_point = 0; /* * Maximum amount of copy io's outstanding at once. @@ -3771,13 +3776,13 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, } /* - * For testing. 
+ * For testing (ztest): wait while a pause point <= pause_point is requested. */ static void -raidz_expand_pause(uint64_t progress) +raidz_expand_pause(uint_t pause_point) { - while (raidz_expand_max_offset_pause != 0 && - raidz_expand_max_offset_pause <= progress) + while (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE && + raidz_expand_pause_point <= pause_point) delay(hz); } @@ -3834,7 +3839,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) abds[i] = abd_alloc_linear(read_size, B_FALSE); } - raidz_expand_pause(1); + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); /* * If we have already written the scratch area then we must read from @@ -3893,10 +3898,11 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) return; } + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); + /* * Reflow in memory. */ - raidz_expand_pause(2); uint64_t logical_sectors = logical_size >> ashift; for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { int oldchild = i % (raidvd->vdev_children - 1); @@ -3946,7 +3952,8 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", (long long)logical_size); - raidz_expand_pause(3); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); /* * Update uberblock to indicate that scratch space is valid. This is @@ -4032,7 +4039,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) (long long)logical_size, (long long)spa->spa_ubsync.ub_timestamp); - raidz_expand_pause(6); + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); /* * Update progress. 
@@ -4050,7 +4057,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) */ raidz_reflow_sync(spa, tx); - raidz_expand_pause(7); + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); } /* @@ -4080,7 +4087,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) abds[i] = abd_alloc_linear(write_size, B_FALSE); } - raidz_expand_pause(8); + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_CRASH_COPY_1); pio = zio_root(spa, NULL, NULL, 0); for (int i = 0; i < raidvd->vdev_children; i++) { /* @@ -4095,7 +4102,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) raidz_scratch_child_done, pio)); } zio_wait(pio); - raidz_expand_pause(9); + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_CRASH_COPY_2); /* * Overwrite real location with reflow'ed data. @@ -4118,7 +4125,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) for (int i = 0; i < raidvd->vdev_children; i++) abd_free(abds[i]); kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); - raidz_expand_pause(10); + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_CRASH_COPY_3); /* * Update uberblock. @@ -4272,14 +4279,12 @@ spa_raidz_expand_cb(void *arg, zthr_t *zthr) /* * If requested, pause the reflow when the amount - * specified by raidz_expand_max_offset_pause is reached + * specified by raidz_expand_max_reflow_bytes is reached * * This pause is only used during testing or debugging. 
- * - * XXX Rename once we confirm that we want bytes */ - while (raidz_expand_max_offset_pause != 0 && - raidz_expand_max_offset_pause <= + while (raidz_expand_max_reflow_bytes != 0 && + raidz_expand_max_reflow_bytes <= vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { delay(hz); } @@ -4775,8 +4780,8 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; -ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_offset_pause, ULONG, ZMOD_RW, - "For testing, pause RAIDZ expansion at this offset"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, + "For testing, pause RAIDZ expansion after reflowing this many bytes"); ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, "Max amount of concurrent i/o for RAIDZ expansion"); ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 24c21eea796b..ef603b2918a2 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -63,7 +63,7 @@ MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_inter MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable -RAIDZ_EXPAND_MAX_OFFSET_PAUSE vdev.expand_max_offset_pause raidz_expand_max_offset_pause +RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh index c4e3b5af876c..c7af7809d4ae 100755 --- 
a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -62,7 +62,7 @@ function cleanup done log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE 0 + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES 0 } function wait_expand_paused @@ -144,7 +144,7 @@ function test_scrub # randbyte=$(( ((RANDOM<<15) + RANDOM) % \ (dev_size_mb * (devs-1) * 1024 * 1024) )) - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $randbyte + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $randbyte log_must zpool attach $TESTPOOL ${raid}-0 $dir/dev-$devs wait_expand_paused @@ -175,7 +175,7 @@ function test_scrub # log_must check_pool_status $pool "errors" "No known data errors" log_must zpool clear $pool - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE 0 + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES 0 log_must zpool wait -t raidz_expand $TESTPOOL } diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh index 7df42a27510c..8d3fe20493bc 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh @@ -36,13 +36,13 @@ # 2. 
For each parity value [1..3] # - create raidz pool with minimum block device files required # - create couple of datasets with different recordsize and fill it -# - set raidz expand offset pause +# - set raidz expand maximum reflow bytes # - start randwritecomp on one of the datasets files # - attach new device to the pool -# - wait until reflow offset is equal to raidz expand pause offset +# - wait for reflow bytes to reach the maximum # - kill randwritecomp # - verify pool -# - set raidz expand offset to max value to complete raidz expansion +# - set reflow bytes to max value to complete the expansion typeset -r devs=10 typeset -r dev_size_mb=128 @@ -60,7 +60,7 @@ function cleanup done log_must set_tunable32 EMBEDDED_SLOG_MIN_MS $embedded_slog_min_ms - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE 0 + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES 0 } function wait_expand_paused @@ -105,7 +105,7 @@ for nparity in 1 2 3; do pool_size=$(get_pool_prop size $pool) # Pause at random location near the end of vdev pause=$((((RANDOM << 15) + RANDOM) % pool_size)) - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause log_bkgrnd randwritecomp /$pool/fs/file pid0=$! 
@@ -125,9 +125,9 @@ for nparity in 1 2 3; do log_must check_pool_status $pool "scan" "with 0 errors" log_must check_pool_status $pool "scan" "repaired 0B" - # Set pause past largest possible offset for this pool + # Set pause past largest possible value for this pool pause=$((devs*dev_size_mb*1024*1024)) - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause log_must zpool wait -t raidz_expand $pool done diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh index ced7c44cdf70..213f88b22ba8 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh @@ -35,15 +35,15 @@ # 2. For each parity value [1..3] # - create raidz pool with minimum block device files required # - create couple of datasets with different recordsize and fill it -# - set raidz expand offset pause +# - set raidz expand maximum reflow bytes # - attach new device to the pool -# - wait reflow offset become equal to raidz expand pause offset +# - wait for reflow bytes to reach the maximum # - offline and zero vdevs allowed by parity # - wait some time and start offlined vdevs replacement # - wait replacement completion and verify pool status -# - loop thru vdevs replacing and raidz expand pause offset increasing +# - loop thru vdevs replacing with the max reflow bytes increasing # - verify pool -# - set raidz expand offset to max value to complete raidz expansion +# - set reflow bytes to max value to complete the expansion typeset -r devs=10 typeset -r dev_size_mb=128 @@ -61,7 +61,7 @@ function cleanup done log_must set_tunable32 PREFETCH_DISABLE $embedded_slog_min_ms - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE 0 + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES 0 } function wait_expand_paused @@ -139,10 +139,10 @@ for nparity in 1 2 3; do 
log_must fill_fs /$pool/fs2 1 128 100 1024 R for disk in ${disks[$(($nparity+2))..$devs]}; do - # Set pause to some random offset near halfway point + # Set pause to some random value near halfway point pool_size=$(get_pool_prop size $pool) pause=$((((RANDOM << 15) + RANDOM) % pool_size / 2)) - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause log_must zpool attach $pool ${raid}-0 $disk devices="$devices $disk" @@ -155,14 +155,14 @@ for nparity in 1 2 3; do # Increase pause by about 25% pause=$((pause + (((RANDOM << 15) + RANDOM) % \ pool_size) / 4)) - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause wait_expand_paused done - # Set pause past largest possible offset for this pool + # Set pause past largest possible value for this pool pause=$((devs*dev_size_mb*1024*1024)) - log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause log_must zpool wait -t raidz_expand $pool done