Skip to content

Commit

Permalink
Add trim support to zpool wait
Browse files Browse the repository at this point in the history
Manual trims fall into the category of long-running pool activities
which people might want to wait synchronously for. This change adds
support to 'zpool wait' for waiting for manual trim operations to
complete. It also adds a '-w' flag to 'zpool trim' which can be used to
turn 'zpool trim' into a synchronous operation.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #10071
  • Loading branch information
behlendorf committed Mar 4, 2020
1 parent b3212d2 commit 2288d41
Show file tree
Hide file tree
Showing 17 changed files with 347 additions and 73 deletions.
43 changes: 34 additions & 9 deletions cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ get_usage(zpool_help_t idx)
case HELP_RESILVER:
return (gettext("\tresilver <pool> ...\n"));
case HELP_TRIM:
return (gettext("\ttrim [-d] [-r <rate>] [-c | -s] <pool> "
return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] <pool> "
"[<device> ...]\n"));
case HELP_STATUS:
return (gettext("\tstatus [-c [script1,script2,...]] "
Expand Down Expand Up @@ -6979,6 +6979,7 @@ zpool_do_resilver(int argc, char **argv)
* -r <rate> Sets the TRIM rate in bytes (per second). Supports
* adding a multiplier suffix such as 'k' or 'm'.
* -s Suspend. TRIM can then be restarted with no flags.
* -w Wait. Blocks until trimming has completed.
*/
int
zpool_do_trim(int argc, char **argv)
Expand All @@ -6988,15 +6989,17 @@ zpool_do_trim(int argc, char **argv)
{"secure", no_argument, NULL, 'd'},
{"rate", required_argument, NULL, 'r'},
{"suspend", no_argument, NULL, 's'},
{"wait", no_argument, NULL, 'w'},
{0, 0, 0, 0}
};

pool_trim_func_t cmd_type = POOL_TRIM_START;
uint64_t rate = 0;
boolean_t secure = B_FALSE;
boolean_t wait = B_FALSE;

int c;
while ((c = getopt_long(argc, argv, "cdr:s", long_options, NULL))
while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL))
!= -1) {
switch (c) {
case 'c':
Expand Down Expand Up @@ -7037,6 +7040,9 @@ zpool_do_trim(int argc, char **argv)
}
cmd_type = POOL_TRIM_SUSPEND;
break;
case 'w':
wait = B_TRUE;
break;
case '?':
if (optopt != 0) {
(void) fprintf(stderr,
Expand All @@ -7059,6 +7065,12 @@ zpool_do_trim(int argc, char **argv)
return (-1);
}

if (wait && (cmd_type != POOL_TRIM_START)) {
(void) fprintf(stderr, gettext("-w cannot be used with -c or "
"-s\n"));
usage(B_FALSE);
}

char *poolname = argv[0];
zpool_handle_t *zhp = zpool_open(g_zfs, poolname);
if (zhp == NULL)
Expand All @@ -7067,6 +7079,7 @@ zpool_do_trim(int argc, char **argv)
trimflags_t trim_flags = {
.secure = secure,
.rate = rate,
.wait = wait,
};

nvlist_t *vdevs = fnvlist_alloc();
Expand Down Expand Up @@ -9466,21 +9479,30 @@ zpool_do_set(int argc, char **argv)

return (error);
}
/* Add up the total number of bytes left to initialize across all vdevs */

/* Add up the total number of bytes left to initialize/trim across all vdevs */
static uint64_t
vdev_initialize_remaining(nvlist_t *nv)
vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity)
{
uint64_t bytes_remaining;
nvlist_t **child;
uint_t c, children;
vdev_stat_t *vs;

assert(activity == ZPOOL_WAIT_INITIALIZE ||
activity == ZPOOL_WAIT_TRIM);

verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);

if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE)
if (activity == ZPOOL_WAIT_INITIALIZE &&
vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE)
bytes_remaining = vs->vs_initialize_bytes_est -
vs->vs_initialize_bytes_done;
else if (activity == ZPOOL_WAIT_TRIM &&
vs->vs_trim_state == VDEV_TRIM_ACTIVE)
bytes_remaining = vs->vs_trim_bytes_est -
vs->vs_trim_bytes_done;
else
bytes_remaining = 0;

Expand All @@ -9489,7 +9511,7 @@ vdev_initialize_remaining(nvlist_t *nv)
children = 0;

for (c = 0; c < children; c++)
bytes_remaining += vdev_initialize_remaining(child[c]);
bytes_remaining += vdev_activity_remaining(child[c], activity);

return (bytes_remaining);
}
Expand Down Expand Up @@ -9547,7 +9569,7 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
pool_scan_stat_t *pss = NULL;
pool_removal_stat_t *prs = NULL;
char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE",
"REMOVE", "RESILVER", "SCRUB"};
"REMOVE", "RESILVER", "SCRUB", "TRIM"};
int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES];

/* Calculate the width of each column */
Expand Down Expand Up @@ -9603,7 +9625,10 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
bytes_rem[ZPOOL_WAIT_RESILVER] = rem;
}

bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_initialize_remaining(nvroot);
bytes_rem[ZPOOL_WAIT_INITIALIZE] =
vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE);
bytes_rem[ZPOOL_WAIT_TRIM] =
vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM);

/*
* A replace finishes after resilvering finishes, so the amount of work
Expand Down Expand Up @@ -9731,7 +9756,7 @@ zpool_do_wait(int argc, char **argv)
{
static char *col_subopts[] = { "discard", "free",
"initialize", "replace", "remove", "resilver",
"scrub", NULL };
"scrub", "trim", NULL };

/* Reset activities array */
bzero(&wd.wd_enabled, sizeof (wd.wd_enabled));
Expand Down
3 changes: 3 additions & 0 deletions include/libzfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,9 @@ typedef struct trimflags {
/* request a secure trim, requires support from device */
boolean_t secure;

/* after starting trim, block until trim completes */
boolean_t wait;

/* trim at the requested rate in bytes/second */
uint64_t rate;
} trimflags_t;
Expand Down
1 change: 1 addition & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -1353,6 +1353,7 @@ typedef enum {
ZPOOL_WAIT_REMOVE,
ZPOOL_WAIT_RESILVER,
ZPOOL_WAIT_SCRUB,
ZPOOL_WAIT_TRIM,
ZPOOL_WAIT_NUM_ACTIVITIES
} zpool_wait_activity_t;

Expand Down
29 changes: 28 additions & 1 deletion lib/libzfs/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -2263,6 +2263,30 @@ xlate_trim_err(int err)
return (err);
}

/*
 * Wait for the trim activity to finish on every vdev listed in 'vdev_guids'.
 *
 * Each nvpair in 'vdev_guids' carries a uint64 vdev guid as its value; that
 * guid is passed to lzc_wait_tag() as the tag for ZPOOL_WAIT_TRIM. The vdevs
 * are waited on one at a time, in nvlist order.
 *
 * On the first wait that fails, the error is reported through
 * zpool_standard_error_fmt() (using the nvpair's name in the message --
 * presumably the vdev's path; confirm against the caller) and that error
 * code is returned without waiting on the remaining vdevs. Returns 0 when
 * every wait succeeds, including when the list is empty.
 */
static int
zpool_trim_wait(zpool_handle_t *zhp, nvlist_t *vdev_guids)
{
	nvpair_t *pair = NULL;

	while ((pair = nvlist_next_nvpair(vdev_guids, pair)) != NULL) {
		uint64_t vdev_guid = fnvpair_value_uint64(pair);
		int rc = lzc_wait_tag(zhp->zpool_name, ZPOOL_WAIT_TRIM,
		    vdev_guid, NULL);

		if (rc == 0)
			continue;

		/* Report which vdev the failed wait was for, then bail out. */
		(void) zpool_standard_error_fmt(zhp->zpool_hdl, rc,
		    dgettext(TEXT_DOMAIN, "error "
		    "waiting to trim '%s'"), nvpair_name(pair));
		return (rc);
	}

	return (0);
}

/*
* Begin, suspend, or cancel the TRIM (discarding of all free blocks) for
* the given vdevs in the given pool.
Expand All @@ -2286,9 +2310,12 @@ zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds,
err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate,
trim_flags->secure, vdev_guids, &errlist);
if (err == 0) {
if (trim_flags->wait)
err = zpool_trim_wait(zhp, vdev_guids);

fnvlist_free(vdev_guids);
fnvlist_free(guids_to_paths);
return (0);
return (err);
}

if (errlist != NULL) {
Expand Down
8 changes: 5 additions & 3 deletions man/man8/zpool-trim.8
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
.Dd August 9, 2019
.Dd February 25, 2020
.Dt ZPOOL-TRIM 8
.Os Linux
.Sh NAME
Expand All @@ -36,7 +36,7 @@
.Sh SYNOPSIS
.Nm
.Cm trim
.Op Fl d
.Op Fl dw
.Op Fl r Ar rate
.Op Fl c | Fl s
.Ar pool
Expand All @@ -46,7 +46,7 @@
.It Xo
.Nm
.Cm trim
.Op Fl d
.Op Fl dw
.Op Fl c | Fl s
.Ar pool
.Op Ar device Ns ...
Expand Down Expand Up @@ -84,6 +84,8 @@ trimmed, the command will fail and no suspension will occur on any device.
Trimming can then be resumed by running
.Nm zpool Cm trim
with no flags on the relevant target devices.
.It Fl w -wait
Wait until the devices are done being trimmed before returning.
.El
.El
.Sh SEE ALSO
Expand Down
6 changes: 4 additions & 2 deletions man/man8/zpool-wait.8
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
.Dd August 9, 2019
.Dd February 25, 2020
.Dt ZPOOL-WAIT 8
.Os Linux
.Sh NAME
Expand Down Expand Up @@ -73,6 +73,7 @@ along with what each one waits for:
remove Device removal to cease
resilver Resilver to cease
scrub Scrub to cease
trim Manual trim to cease
.Ed
.Pp
If an
Expand Down Expand Up @@ -109,4 +110,5 @@ See
.Xr zpool-replace 8 ,
.Xr zpool-remove 8 ,
.Xr zpool-resilver 8 ,
.Xr zpool-scrub 8
.Xr zpool-scrub 8 ,
.Xr zpool-trim 8
47 changes: 29 additions & 18 deletions module/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -9280,28 +9280,35 @@ spa_wake_waiters(spa_t *spa)
mutex_exit(&spa->spa_activities_lock);
}

/* Whether the vdev or any of its descendants is initializing. */
/* Whether the vdev or any of its descendants are being initialized/trimmed. */
static boolean_t
spa_vdev_initializing_impl(vdev_t *vd)
spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
{
spa_t *spa = vd->vdev_spa;
boolean_t initializing;

ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
activity == ZPOOL_WAIT_TRIM);

kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
&vd->vdev_initialize_lock : &vd->vdev_trim_lock;

mutex_exit(&spa->spa_activities_lock);
mutex_enter(&vd->vdev_initialize_lock);
mutex_enter(lock);
mutex_enter(&spa->spa_activities_lock);

initializing = (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE);
mutex_exit(&vd->vdev_initialize_lock);
boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
(vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
(vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
mutex_exit(lock);

if (initializing)
if (in_progress)
return (B_TRUE);

for (int i = 0; i < vd->vdev_children; i++) {
if (spa_vdev_initializing_impl(vd->vdev_child[i]))
if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
activity))
return (B_TRUE);
}

Expand All @@ -9310,12 +9317,13 @@ spa_vdev_initializing_impl(vdev_t *vd)

/*
* If use_guid is true, this checks whether the vdev specified by guid is
* being initialized. Otherwise, it checks whether any vdev in the pool is being
* initialized. The caller must hold the config lock and spa_activities_lock.
* being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
* is being initialized/trimmed. The caller must hold the config lock and
* spa_activities_lock.
*/
static int
spa_vdev_initializing(spa_t *spa, boolean_t use_guid, uint64_t guid,
boolean_t *in_progress)
spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
zpool_wait_activity_t activity, boolean_t *in_progress)
{
mutex_exit(&spa->spa_activities_lock);
spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
Expand All @@ -9332,7 +9340,7 @@ spa_vdev_initializing(spa_t *spa, boolean_t use_guid, uint64_t guid,
vd = spa->spa_root_vdev;
}

*in_progress = spa_vdev_initializing_impl(vd);
*in_progress = spa_vdev_activity_in_progress_impl(vd, activity);

spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
return (0);
Expand Down Expand Up @@ -9403,7 +9411,9 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
spa_livelist_delete_check(spa));
break;
case ZPOOL_WAIT_INITIALIZE:
error = spa_vdev_initializing(spa, use_tag, tag, in_progress);
case ZPOOL_WAIT_TRIM:
error = spa_vdev_activity_in_progress(spa, use_tag, tag,
activity, in_progress);
break;
case ZPOOL_WAIT_REPLACE:
mutex_exit(&spa->spa_activities_lock);
Expand Down Expand Up @@ -9443,15 +9453,16 @@ spa_wait_common(const char *pool, zpool_wait_activity_t activity,
{
/*
* The tag is used to distinguish between instances of an activity.
* 'initialize' is the only activity that we use this for. The other
* activities can only have a single instance in progress in a pool at
* one time, making the tag unnecessary.
* 'initialize' and 'trim' are the only activities that we use this for.
* The other activities can only have a single instance in progress in a
* pool at one time, making the tag unnecessary.
*
* There can be multiple devices being replaced at once, but since they
* all finish once resilvering finishes, we don't bother keeping track
* of them individually, we just wait for them all to finish.
*/
if (use_tag && activity != ZPOOL_WAIT_INITIALIZE)
if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
activity != ZPOOL_WAIT_TRIM)
return (EINVAL);

if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
Expand Down
3 changes: 3 additions & 0 deletions module/zfs/vdev_trim.c
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,9 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
}

dmu_tx_commit(tx);

if (new_state != VDEV_TRIM_ACTIVE)
spa_notify_waiters(spa);
}

/*
Expand Down
1 change: 1 addition & 0 deletions tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,7 @@ tests = ['zpool_wait_discard', 'zpool_wait_freeing',
'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel',
'zpool_wait_initialize_flag', 'zpool_wait_multiple',
'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel',
'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag',
'zpool_wait_usage']
tags = ['functional', 'cli_root', 'zpool_wait']

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ dist_pkgdata_SCRIPTS = \
zpool_wait_no_activity.ksh \
zpool_wait_remove.ksh \
zpool_wait_remove_cancel.ksh \
zpool_wait_trim_basic.ksh \
zpool_wait_trim_cancel.ksh \
zpool_wait_trim_flag.ksh \
zpool_wait_usage.ksh

dist_pkgdata_DATA = \
Expand Down
Loading

0 comments on commit 2288d41

Please sign in to comment.