Skip to content

Commit

Permalink
8414 Implemented zpool scrub pause/resume
Browse files Browse the repository at this point in the history
illumos/illumos-gate@1702cce
illumos/illumos-gate@1702cce

https://www.illumos.org/issues/8414
  This issue tracks the port of scrub pause from ZoL: openzfs/zfs#6167
  Currently, there is no way to pause a scrub. Pausing may be useful when
 the pool is busy with other I/O to preserve bandwidth.

  Description

  This patch adds the ability to pause and resume scrubbing.  This is achieved
  by maintaining a persistent on-disk scrub state.  While the state is 'paused'
  we do not scrub any more blocks.  We do however perform regular scan
  housekeeping such as freeing async destroyed and deadlist blocks while paused.

  If you're testing this change, you probably want to include the patch from #6164

  Motivation and Context

  Scrub pausing can be an I/O intensive operation and people have been asking
  for the ability to pause a scrub for a while. This allows one to preserve scrub
  progress while freeing up bandwidth for other I/O.

  How Has This Been Tested?

  Unit testing and zfs-tests.  to the pool.  This patch will also include the
  patch from https://github.com/zfsonlinux/zfs/ pull/6164 In certain cases
  (dsl_scan_sync() is one), we may end up calling

Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Author: Alek Pinchuk <apinchuk@datto.com>


git-svn-id: https://svn.freebsd.org/base/vendor-sys/illumos/dist@323107 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f
  • Loading branch information
avg committed Sep 1, 2017
1 parent f665625 commit dc60830
Show file tree
Hide file tree
Showing 9 changed files with 197 additions and 39 deletions.
4 changes: 4 additions & 0 deletions uts/common/fs/zfs/bpobj.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2017 Datto Inc.
*/

#include <sys/bpobj.h>
Expand Down Expand Up @@ -212,6 +213,9 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,

mutex_enter(&bpo->bpo_lock);

if (!bpobj_hasentries(bpo))
goto out;

if (free)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

Expand Down
172 changes: 139 additions & 33 deletions uts/common/fs/zfs/dsl_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
* Copyright 2016 Gary Mills
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017 Datto Inc.
*/

#include <sys/dsl_scan.h>
Expand Down Expand Up @@ -285,6 +286,8 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
scn->scn_phys.scn_queue_obj = 0;
}

scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;

/*
* If we were "restarted" from a stopped state, don't bother
* with anything else.
Expand Down Expand Up @@ -369,6 +372,91 @@ dsl_scan_cancel(dsl_pool_t *dp)
dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
}

boolean_t
dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
{
if (dsl_scan_scrubbing(scn->scn_dp) &&
scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED)
return (B_TRUE);

return (B_FALSE);
}

static int
dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
{
pool_scrub_cmd_t *cmd = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_scan_t *scn = dp->dp_scan;

if (*cmd == POOL_SCRUB_PAUSE) {
/* can't pause a scrub when there is no in-progress scrub */
if (!dsl_scan_scrubbing(dp))
return (SET_ERROR(ENOENT));

/* can't pause a paused scrub */
if (dsl_scan_is_paused_scrub(scn))
return (SET_ERROR(EBUSY));
} else if (*cmd != POOL_SCRUB_NORMAL) {
return (SET_ERROR(ENOTSUP));
}

return (0);
}

static void
dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
{
pool_scrub_cmd_t *cmd = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
spa_t *spa = dp->dp_spa;
dsl_scan_t *scn = dp->dp_scan;

if (*cmd == POOL_SCRUB_PAUSE) {
/* can't pause a scrub when there is no in-progress scrub */
spa->spa_scan_pass_scrub_pause = gethrestime_sec();
scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
dsl_scan_sync_state(scn, tx);
} else {
ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
if (dsl_scan_is_paused_scrub(scn)) {
/*
* We need to keep track of how much time we spend
* paused per pass so that we can adjust the scrub rate
* shown in the output of 'zpool status'
*/
spa->spa_scan_pass_scrub_spent_paused +=
gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
spa->spa_scan_pass_scrub_pause = 0;
scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
dsl_scan_sync_state(scn, tx);
}
}
}

/*
* Set scrub pause/resume state if it makes sense to do so
*/
int
dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
{
return (dsl_sync_task(spa_name(dp->dp_spa),
dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
ZFS_SPACE_CHECK_RESERVED));
}

boolean_t
dsl_scan_scrubbing(const dsl_pool_t *dp)
{
dsl_scan_t *scn = dp->dp_scan;

if (scn->scn_phys.scn_state == DSS_SCANNING &&
scn->scn_phys.scn_func == POOL_SCAN_SCRUB)
return (B_TRUE);

return (B_FALSE);
}

static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
dmu_objset_type_t ostype, dmu_tx_t *tx);
Expand Down Expand Up @@ -410,14 +498,14 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
extern int zfs_vdev_async_write_active_min_dirty_percent;

static boolean_t
dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
{
/* we never skip user/group accounting objects */
if (zb && (int64_t)zb->zb_object < 0)
return (B_FALSE);

if (scn->scn_pausing)
return (B_TRUE); /* we're already pausing */
if (scn->scn_suspending)
return (B_TRUE); /* we're already suspending */

if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
return (B_FALSE); /* we're resuming */
Expand All @@ -427,7 +515,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
return (B_FALSE);

/*
* We pause if:
* We suspend if:
* - we have scanned for the maximum time: an entire txg
* timeout (default 5 sec)
* or
Expand All @@ -450,19 +538,19 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
spa_shutting_down(scn->scn_dp->dp_spa)) {
if (zb) {
dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
(longlong_t)zb->zb_objset,
(longlong_t)zb->zb_object,
(longlong_t)zb->zb_level,
(longlong_t)zb->zb_blkid);
scn->scn_phys.scn_bookmark = *zb;
}
dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n",
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
scn->scn_pausing = B_TRUE;
scn->scn_suspending = B_TRUE;
return (B_TRUE);
}
return (B_FALSE);
Expand Down Expand Up @@ -600,7 +688,7 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
/*
* If we found the block we're trying to resume from, or
* we went past it to a different object, zero it out to
* indicate that it's OK to start checking for pausing
* indicate that it's OK to start checking for suspending
* again.
*/
if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
Expand Down Expand Up @@ -703,7 +791,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
/*
* We also always visit user/group accounting
* objects, and never skip them, even if we are
* pausing. This is necessary so that the space
* suspending. This is necessary so that the space
* deltas from this txg get integrated.
*/
dsl_scan_visitdnode(scn, ds, osp->os_type,
Expand Down Expand Up @@ -759,7 +847,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,

/* ASSERT(pbuf == NULL || arc_released(pbuf)); */

if (dsl_scan_check_pause(scn, zb))
if (dsl_scan_check_suspend(scn, zb))
return;

if (dsl_scan_check_resume(scn, dnp, zb))
Expand Down Expand Up @@ -1096,14 +1184,14 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
dsl_dataset_name(ds, dsname);
zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
"pausing=%u",
"suspending=%u",
(longlong_t)dsobj, dsname,
(longlong_t)scn->scn_phys.scn_cur_min_txg,
(longlong_t)scn->scn_phys.scn_cur_max_txg,
(int)scn->scn_pausing);
(int)scn->scn_suspending);
kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);

if (scn->scn_pausing)
if (scn->scn_suspending)
goto out;

/*
Expand Down Expand Up @@ -1267,13 +1355,13 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
n++;

if (dsl_scan_check_pause(scn, NULL))
if (dsl_scan_check_suspend(scn, NULL))
break;
}

zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
(longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
(int)scn->scn_pausing);
zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
"suspending=%u", (longlong_t)n,
(int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);

ASSERT(error == 0 || error == ENOENT);
ASSERT(error != ENOENT ||
Expand Down Expand Up @@ -1316,7 +1404,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
dsl_scan_ddt(scn, tx);
if (scn->scn_pausing)
if (scn->scn_suspending)
return;
}

Expand All @@ -1328,7 +1416,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
dsl_scan_visit_rootbp(scn, NULL,
&dp->dp_meta_rootbp, tx);
spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
if (scn->scn_pausing)
if (scn->scn_suspending)
return;

if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
Expand All @@ -1338,22 +1426,22 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
dsl_scan_visitds(scn,
dp->dp_origin_snap->ds_object, tx);
}
ASSERT(!scn->scn_pausing);
ASSERT(!scn->scn_suspending);
} else if (scn->scn_phys.scn_bookmark.zb_objset !=
ZB_DESTROYED_OBJSET) {
/*
* If we were paused, continue from here. Note if the
* ds we were paused on was deleted, the zb_objset may
* If we were suspended, continue from here. Note if the
* ds we were suspended on was deleted, the zb_objset may
* be -1, so we will skip this and find a new objset
* below.
*/
dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
if (scn->scn_pausing)
if (scn->scn_suspending)
return;
}

/*
* In case we were paused right at the end of the ds, zero the
* In case we were suspended right at the end of the ds, zero the
* bookmark so we don't think that we're still trying to resume.
*/
bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
Expand Down Expand Up @@ -1385,14 +1473,14 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)

dsl_scan_visitds(scn, dsobj, tx);
zap_cursor_fini(&zc);
if (scn->scn_pausing)
if (scn->scn_suspending)
return;
}
zap_cursor_fini(&zc);
}

static boolean_t
dsl_scan_free_should_pause(dsl_scan_t *scn)
dsl_scan_free_should_suspend(dsl_scan_t *scn)
{
uint64_t elapsed_nanosecs;

Expand All @@ -1416,7 +1504,7 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)

if (!scn->scn_is_bptree ||
(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
if (dsl_scan_free_should_pause(scn))
if (dsl_scan_free_should_suspend(scn))
return (SET_ERROR(ERESTART));
}

Expand All @@ -1439,7 +1527,8 @@ dsl_scan_active(dsl_scan_t *scn)
return (B_FALSE);
if (spa_shutting_down(spa))
return (B_FALSE);
if (scn->scn_phys.scn_state == DSS_SCANNING ||
if ((scn->scn_phys.scn_state == DSS_SCANNING &&
!dsl_scan_is_paused_scrub(scn)) ||
(scn->scn_async_destroying && !scn->scn_async_stalled))
return (B_TRUE);

Expand Down Expand Up @@ -1494,12 +1583,12 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
return;

scn->scn_visited_this_txg = 0;
scn->scn_pausing = B_FALSE;
scn->scn_suspending = B_FALSE;
scn->scn_sync_start_time = gethrtime();
spa->spa_scrub_active = B_TRUE;

/*
* First process the async destroys. If we pause, don't do
* First process the async destroys. If we suspend, don't do
* any scrubbing or resilvering. This ensures that there are no
* async destroys while we are scanning, so the scan code doesn't
* have to worry about traversing it. It is also faster to free the
Expand Down Expand Up @@ -1616,7 +1705,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
return;

if (scn->scn_done_txg == tx->tx_txg) {
ASSERT(!scn->scn_pausing);
ASSERT(!scn->scn_suspending);
/* finished with scan. */
zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
dsl_scan_done(scn, B_TRUE, tx);
Expand All @@ -1625,6 +1714,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
return;
}

if (dsl_scan_is_paused_scrub(scn))
return;

if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
scn->scn_phys.scn_ddt_class_max) {
zfs_dbgmsg("doing scan sync txg %llu; "
Expand Down Expand Up @@ -1659,7 +1751,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));

if (!scn->scn_pausing) {
if (!scn->scn_suspending) {
scn->scn_done_txg = tx->tx_txg + 1;
zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
tx->tx_txg, scn->scn_done_txg);
Expand Down Expand Up @@ -1867,11 +1959,15 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
return (0);
}

/* Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver */
/*
* Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
* Can also be called to resume a paused scrub.
*/
int
dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
{
spa_t *spa = dp->dp_spa;
dsl_scan_t *scn = dp->dp_scan;

/*
* Purge all vdev caches and probe all devices. We do this here
Expand All @@ -1886,6 +1982,16 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
spa->spa_scrub_reopen = B_FALSE;
(void) spa_vdev_state_exit(spa, NULL, 0);

if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
/* got scrub start cmd, resume paused scrub */
int err = dsl_scrub_set_pause_resume(scn->scn_dp,
POOL_SCRUB_NORMAL);
if (err == 0)
return (ECANCELED);

return (SET_ERROR(err));
}

return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
}
Expand Down
Loading

0 comments on commit dc60830

Please sign in to comment.