Skip to content

Commit

Permalink
Teach zpool scrub to scrub only blocks in error log
Browse files Browse the repository at this point in the history
Added a flag '-e' in zpool scrub to scrub only blocks in error log. A
user can pause, resume and cancel the error scrub by passing additional
command line arguments -p -s just like a regular scrub. This involves
adding a new flag, creating new libzfs interfaces, a new ioctl, and the
actual iteration and read-issuing logic. Error scrubbing is executed in
multiple txg to make sure pool performance is not affected.

Co-authored-by: TulsiJain tulsi.jain@delphix.com
Signed-off-by: George Amanakis <gamanakis@gmail.com>
  • Loading branch information
gamanakis committed May 11, 2023
1 parent 4eca03f commit ff7bd26
Show file tree
Hide file tree
Showing 29 changed files with 1,601 additions and 71 deletions.
111 changes: 101 additions & 10 deletions cmd/zpool/zpool_main.c
Expand Up @@ -401,7 +401,7 @@ get_usage(zpool_help_t idx)
return (gettext("\tinitialize [-c | -s] [-w] <pool> "
"[<device> ...]\n"));
case HELP_SCRUB:
return (gettext("\tscrub [-s | -p] [-w] <pool> ...\n"));
return (gettext("\tscrub [-s | -p] [-w] [-e] <pool> ...\n"));
case HELP_RESILVER:
return (gettext("\tresilver <pool> ...\n"));
case HELP_TRIM:
Expand Down Expand Up @@ -7297,8 +7297,9 @@ wait_callback(zpool_handle_t *zhp, void *data)
}

/*
* zpool scrub [-s | -p] [-w] <pool> ...
* zpool scrub [-s | -p] [-w] [-e] <pool> ...
*
* -e Only scrub blocks in the error log.
* -s Stop. Stops any in-progress scrub.
* -p Pause. Pause in-progress scrub.
* -w Wait. Blocks until scrub has completed.
Expand All @@ -7314,14 +7315,21 @@ zpool_do_scrub(int argc, char **argv)
cb.cb_type = POOL_SCAN_SCRUB;
cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;

boolean_t is_error_scrub = B_FALSE;
boolean_t is_pause = B_FALSE;
boolean_t is_stop = B_FALSE;

/* check options */
while ((c = getopt(argc, argv, "spw")) != -1) {
while ((c = getopt(argc, argv, "spwe")) != -1) {
switch (c) {
case 'e':
is_error_scrub = B_TRUE;
break;
case 's':
cb.cb_type = POOL_SCAN_NONE;
is_stop = B_TRUE;
break;
case 'p':
cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
is_pause = B_TRUE;
break;
case 'w':
wait = B_TRUE;
Expand All @@ -7333,11 +7341,21 @@ zpool_do_scrub(int argc, char **argv)
}
}

if (cb.cb_type == POOL_SCAN_NONE &&
cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) {
(void) fprintf(stderr, gettext("invalid option combination: "
"-s and -p are mutually exclusive\n"));
if (is_pause && is_stop) {
(void) fprintf(stderr, gettext("invalid option "
"combination :-s and -p are mutually exclusive\n"));
usage(B_FALSE);
} else {
if (is_error_scrub)
cb.cb_type = POOL_SCAN_ERRORSCRUB;

if (is_pause) {
cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
} else if (is_stop) {
cb.cb_type = POOL_SCAN_NONE;
} else {
cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
}
}

if (wait && (cb.cb_type == POOL_SCAN_NONE ||
Expand Down Expand Up @@ -7561,6 +7579,70 @@ secs_to_dhms(uint64_t total, char *buf)
}
}

/*
* Print out detailed error scrub status.
*/
static void
print_err_scrub_status(pool_scan_stat_t *ps)
{
time_t start, end, pause;
uint64_t total_secs_left;
uint64_t secs_left, mins_left, hours_left, days_left;
uint64_t examined, to_be_examined;

if (ps == NULL || ps->pss_error_scrub_func != POOL_SCAN_ERRORSCRUB) {
return;
}

(void) printf(gettext(" scrub: "));

start = ps->pss_error_scrub_start;
end = ps->pss_error_scrub_end;
pause = ps->pss_pass_error_scrub_pause;
examined = ps->pss_error_scrub_examined;
to_be_examined = ps->pss_error_scrub_to_be_examined;

assert(ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB);

if (ps->pss_error_scrub_state == DSS_FINISHED) {
total_secs_left = end - start;
days_left = total_secs_left / 60 / 60 / 24;
hours_left = (total_secs_left / 60 / 60) % 24;
mins_left = (total_secs_left / 60) % 60;
secs_left = (total_secs_left % 60);

(void) printf(gettext("scrubbed %llu error blocks in %llu days "
"%02llu:%02llu:%02llu on %s"), (u_longlong_t)examined,
(u_longlong_t)days_left, (u_longlong_t)hours_left,
(u_longlong_t)mins_left, (u_longlong_t)secs_left,
ctime(&end));

return;
} else if (ps->pss_error_scrub_state == DSS_CANCELED) {
(void) printf(gettext("error scrub canceled on %s"),
ctime(&end));
return;
}
assert(ps->pss_error_scrub_state == DSS_ERRORSCRUBBING);

/* Error scrub is in progress. */
if (pause == 0) {
(void) printf(gettext("error scrub in progress since %s"),
ctime(&start));
} else {
(void) printf(gettext("error scrub paused since %s"),
ctime(&pause));
(void) printf(gettext("\terror scrub started on %s"),
ctime(&start));
}

double fraction_done = (double)examined / (to_be_examined + examined);
(void) printf(gettext("\t%.2f%% done, issued I/O for %llu error"
" blocks"), 100 * fraction_done, (u_longlong_t)examined);

(void) printf("\n");
}

/*
* Print out detailed scrub status.
*/
Expand Down Expand Up @@ -7897,10 +7979,12 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
{
uint64_t rebuild_end_time = 0, resilver_end_time = 0;
boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE;
boolean_t have_errorscrub = B_FALSE;
boolean_t active_resilver = B_FALSE;
pool_checkpoint_stat_t *pcs = NULL;
pool_scan_stat_t *ps = NULL;
uint_t c;
time_t scrub_start = 0, errorscrub_start = 0;

if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &c) == 0) {
Expand All @@ -7909,16 +7993,23 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
active_resilver = (ps->pss_state == DSS_SCANNING);
}


have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
scrub_start = ps->pss_start_time;
have_errorscrub = (ps->pss_error_scrub_func ==
POOL_SCAN_ERRORSCRUB);
errorscrub_start = ps->pss_error_scrub_start;
}

boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);
boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0));

/* Always print the scrub status when available. */
if (have_scrub)
if (have_scrub && scrub_start > errorscrub_start)
print_scan_scrub_resilver_status(ps);
else if (have_errorscrub && errorscrub_start >= scrub_start)
print_err_scrub_status(ps);

/*
* When there is an active resilver or rebuild print its status.
Expand Down
3 changes: 3 additions & 0 deletions include/libzfs.h
Expand Up @@ -125,11 +125,14 @@ typedef enum zfs_error {
EZFS_THREADCREATEFAILED, /* thread create failed */
EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */
EZFS_SCRUBBING, /* currently scrubbing */
EZFS_ERRORSCRUBBING, /* currently error scrubbing */
EZFS_ERRORSCRUB_PAUSED, /* error scrub currently paused */
EZFS_NO_SCRUB, /* no active scrub */
EZFS_DIFF, /* general failure of zfs diff */
EZFS_DIFFDATA, /* bad zfs diff data */
EZFS_POOLREADONLY, /* pool is in read-only mode */
EZFS_SCRUB_PAUSED, /* scrub currently paused */
EZFS_SCRUB_PAUSED_TO_CANCEL, /* scrub currently paused */
EZFS_ACTIVE_POOL, /* pool is imported on a different system */
EZFS_CRYPTOFAILED, /* failed to setup encryption */
EZFS_NO_PENDING, /* cannot cancel, no operation is pending */
Expand Down
2 changes: 2 additions & 0 deletions include/libzfs_core.h
Expand Up @@ -155,6 +155,8 @@ _LIBZFS_CORE_H int lzc_get_bootenv(const char *, nvlist_t **);
_LIBZFS_CORE_H int lzc_get_vdev_prop(const char *, nvlist_t *, nvlist_t **);
_LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);

_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);

#ifdef __cplusplus
}
#endif
Expand Down
1 change: 1 addition & 0 deletions include/sys/dmu.h
Expand Up @@ -378,6 +378,7 @@ typedef struct dmu_buf {
#define DMU_POOL_DDT_STATS "DDT-statistics"
#define DMU_POOL_CREATION_VERSION "creation_version"
#define DMU_POOL_SCAN "scan"
#define DMU_POOL_ERRORSCRUB "error_scrub"
#define DMU_POOL_FREE_BPOBJ "free_bpobj"
#define DMU_POOL_BPTREE_OBJ "bptree_obj"
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
Expand Down
27 changes: 26 additions & 1 deletion include/sys/dsl_scan.h
Expand Up @@ -29,6 +29,7 @@

#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/ddt.h>
#include <sys/bplist.h>

Expand Down Expand Up @@ -78,6 +79,21 @@ typedef enum dsl_scan_flags {

#define DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN)

typedef struct dsl_errorscrub_phys {
uint64_t dep_func; /* pool_scan_func_t */
uint64_t dep_state; /* dsl_scan_state_t */
uint64_t dep_cursor; /* serialized zap cursor for tracing progress */
uint64_t dep_start_time; /* error scrub start time, unix timestamp */
uint64_t dep_end_time; /* error scrub end time, unix timestamp */
uint64_t dep_to_examine; /* total error blocks to be scrubbed */
uint64_t dep_examined; /* blocks scrubbed so far */
uint64_t dep_errors; /* error scrub I/O error count */
uint64_t dep_paused_flags; /* flag for paused */
} dsl_errorscrub_phys_t;

#define ERRORSCRUB_PHYS_NUMINTS (sizeof (dsl_errorscrub_phys_t) \
/ sizeof (uint64_t))

/*
* Every pool will have one dsl_scan_t and this structure will contain
* in-memory information about the scan and a pointer to the on-disk
Expand Down Expand Up @@ -151,11 +167,15 @@ typedef struct dsl_scan {
uint64_t scn_avg_zio_size_this_txg;
uint64_t scn_zios_this_txg;

/* zap cursor for tracing error scrub progress */
zap_cursor_t errorscrub_cursor;
/* members needed for syncing scan status to disk */
dsl_scan_phys_t scn_phys; /* on disk representation of scan */
dsl_scan_phys_t scn_phys_cached;
avl_tree_t scn_queue; /* queue of datasets to scan */
uint64_t scn_queues_pending; /* outstanding data to issue */
/* members needed for syncing error scrub status to disk */
dsl_errorscrub_phys_t errorscrub_phys;
} dsl_scan_t;

typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
Expand All @@ -171,8 +191,12 @@ int dsl_scan_cancel(struct dsl_pool *);
int dsl_scan(struct dsl_pool *, pool_scan_func_t);
void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd);
boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
boolean_t dsl_errorscrubbing(const struct dsl_pool *dp);
boolean_t dsl_errorscrub_active(dsl_scan_t *scn);
void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg);
int dsl_scrub_set_pause_resume(const struct dsl_pool *dp,
pool_scrub_cmd_t cmd);
void dsl_errorscrub_sync(struct dsl_pool *, dmu_tx_t *);
boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
Expand All @@ -184,6 +208,7 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
struct dmu_tx *tx);
boolean_t dsl_scan_active(dsl_scan_t *scn);
boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
boolean_t dsl_errorscrub_is_paused(const dsl_scan_t *scn);
void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
Expand Down
19 changes: 18 additions & 1 deletion include/sys/fs/zfs.h
Expand Up @@ -1036,6 +1036,7 @@ typedef enum pool_scan_func {
POOL_SCAN_NONE,
POOL_SCAN_SCRUB,
POOL_SCAN_RESILVER,
POOL_SCAN_ERRORSCRUB,
POOL_SCAN_FUNCS
} pool_scan_func_t;

Expand Down Expand Up @@ -1099,6 +1100,20 @@ typedef struct pool_scan_stat {
uint64_t pss_pass_scrub_spent_paused;
uint64_t pss_pass_issued; /* issued bytes per scan pass */
uint64_t pss_issued; /* total bytes checked by scanner */

/* error scrub values stored on disk */
uint64_t pss_error_scrub_func; /* pool_scan_func_t */
uint64_t pss_error_scrub_state; /* dsl_scan_state_t */
uint64_t pss_error_scrub_start; /* error scrub start time */
uint64_t pss_error_scrub_end; /* error scrub end time */
uint64_t pss_error_scrub_examined; /* error blocks issued I/O */
/* error blocks to be issued I/O */
uint64_t pss_error_scrub_to_be_examined;

/* error scrub values not stored on disk */
/* error scrub pause time in milliseconds */
uint64_t pss_pass_error_scrub_pause;

} pool_scan_stat_t;

typedef struct pool_removal_stat {
Expand All @@ -1120,6 +1135,7 @@ typedef enum dsl_scan_state {
DSS_SCANNING,
DSS_FINISHED,
DSS_CANCELED,
DSS_ERRORSCRUBBING,
DSS_NUM_STATES
} dsl_scan_state_t;

Expand Down Expand Up @@ -1359,7 +1375,7 @@ typedef enum {
*/
typedef enum zfs_ioc {
/*
* Core features - 81/128 numbers reserved.
* Core features - 88/128 numbers reserved.
*/
#ifdef __FreeBSD__
ZFS_IOC_FIRST = 0,
Expand Down Expand Up @@ -1454,6 +1470,7 @@ typedef enum zfs_ioc {
ZFS_IOC_WAIT_FS, /* 0x5a54 */
ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */
ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */
ZFS_IOC_POOL_SCRUB, /* 0x5a57 */

/*
* Per-platform (Optional) - 8/128 numbers reserved.
Expand Down
8 changes: 8 additions & 0 deletions include/sys/spa.h
Expand Up @@ -1154,6 +1154,7 @@ extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
extern uint64_t spa_approx_errlog_size(spa_t *spa);
extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
extern uint64_t spa_get_last_errlog_size(spa_t *spa);
extern void spa_errlog_rotate(spa_t *spa);
extern void spa_errlog_drain(spa_t *spa);
extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
Expand All @@ -1164,6 +1165,13 @@ extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds,
extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj,
dmu_tx_t *tx);
extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx);
extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds,
zbookmark_err_phys_t *zep, uint64_t *top_affected_fs);
extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep,
uint64_t *birth_txg);
extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep,
zbookmark_phys_t *zb);
extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep);

/* vdev cache */
extern void vdev_cache_stat_init(void);
Expand Down
4 changes: 4 additions & 0 deletions include/sys/spa_impl.h
Expand Up @@ -295,6 +295,10 @@ struct spa {
uint64_t spa_scan_pass_exam; /* examined bytes per pass */
uint64_t spa_scan_pass_issued; /* issued bytes per pass */

/* error scrub pause time in milliseconds */
uint64_t spa_scan_pass_errorscrub_pause;
/* total error scrub paused time in milliseconds */
uint64_t spa_scan_pass_errorscrub_spent_paused;
/*
* We are in the middle of a resilver, and another resilver
* is needed once this one completes. This is set iff any
Expand Down
5 changes: 5 additions & 0 deletions include/sys/sysevent/eventdefs.h
Expand Up @@ -123,6 +123,11 @@ extern "C" {
#define ESC_ZFS_TRIM_CANCEL "trim_cancel"
#define ESC_ZFS_TRIM_RESUME "trim_resume"
#define ESC_ZFS_TRIM_SUSPEND "trim_suspend"
#define ESC_ZFS_ERRORSCRUB_START "errorscrub_start"
#define ESC_ZFS_ERRORSCRUB_FINISH "errorscrub_finish"
#define ESC_ZFS_ERRORSCRUB_ABORT "errorscrub_abort"
#define ESC_ZFS_ERRORSCRUB_RESUME "errorscrub_resume"
#define ESC_ZFS_ERRORSCRUB_PAUSED "errorscrub_paused"

/*
* datalink subclass definitions.
Expand Down
3 changes: 2 additions & 1 deletion lib/libzfs/libzfs.abi
Expand Up @@ -5717,7 +5717,8 @@
<enumerator name='POOL_SCAN_NONE' value='0'/>
<enumerator name='POOL_SCAN_SCRUB' value='1'/>
<enumerator name='POOL_SCAN_RESILVER' value='2'/>
<enumerator name='POOL_SCAN_FUNCS' value='3'/>
<enumerator name='POOL_SCAN_ERRORSCRUB' value='3'/>
<enumerator name='POOL_SCAN_FUNCS' value='4'/>
</enum-decl>
<typedef-decl name='pool_scan_func_t' type-id='1b092565' id='7313fbe2'/>
<enum-decl name='pool_scrub_cmd' id='a1474cbd'>
Expand Down

0 comments on commit ff7bd26

Please sign in to comment.