Skip to content
This repository has been archived by the owner on Nov 7, 2019. It is now read-only.

Commit

Permalink
9290 device removal reduces redundancy of mirrors
Browse files Browse the repository at this point in the history
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed by: Sara Hartse <sara.hartse@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Tim Chase <tim@chase2k.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
  • Loading branch information
ahrens authored and prakashsurya committed Apr 16, 2018
1 parent ff9e88c commit 3a4b1be
Show file tree
Hide file tree
Showing 15 changed files with 900 additions and 232 deletions.
18 changes: 11 additions & 7 deletions usr/src/cmd/zdb/zdb.c
Expand Up @@ -3006,7 +3006,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_t *vd = svr->svr_vdev;
vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
Expand All @@ -3022,13 +3022,17 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
svr->svr_allocd_segs, SM_ALLOC));

/*
* Clear everything past what has been synced,
* because we have not allocated mappings for it yet.
* Clear everything past what has been synced unless
* it's past the spacemap, because we have not allocated
* mappings for it yet.
*/
range_tree_clear(svr->svr_allocd_segs,
vdev_indirect_mapping_max_offset(vim),
msp->ms_sm->sm_start + msp->ms_sm->sm_size -
vdev_indirect_mapping_max_offset(vim));
uint64_t vim_max_offset =
vdev_indirect_mapping_max_offset(vim);
uint64_t sm_end = msp->ms_sm->sm_start +
msp->ms_sm->sm_size;
if (sm_end > vim_max_offset)
range_tree_clear(svr->svr_allocd_segs,
vim_max_offset, sm_end - vim_max_offset);
}

zcb->zcb_removing_size +=
Expand Down
58 changes: 55 additions & 3 deletions usr/src/cmd/ztest/ztest.c
Expand Up @@ -436,6 +436,7 @@ static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static kmutex_t ztest_checkpoint_lock;
static boolean_t ztest_device_removal_active = B_FALSE;

/*
* The ztest_name_lock protects the pool and dataset namespace used by
Expand Down Expand Up @@ -2880,7 +2881,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
* value. Don't bother trying to attach while we are in the middle
* of removal.
*/
if (spa->spa_vdev_removal != NULL) {
if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_ALL, FTAG);
mutex_exit(&ztest_vdev_lock);
return;
Expand Down Expand Up @@ -3055,16 +3056,49 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id)
spa_t *spa = ztest_spa;
vdev_t *vd;
uint64_t guid;
int error;

mutex_enter(&ztest_vdev_lock);

if (ztest_device_removal_active) {
mutex_exit(&ztest_vdev_lock);
return;
}

/*
* Remove a random top-level vdev and wait for removal to finish.
*/
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
guid = vd->vdev_guid;
spa_config_exit(spa, SCL_VDEV, FTAG);

(void) spa_vdev_remove(spa, guid, B_FALSE);
error = spa_vdev_remove(spa, guid, B_FALSE);
if (error == 0) {
ztest_device_removal_active = B_TRUE;
mutex_exit(&ztest_vdev_lock);

while (spa->spa_vdev_removal != NULL)
txg_wait_synced(spa_get_dsl(spa), 0);
} else {
mutex_exit(&ztest_vdev_lock);
return;
}

/*
* The pool needs to be scrubbed after completing device removal.
* Failure to do so may result in checksum errors due to the
* strategy employed by ztest_fault_inject() when selecting which
* offsets are redundant and can be damaged.
*/
error = spa_scan(spa, POOL_SCAN_SCRUB);
if (error == 0) {
while (dsl_scan_scrubbing(spa_get_dsl(spa)))
txg_wait_synced(spa_get_dsl(spa), 0);
}

mutex_enter(&ztest_vdev_lock);
ztest_device_removal_active = B_FALSE;
mutex_exit(&ztest_vdev_lock);
}

Expand Down Expand Up @@ -3203,7 +3237,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
* that the metaslab_class space increased (because it decreases
* when the device removal completes).
*/
if (spa->spa_vdev_removal != NULL) {
if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
mutex_exit(&ztest_checkpoint_lock);
Expand Down Expand Up @@ -4988,6 +5022,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
boolean_t islog = B_FALSE;

mutex_enter(&ztest_vdev_lock);

/*
* Device removal is in progress, fault injection must be disabled
* until it completes and the pool is scrubbed. The fault injection
* strategy for damaging blocks does not take into account evacuated
* blocks which may have already been damaged.
*/
if (ztest_device_removal_active) {
mutex_exit(&ztest_vdev_lock);
return;
}

maxfaults = MAXFAULTS();
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
mirror_save = zs->zs_mirrors;
Expand Down Expand Up @@ -5333,6 +5379,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;

/*
* Device removal is active and runs its own scrub; don't start another.
*/
if (ztest_device_removal_active)
return;

(void) spa_scan(spa, POOL_SCAN_SCRUB);
(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
(void) spa_scan(spa, POOL_SCAN_SCRUB);
Expand Down
2 changes: 1 addition & 1 deletion usr/src/lib/libzfs/common/libzfs_pool.c
Expand Up @@ -2808,7 +2808,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,

case EBUSY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
"or pool has removing/removed vdevs"),
"or device removal is in progress"),
new_disk);
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
Expand Down
3 changes: 2 additions & 1 deletion usr/src/lib/libzpool/common/llib-lzpool
Expand Up @@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/

/* LINTLIBRARY */
Expand All @@ -39,6 +39,7 @@
#include <sys/dnode.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_scan.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/space_map.h>
Expand Down
11 changes: 10 additions & 1 deletion usr/src/uts/common/fs/zfs/dsl_scan.c
Expand Up @@ -1988,7 +1988,16 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,

/* if it's a resilver, this may not be in the target range */
if (!needs_io) {
if (DVA_GET_GANG(&bp->blk_dva[d])) {
if (vd->vdev_ops == &vdev_indirect_ops) {
/*
* The indirect vdev can point to multiple
* vdevs. For simplicity, always create
* the resilver zio_t. zio_vdev_io_start()
* will bypass the child resilver i/o's if
* they are on vdevs that don't have DTL's.
*/
needs_io = B_TRUE;
} else if (DVA_GET_GANG(&bp->blk_dva[d])) {
/*
* Gang members may be spread across multiple
* vdevs, so the best estimate we have is the
Expand Down
2 changes: 1 addition & 1 deletion usr/src/uts/common/fs/zfs/metaslab.c
Expand Up @@ -3582,7 +3582,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
return;

if (spa->spa_vdev_removal != NULL &&
spa->spa_vdev_removal->svr_vdev == vd &&
spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
vdev_is_concrete(vd)) {
/*
* Note: we check if the vdev is concrete because when
Expand Down
7 changes: 2 additions & 5 deletions usr/src/uts/common/fs/zfs/spa.c
Expand Up @@ -5509,8 +5509,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
if (spa->spa_vdev_removal != NULL &&
tvd->vdev_ashift !=
spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/* Fail if top level vdev is raidz */
Expand Down Expand Up @@ -5626,10 +5625,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
return (spa_vdev_exit(spa, NULL, txg, error));
}

if (spa->spa_vdev_removal != NULL ||
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
}

if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
Expand Down
5 changes: 4 additions & 1 deletion usr/src/uts/common/fs/zfs/spa_misc.c
Expand Up @@ -1761,9 +1761,12 @@ spa_update_dspace(spa_t *spa)
* allocated twice (on the old device and the new
* device).
*/
vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
vdev_t *vd =
vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
spa->spa_dspace -= spa_deflate(spa) ?
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
spa_config_exit(spa, SCL_VDEV, FTAG);
}
}

Expand Down
2 changes: 1 addition & 1 deletion usr/src/uts/common/fs/zfs/sys/vdev_removal.h
Expand Up @@ -30,7 +30,7 @@ extern "C" {
#endif

typedef struct spa_vdev_removal {
vdev_t *svr_vdev;
uint64_t svr_vdev_id;
uint64_t svr_max_offset_to_sync[TXG_SIZE];
/* Thread performing a vdev removal. */
kthread_t *svr_thread;
Expand Down
2 changes: 1 addition & 1 deletion usr/src/uts/common/fs/zfs/sys/zio.h
Expand Up @@ -550,7 +550,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
zio_done_func_t *done, void *private);

extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
struct abd *data, uint64_t size, int type, zio_priority_t priority,
struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private);

extern void zio_vdev_io_bypass(zio_t *zio);
Expand Down
26 changes: 26 additions & 0 deletions usr/src/uts/common/fs/zfs/vdev.c
Expand Up @@ -867,6 +867,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_stat.vs_space = 0;
svd->vdev_stat.vs_dspace = 0;

/*
* State which may be set on a top-level vdev that's in the
* process of being removed.
*/
ASSERT0(tvd->vdev_indirect_config.vic_births_object);
ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
ASSERT0(tvd->vdev_removing);
tvd->vdev_removing = svd->vdev_removing;
tvd->vdev_indirect_config = svd->vdev_indirect_config;
tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
tvd->vdev_indirect_births = svd->vdev_indirect_births;
range_tree_swap(&svd->vdev_obsolete_segments,
&tvd->vdev_obsolete_segments);
tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
svd->vdev_indirect_config.vic_mapping_object = 0;
svd->vdev_indirect_config.vic_births_object = 0;
svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
svd->vdev_indirect_mapping = NULL;
svd->vdev_indirect_births = NULL;
svd->vdev_obsolete_sm = NULL;
svd->vdev_removing = 0;

for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
Expand Down

0 comments on commit 3a4b1be

Please sign in to comment.