Skip to content
This repository has been archived by the owner on Nov 7, 2019. It is now read-only.

Commit

Permalink
9486 reduce memory used by device removal on fragmented pools
Browse files Browse the repository at this point in the history
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Tim Chase <tim@chase2k.com>
Approved by: Robert Mustacchi <rm@joyent.com>
  • Loading branch information
ahrens authored and Prakash Surya committed Jun 1, 2018
1 parent c7a7b2f commit cfd63e1
Show file tree
Hide file tree
Showing 5 changed files with 244 additions and 46 deletions.
23 changes: 21 additions & 2 deletions usr/src/uts/common/fs/zfs/range_tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,15 +298,14 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
/*
 * Look up the segment in rt that contains (or exactly matches) the range
 * [start, start + size).  Returns the matching range_seg_t, or NULL if no
 * segment in the tree overlaps the search range.  The AVL comparator for
 * rt_root treats overlapping segments as equal, which is what makes this
 * containment lookup work with a plain avl_find().
 *
 * NOTE(review): the scraped diff left both the pre- and post-change return
 * statements in place plus a dead `avl_index_t where' local; this is the
 * committed (post-change) form, which passes NULL since the insertion
 * point is never used.
 */
static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t rsearch;
	uint64_t end = start + size;

	/* A zero-length search range is meaningless; catch it early. */
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	return (avl_find(&rt->rt_root, &rsearch, NULL));
}

static range_seg_t *
Expand Down Expand Up @@ -407,3 +406,23 @@ range_tree_is_empty(range_tree_t *rt)
ASSERT(rt != NULL);
return (range_tree_space(rt) == 0);
}

/*
 * Return the start offset of the lowest segment in the range tree,
 * or 0 if the tree contains no segments.
 */
uint64_t
range_tree_min(range_tree_t *rt)
{
	range_seg_t *first = avl_first(&rt->rt_root);

	if (first == NULL)
		return (0);
	return (first->rs_start);
}

/*
 * Return the end offset of the highest segment in the range tree,
 * or 0 if the tree contains no segments.
 */
uint64_t
range_tree_max(range_tree_t *rt)
{
	range_seg_t *last = avl_last(&rt->rt_root);

	if (last == NULL)
		return (0);
	return (last->rs_end);
}

/*
 * Return the distance covered by the tree's segments: the end of the
 * highest segment minus the start of the lowest.  An empty tree yields
 * 0 - 0 == 0.
 */
uint64_t
range_tree_span(range_tree_t *rt)
{
	uint64_t lo = range_tree_min(rt);
	uint64_t hi = range_tree_max(rt);

	return (hi - lo);
}
3 changes: 3 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/range_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
uint64_t range_tree_min(range_tree_t *rt);
uint64_t range_tree_max(range_tree_t *rt);
uint64_t range_tree_span(range_tree_t *rt);

void range_tree_add(void *arg, uint64_t start, uint64_t size);
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
Expand Down
3 changes: 3 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/vdev_removal.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ extern void spa_vdev_remove_suspend(spa_t *);
extern int spa_vdev_remove_cancel(spa_t *);
extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);

extern int vdev_removal_max_span;
extern int zfs_remove_max_segment;

#ifdef __cplusplus
}
#endif
Expand Down
53 changes: 32 additions & 21 deletions usr/src/uts/common/fs/zfs/vdev_label.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@
* 1. Uniquely identify this device as part of a ZFS pool and confirm its
* identity within the pool.
*
* 2. Verify that all the devices given in a configuration are present
* 2. Verify that all the devices given in a configuration are present
* within the pool.
*
* 3. Determine the uberblock for the pool.
* 3. Determine the uberblock for the pool.
*
* 4. In case of an import operation, determine the configuration of the
* 4. In case of an import operation, determine the configuration of the
* toplevel vdev of which it is a part.
*
* 5. If an import operation cannot find all the devices in the pool,
* 5. If an import operation cannot find all the devices in the pool,
* provide enough information to the administrator to determine which
* devices are missing.
*
Expand Down Expand Up @@ -77,9 +77,9 @@
* In order to identify which labels are valid, the labels are written in the
* following manner:
*
* 1. For each vdev, update 'L1' to the new label
* 2. Update the uberblock
* 3. For each vdev, update 'L2' to the new label
* 1. For each vdev, update 'L1' to the new label
* 2. Update the uberblock
* 3. For each vdev, update 'L2' to the new label
*
* Given arbitrary failure, we can determine the correct label to use based on
* the transaction group. If we fail after updating L1 but before updating the
Expand Down Expand Up @@ -117,19 +117,19 @@
*
* The nvlist describing the pool and vdev contains the following elements:
*
* version ZFS on-disk version
* name Pool name
* state Pool state
* txg Transaction group in which this label was written
* pool_guid Unique identifier for this pool
* vdev_tree An nvlist describing vdev tree.
* version ZFS on-disk version
* name Pool name
* state Pool state
* txg Transaction group in which this label was written
* pool_guid Unique identifier for this pool
* vdev_tree An nvlist describing vdev tree.
* features_for_read
* An nvlist of the features necessary for reading the MOS.
*
* Each leaf device label also contains the following:
*
* top_guid Unique ID for top-level vdev in which this is contained
* guid Unique ID for the leaf vdev
* top_guid Unique ID for top-level vdev in which this is contained
* guid Unique ID for the leaf vdev
*
* The 'vs' configuration follows the format described in 'spa_config.c'.
*/
Expand Down Expand Up @@ -390,22 +390,33 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
* histograms.
*/
uint64_t seg_count = 0;
uint64_t to_alloc = vd->vdev_stat.vs_alloc;

/*
* There are the same number of allocated segments
* as free segments, so we will have at least one
* entry per free segment.
* entry per free segment. However, small free
* segments (smaller than vdev_removal_max_span)
* will be combined with adjacent allocated segments
* as a single mapping.
*/
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
seg_count += vd->vdev_mg->mg_histogram[i];
if (1ULL << (i + 1) < vdev_removal_max_span) {
to_alloc +=
vd->vdev_mg->mg_histogram[i] <<
i + 1;
} else {
seg_count +=
vd->vdev_mg->mg_histogram[i];
}
}

/*
* The maximum length of a mapping is SPA_MAXBLOCKSIZE,
* so we need at least one entry per SPA_MAXBLOCKSIZE
* of allocated data.
* The maximum length of a mapping is
* zfs_remove_max_segment, so we need at least one entry
* per zfs_remove_max_segment of allocated data.
*/
seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE;
seg_count += to_alloc / zfs_remove_max_segment;

fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
seg_count *
Expand Down
Loading

0 comments on commit cfd63e1

Please sign in to comment.