Use deferred free mechanism for autotrim
Current autotrim has very high overhead:
1. Select a metaslab that needs to be trimmed, and disable it.
2. Push the issued zio_trim() into the txg sync pipeline by calling
txg_wait_synced(). Wait for the txg (tx_open_txg + 2) to be synced.
3. Re-enable the metaslab.

In this patch, I added ms_trimming, ms_trimmed, and
ms_trimmed_defer[2] to the metaslab to track the autotrim process.
These are the steps after the patch (a sketch in C follows the list):
1. Select a metaslab that needs to be trimmed, grabbing only ms_lock.
2. If the metaslab is loaded, move all the ms_trim ranges from
ms_allocatable to ms_trimming so that the ranges being trimmed cannot
be allocated. Then release ms_lock.
3. After issuing all the zio_trim() calls, move all the trimming
ranges from ms_trimming to ms_trimmed.
4. The trimmed ranges cycle through ms_trimmed_defer[2]; after that,
they are released back to ms_allocatable.
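For illustration, a minimal sketch of steps 1-3 in terms of the
range-tree calls this patch uses; the helper name and exact call sites
are hypothetical, not the patch's actual TRIM-thread code:

static void
autotrim_metaslab_sketch(metaslab_t *msp)
{
	/* Step 1: no metaslab_disable(), just ms_lock. */
	mutex_enter(&msp->ms_lock);
	if (msp->ms_loaded) {
		/*
		 * Step 2: pull the trim candidates out of ms_allocatable
		 * so they cannot be allocated while TRIM is in flight.
		 */
		range_tree_walk(msp->ms_trim, range_tree_remove,
		    msp->ms_allocatable);
		range_tree_vacate(msp->ms_trim, range_tree_add,
		    msp->ms_trimming);
	}
	mutex_exit(&msp->ms_lock);

	/* ... issue zio_trim() for every range in ms_trimming ... */

	/*
	 * Step 3: mark the ranges trimmed; metaslab_sync_done() later
	 * cycles them through ms_trimmed_defer[] (step 4).
	 */
	mutex_enter(&msp->ms_lock);
	range_tree_vacate(msp->ms_trimming, range_tree_add, msp->ms_trimmed);
	mutex_exit(&msp->ms_lock);
}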

Benefits:
No metaslab_disable() and no txg_wait_synced(). txg_wait_synced()
causes short-lived txgs and fragmentation.

Signed-off-by: jxdking <lostking2008@hotmail.com>
jxdking committed Jun 11, 2021
1 parent e5e76bd commit 4daf35b
Showing 3 changed files with 248 additions and 48 deletions.
3 changes: 3 additions & 0 deletions include/sys/metaslab_impl.h
@@ -423,6 +423,9 @@ struct metaslab {
 	 * facilitate efficient trimming.
 	 */
 	range_tree_t *ms_trim;
+	range_tree_t *ms_trimming;
+	range_tree_t *ms_trimmed;
+	range_tree_t *ms_trimmed_defer[TXG_DEFER_SIZE];

 	boolean_t ms_condensing;	/* condensing? */
 	boolean_t ms_condense_wanted;
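The patch adds these fields without comments; their roles, inferred
from the commit message above rather than stated anywhere in the diff,
are roughly:

/*
 *   ms_trimming        - ranges with zio_trim() in flight; kept out of
 *                        ms_allocatable so they cannot be allocated
 *   ms_trimmed         - TRIM completed, waiting to enter the defer cycle
 *   ms_trimmed_defer[] - trimmed ranges aging TXG_DEFER_SIZE (2) txgs
 *                        before being released back to ms_allocatable
 */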
161 changes: 151 additions & 10 deletions module/zfs/metaslab.c
@@ -145,6 +145,12 @@ int zfs_mg_fragmentation_threshold = 95;
  */
 int zfs_metaslab_fragmentation_threshold = 70;

+/*
+ * For debug purpose, if true, ms_trimmed ranges will be directly put back
+ * in ms_allocatable without being deferred.
+ */
+int metaslab_trimmed_defer_disable = B_FALSE;
+
 /*
  * When set will load all metaslabs when pool is first opened.
  */
@@ -1940,10 +1946,14 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg)

 	ASSERT3U(msp->ms_deferspace, ==,
 	    range_tree_space(msp->ms_defer[0]) +
-	    range_tree_space(msp->ms_defer[1]));
+	    range_tree_space(msp->ms_defer[1]) +
+	    range_tree_space(msp->ms_trimmed_defer[0]) +
+	    range_tree_space(msp->ms_trimmed_defer[1]));

 	msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
-	    msp->ms_deferspace + range_tree_space(msp->ms_freed);
+	    msp->ms_deferspace + range_tree_space(msp->ms_freed) +
+	    range_tree_space(msp->ms_trimming) +
+	    range_tree_space(msp->ms_trimmed);

 	VERIFY3U(sm_free_space, ==, msp_free_space);
 }
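Read as an identity, the patched check asserts that the space map's
free space equals the sum of every in-core state a free range can be
in. A standalone restatement, with plain integers standing in for
range_tree_space() results (names mirror the metaslab fields):

#include <stdint.h>

static uint64_t
msp_free_space_sketch(uint64_t allocatable, uint64_t allocating,
    uint64_t deferspace, uint64_t freed, uint64_t trimming,
    uint64_t trimmed)
{
	/*
	 * deferspace is assumed to already cover ms_defer[0..1] and
	 * ms_trimmed_defer[0..1], per the ASSERT3U above. The result
	 * must equal the space map's free space (sm_free_space).
	 */
	return (allocatable + allocating + deferspace + freed +
	    trimming + trimmed);
}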
@@ -2016,7 +2026,13 @@ metaslab_aux_histograms_update(metaslab_t *msp)
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			metaslab_aux_histogram_add(msp->ms_deferhist[t],
 			    sm->sm_shift, msp->ms_defer[t]);
+			metaslab_aux_histogram_add(msp->ms_deferhist[t],
+			    sm->sm_shift, msp->ms_trimmed_defer[t]);
 		}
+		metaslab_aux_histogram_add(msp->ms_synchist,
+		    sm->sm_shift, msp->ms_trimming);
+		metaslab_aux_histogram_add(msp->ms_synchist,
+		    sm->sm_shift, msp->ms_trimmed);
 	}

 	metaslab_aux_histogram_add(msp->ms_synchist,
@@ -2407,7 +2423,13 @@ metaslab_load_impl(metaslab_t *msp)
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defer[t],
 		    range_tree_remove, msp->ms_allocatable);
+		range_tree_walk(msp->ms_trimmed_defer[t],
+		    range_tree_remove, msp->ms_allocatable);
 	}
+	range_tree_walk(msp->ms_trimming,
+	    range_tree_remove, msp->ms_allocatable);
+	range_tree_walk(msp->ms_trimmed,
+	    range_tree_remove, msp->ms_allocatable);

 	/*
 	 * Call metaslab_recalculate_weight_and_sort() now that the
@@ -2700,6 +2722,12 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
 	    type, mrap, start, shift);

 	ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_trimming = range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_trimmed = range_tree_create(NULL, type, NULL, start, shift);
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		ms->ms_trimmed_defer[t] = range_tree_create(NULL, type, NULL,
+		    start, shift);
+	}

 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms, B_FALSE);
@@ -2813,6 +2841,14 @@ metaslab_fini(metaslab_t *msp)

 	range_tree_vacate(msp->ms_trim, NULL, NULL);
 	range_tree_destroy(msp->ms_trim);
+	range_tree_vacate(msp->ms_trimming, NULL, NULL);
+	range_tree_destroy(msp->ms_trimming);
+	range_tree_vacate(msp->ms_trimmed, NULL, NULL);
+	range_tree_destroy(msp->ms_trimmed);
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		range_tree_vacate(msp->ms_trimmed_defer[t], NULL, NULL);
+		range_tree_destroy(msp->ms_trimmed_defer[t]);
+	}

 	mutex_exit(&msp->ms_lock);
 	cv_destroy(&msp->ms_load_cv);
@@ -3552,12 +3588,29 @@ metaslab_should_condense(metaslab_t *msp)
 	ASSERT(sm != NULL);
 	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);

+	if (msp->ms_condense_wanted)
+		return (B_TRUE);
+
+	uint64_t trim_numsegs = range_tree_numsegs(msp->ms_trimming) +
+	    range_tree_numsegs(msp->ms_trimmed);
+
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		trim_numsegs += range_tree_numsegs(msp->ms_trimmed_defer[t]);
+	}
+
+	/*
+	 * When there is any trim activity in flight, the estimation based on
+	 * ms_allocatable may not represent the full picture of the space map.
+	 * Skip the condense.
+	 */
+	if (trim_numsegs > 0)
+		return (B_FALSE);
+
 	/*
 	 * We always condense metaslabs that are empty and metaslabs for
 	 * which a condense request has been made.
 	 */
-	if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
-	    msp->ms_condense_wanted)
+	if (range_tree_numsegs(msp->ms_allocatable) == 0)
 		return (B_TRUE);

 	uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
@@ -3634,9 +3687,9 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */

-	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
+	zfs_dbgmsg("condensing: txg %llu, vdev id %llu, ms_id %llu, "
 	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
-	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+	    msp->ms_group->mg_vd->vdev_id, msp->ms_id,
 	    spa->spa_name, space_map_length(msp->ms_sm),
 	    range_tree_numsegs(msp->ms_allocatable),
 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
@@ -3653,7 +3706,11 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defer[t],
 		    range_tree_add, condense_tree);
+		range_tree_walk(msp->ms_trimmed_defer[t],
+		    range_tree_add, condense_tree);
 	}
+	range_tree_walk(msp->ms_trimming, range_tree_add, condense_tree);
+	range_tree_walk(msp->ms_trimmed, range_tree_add, condense_tree);

 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
@@ -3882,9 +3939,11 @@ metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
 	uint64_t sm_len_after = space_map_length(msp->ms_sm);
 	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
 		zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
-		    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
+		    "ms_id %llu, smp_allocated %llu, unflushed_allocs %llu, "
+		    "unflushed_frees %llu, "
 		    "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa),
 		    msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+		    space_map_allocated(msp->ms_sm),
 		    range_tree_space(msp->ms_unflushed_allocs),
 		    range_tree_space(msp->ms_unflushed_frees),
 		    (sm_len_after - sm_len_before));
@@ -3950,6 +4009,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 	 */
 	if (range_tree_is_empty(alloctree) &&
 	    range_tree_is_empty(msp->ms_freeing) &&
+	    range_tree_is_empty(msp->ms_trimming) &&
+	    range_tree_is_empty(msp->ms_trimmed) &&
 	    range_tree_is_empty(msp->ms_checkpointing) &&
 	    !(msp->ms_loaded && msp->ms_condense_wanted &&
 	    txg <= spa_final_dirty_txg(spa)))
@@ -4147,7 +4208,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
+			space_map_histogram_add(msp->ms_sm,
+			    msp->ms_trimmed_defer[t], tx);
 		}
+
+		space_map_histogram_add(msp->ms_sm, msp->ms_trimming, tx);
+		space_map_histogram_add(msp->ms_sm, msp->ms_trimmed, tx);
 	}

 	/*
@@ -4233,12 +4299,39 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	range_tree_t **defer_tree;
+	range_tree_t **trimmed_defer;
 	int64_t alloc_delta, defer_delta;
 	boolean_t defer_allowed = B_TRUE;

 	ASSERT(!vd->vdev_ishole);

 	mutex_enter(&msp->ms_lock);
+	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) != 0) {
+		zfs_dbgmsg("before: txg %llu, spa %s, vdev_id %llu, "
+		    "ms_id %llu, smp_allocated %llu, "
+		    "ms_unflushed_allocs %llu, ms_unflushed_frees %llu, "
+		    "freeing %llu, freed %llu, defer %llu + %llu, "
+		    "trimming %llu, trimmed %llu, trimmed_defer %llu + %llu, "
+		    "allocatable %llu, ms_allocated_space %llu, "
+		    "ms_deferspace %llu, ms_weight %llu\n",
+		    spa_syncing_txg(spa), spa_name(spa),
+		    msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+		    space_map_allocated(msp->ms_sm),
+		    range_tree_space(msp->ms_unflushed_allocs),
+		    range_tree_space(msp->ms_unflushed_frees),
+		    range_tree_space(msp->ms_freeing),
+		    range_tree_space(msp->ms_freed),
+		    range_tree_space(msp->ms_defer[0]),
+		    range_tree_space(msp->ms_defer[1]),
+		    range_tree_space(msp->ms_trimming),
+		    range_tree_space(msp->ms_trimmed),
+		    range_tree_space(msp->ms_trimmed_defer[0]),
+		    range_tree_space(msp->ms_trimmed_defer[1]),
+		    range_tree_space(msp->ms_allocatable),
+		    msp->ms_allocated_space,
+		    msp->ms_deferspace,
+		    msp->ms_weight);
+	}

 	if (msp->ms_new) {
 		/* this is a new metaslab, add its capacity to the vdev */
@@ -4253,6 +4346,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	ASSERT0(range_tree_space(msp->ms_checkpointing));

 	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
+	trimmed_defer = &msp->ms_trimmed_defer[txg % TXG_DEFER_SIZE];

 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
 	    metaslab_class_get_alloc(spa_normal_class(spa));
@@ -4265,10 +4359,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	    range_tree_space(msp->ms_freed);

 	if (defer_allowed) {
-		defer_delta = range_tree_space(msp->ms_freed) -
-		    range_tree_space(*defer_tree);
+		if (!metaslab_trimmed_defer_disable) {
+			defer_delta = range_tree_space(msp->ms_freed) +
+			    range_tree_space(msp->ms_trimmed) -
+			    range_tree_space(*defer_tree) -
+			    range_tree_space(*trimmed_defer);
+		} else {
+			defer_delta = range_tree_space(msp->ms_freed) -
+			    range_tree_space(*defer_tree) -
+			    range_tree_space(*trimmed_defer);
+		}
 	} else {
-		defer_delta -= range_tree_space(*defer_tree);
+		defer_delta -= range_tree_space(*defer_tree) +
+		    range_tree_space(*trimmed_defer);
 	}
 	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
 	    defer_delta, 0);
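The branching above is dense, so a toy calculation may help; all sizes
are hypothetical:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t freed = 8 << 20;		/* ms_freed this txg */
	uint64_t trimmed = 4 << 20;		/* ms_trimmed this txg */
	uint64_t defer = 5 << 20;		/* *defer_tree being recycled */
	uint64_t trimmed_defer = 2 << 20;	/* *trimmed_defer, recycled */

	/* The defer_allowed && !metaslab_trimmed_defer_disable branch: */
	int64_t defer_delta = (int64_t)(freed + trimmed) -
	    (int64_t)(defer + trimmed_defer);

	/* Prints +5 MiB: ms_deferspace grows by this much this txg. */
	printf("defer_delta = %lld bytes\n", (long long)defer_delta);
	return (0);
}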
@@ -4320,6 +4423,16 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 		    msp->ms_allocatable);
 	}

+	range_tree_vacate(*trimmed_defer,
+	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
+	if (defer_allowed && !metaslab_trimmed_defer_disable) {
+		range_tree_swap(&msp->ms_trimmed, trimmed_defer);
+	} else {
+		range_tree_vacate(msp->ms_trimmed,
+		    msp->ms_loaded ? range_tree_add : NULL,
+		    msp->ms_allocatable);
+	}
+
 	msp->ms_synced_length = space_map_length(msp->ms_sm);

 	msp->ms_deferspace += defer_delta;
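The vacate/swap pair gives trimmed ranges the same TXG_DEFER_SIZE-txg
quarantine that ms_defer[] gives ordinary frees. A toy, sizes-only
model of the ring (numbers invented for illustration):

#include <stdio.h>

#define TXG_DEFER_SIZE 2

int
main(void)
{
	unsigned long long trimmed_defer[TXG_DEFER_SIZE] = { 0, 0 };
	unsigned long long allocatable = 0;
	unsigned long long trimmed[] = { 100, 0, 40, 0, 25 };

	for (unsigned long long txg = 10; txg < 15; txg++) {
		unsigned long long *bucket =
		    &trimmed_defer[txg % TXG_DEFER_SIZE];
		allocatable += *bucket;		/* range_tree_vacate() */
		*bucket = trimmed[txg - 10];	/* range_tree_swap()   */
		printf("txg %llu: allocatable %llu\n", txg, allocatable);
	}
	/* The 100 units trimmed in txg 10 become allocatable in txg 12. */
	return (0);
}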
@@ -4353,6 +4466,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 	msp->ms_allocating_total -= msp->ms_allocated_this_txg;
 	msp->ms_allocated_this_txg = 0;
+
+	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) != 0) {
+		zfs_dbgmsg("after: txg %llu, spa %s, vdev_id %llu, "
+		    "ms_id %llu, smp_allocated %llu, "
+		    "ms_unflushed_allocs %llu, ms_unflushed_frees %llu, "
+		    "freeing %llu, freed %llu, defer %llu + %llu, "
+		    "trimming %llu, trimmed %llu, trimmed_defer %llu + %llu, "
+		    "allocatable %llu, ms_allocated_space %llu, "
+		    "ms_deferspace %llu, ms_weight %llu\n",
+		    spa_syncing_txg(spa), spa_name(spa),
+		    msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+		    space_map_allocated(msp->ms_sm),
+		    range_tree_space(msp->ms_unflushed_allocs),
+		    range_tree_space(msp->ms_unflushed_frees),
+		    range_tree_space(msp->ms_freeing),
+		    range_tree_space(msp->ms_freed),
+		    range_tree_space(msp->ms_defer[0]),
+		    range_tree_space(msp->ms_defer[1]),
+		    range_tree_space(msp->ms_trimming),
+		    range_tree_space(msp->ms_trimmed),
+		    range_tree_space(msp->ms_trimmed_defer[0]),
+		    range_tree_space(msp->ms_trimmed_defer[1]),
+		    range_tree_space(msp->ms_allocatable),
+		    msp->ms_allocated_space,
+		    msp->ms_deferspace,
+		    msp->ms_weight);
+	}
+
 	mutex_exit(&msp->ms_lock);
 }