Skip to content

Commit

Permalink
Prefetch on deadlists merge.
Browse files Browse the repository at this point in the history
During snapshot deletion ZFS may issue several reads for each deadlist
to merge them into next snapshot's or pool's bpobj.  Number of the dead
lists increases with number of snapshots.  On HDD pools it may take
significant time during which sync thread is blocked.

This patch introduces prescient prefetch of required blocks for up to
128 deadlists ahead.  Tests show reduction of time required to delete
dataset with 720 snapshots with randomly overwritten file on wide HDD
pool from 75-85 to 22-28 seconds.

Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
  • Loading branch information
amotin committed Jan 18, 2023
1 parent a379083 commit 626f17a
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 6 deletions.
1 change: 1 addition & 0 deletions include/sys/bpobj.h
Expand Up @@ -87,6 +87,7 @@ int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func,
void *arg, int64_t start);

void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
void bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx);

Expand Down
62 changes: 60 additions & 2 deletions module/zfs/bpobj.c
Expand Up @@ -663,14 +663,13 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
}

VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

if (bpobj_is_empty(&subbpo)) {
/* No point in having an empty subobj. */
bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx);
return;
}
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

mutex_enter(&bpo->bpo_lock);
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
Expand Down Expand Up @@ -780,6 +779,65 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)

}

/*
* Prefetch metadata required for bpobj_enqueue_subobj().
*/
void
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
{
dmu_object_info_t doi;
bpobj_t subbpo;
uint64_t subsubobjs;
boolean_t copy_subsub = B_TRUE;
boolean_t copy_bps = B_TRUE;

ASSERT(bpobj_is_open(bpo));
ASSERT(subobj != 0);

if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
return;

VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
if (bpobj_is_empty(&subbpo)) {
bpobj_close(&subbpo);
return;
}
subsubobjs = subbpo.bpo_phys->bpo_subobjs;
bpobj_close(&subbpo);

if (subsubobjs != 0) {
VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
if (doi.doi_max_offset > doi.doi_data_block_size)
copy_subsub = B_FALSE;
}

VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
copy_bps = B_FALSE;

if (copy_subsub && subsubobjs != 0) {
if (bpo->bpo_phys->bpo_subobjs) {
dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
ZIO_PRIORITY_ASYNC_READ);
}
dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
ZIO_PRIORITY_ASYNC_READ);
}

if (copy_bps) {
dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
ZIO_PRIORITY_ASYNC_READ);
dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
ZIO_PRIORITY_ASYNC_READ);
} else if (bpo->bpo_phys->bpo_subobjs) {
dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
ZIO_PRIORITY_ASYNC_READ);
}
}

void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
Expand Down
63 changes: 59 additions & 4 deletions module/zfs/dsl_deadlist.c
Expand Up @@ -438,6 +438,18 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
}
}

/*
* Prefetch metadata required for dle_enqueue_subobj().
*/
static void
dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
uint64_t obj)
{
if (dle->dle_bpobj.bpo_object !=
dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)
bpobj_prefetch_subobj(&dle->dle_bpobj, obj);
}

void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
Expand Down Expand Up @@ -810,6 +822,27 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
dle_enqueue_subobj(dl, dle, obj, tx);
}

/*
* Prefetch metadata required for dsl_deadlist_insert_bpobj().
*/
static void
dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)
{
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_entry_t *dle;
avl_index_t where;

ASSERT(MUTEX_HELD(&dl->dl_lock));

dsl_deadlist_load_tree(dl);

dle_tofind.dle_mintxg = birth;
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
dle_prefetch_subobj(dl, dle, obj);
}

static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
Expand All @@ -826,12 +859,12 @@ dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
zap_cursor_t zc;
zap_attribute_t za;
zap_cursor_t zc, pzc;
zap_attribute_t za, pza;
dmu_buf_t *bonus;
dsl_deadlist_phys_t *dlp;
dmu_object_info_t doi;
int error;
int error, perror, i;

VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
if (doi.doi_type == DMU_OT_BPOBJ) {
Expand All @@ -843,15 +876,28 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
}

mutex_enter(&dl->dl_lock);
for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
(perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128;
zap_cursor_advance(&pzc), i++) {
dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
zfs_strtonum(pza.za_name, NULL));
}
for (zap_cursor_init(&zc, dl->dl_os, obj);
(error = zap_cursor_retrieve(&zc, &za)) == 0;
zap_cursor_advance(&zc)) {
uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
if (perror == 0) {
dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
zfs_strtonum(pza.za_name, NULL));
zap_cursor_advance(&pzc);
perror = zap_cursor_retrieve(&pzc, &pza);
}
}
VERIFY3U(error, ==, ENOENT);
zap_cursor_fini(&zc);
zap_cursor_fini(&pzc);

VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
dlp = bonus->db_data;
Expand All @@ -869,8 +915,9 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx)
{
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_entry_t *dle;
dsl_deadlist_entry_t *dle, *pdle;
avl_index_t where;
int i;

ASSERT(!dl->dl_oldfmt);

Expand All @@ -882,11 +929,19 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
for (pdle = dle, i = 0; pdle && i < 128; ) {
bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
pdle = AVL_NEXT(&dl->dl_tree, pdle);
}
while (dle) {
uint64_t used, comp, uncomp;
dsl_deadlist_entry_t *dle_next;

bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
if (pdle) {
bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
pdle = AVL_NEXT(&dl->dl_tree, pdle);
}

VERIFY0(bpobj_space(&dle->dle_bpobj,
&used, &comp, &uncomp));
Expand Down

0 comments on commit 626f17a

Please sign in to comment.