Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prefetch on deadlists merge. #14402

Merged
merged 1 commit into from Jan 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions include/sys/bpobj.h
Expand Up @@ -87,6 +87,7 @@ int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func,
void *arg, int64_t start);

void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
void bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx);

Expand Down
65 changes: 63 additions & 2 deletions module/zfs/bpobj.c
Expand Up @@ -663,14 +663,13 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
}

VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

if (bpobj_is_empty(&subbpo)) {
/* No point in having an empty subobj. */
bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx);
return;
}
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

mutex_enter(&bpo->bpo_lock);
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
Expand Down Expand Up @@ -780,6 +779,68 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)

}

/*
 * Prefetch metadata required for bpobj_enqueue_subobj().
 *
 * This is purely advisory: if the sub-bpobj cannot be opened, or object
 * info cannot be obtained, the function returns without prefetching
 * anything further and without reporting an error.
 *
 * bpo    - open bpobj that subobj is expected to be enqueued into
 * subobj - object number of the bpobj to be enqueued (must be non-zero)
 */
void
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
{
	dmu_object_info_t info;
	bpobj_t child;
	uint64_t nested;
	boolean_t fold_nested, fold_bps;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);

	/* Nothing to prefetch for the pool's dp_empty_bpobj. */
	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
		return;

	/* Peek at the sub-bpobj just long enough to learn its subobj list. */
	if (bpobj_open(&child, bpo->bpo_os, subobj) != 0)
		return;
	if (bpobj_is_empty(&child)) {
		bpobj_close(&child);
		return;
	}
	nested = child.bpo_phys->bpo_subobjs;
	bpobj_close(&child);

	/*
	 * NOTE(review): the two flags below presumably mirror the
	 * copy-vs-reference decisions taken by bpobj_enqueue_subobj();
	 * confirm against that function.  A flag stays true only while
	 * the object's max offset does not exceed one data block.
	 */
	fold_nested = B_TRUE;
	if (nested != 0) {
		if (dmu_object_info(bpo->bpo_os, nested, &info) != 0)
			return;
		if (info.doi_max_offset > info.doi_data_block_size)
			fold_nested = B_FALSE;
	}

	if (dmu_object_info(bpo->bpo_os, subobj, &info) != 0)
		return;
	fold_bps = (fold_nested &&
	    info.doi_max_offset <= info.doi_data_block_size) ?
	    B_TRUE : B_FALSE;

	if (fold_nested && nested != 0) {
		/* Parent's subobj array first, then the nested subobj array. */
		if (bpo->bpo_phys->bpo_subobjs) {
			dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
			    ZIO_PRIORITY_ASYNC_READ);
		}
		dmu_prefetch(bpo->bpo_os, nested, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}

	if (!fold_bps) {
		/* Only the parent's subobj array is prefetched on this path. */
		if (bpo->bpo_phys->bpo_subobjs) {
			dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
			    ZIO_PRIORITY_ASYNC_READ);
		}
		return;
	}

	/* Parent's block pointer array and the start of the sub-bpobj. */
	dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
	    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
	    ZIO_PRIORITY_ASYNC_READ);
	dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
	    ZIO_PRIORITY_ASYNC_READ);
}

void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
Expand Down
71 changes: 67 additions & 4 deletions module/zfs/dsl_deadlist.c
Expand Up @@ -438,6 +438,18 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
}
}

/*
 * Prefetch metadata required for dle_enqueue_subobj().
 *
 * Entries whose bpobj is the pool's dp_empty_bpobj are skipped; for all
 * others the work is delegated to bpobj_prefetch_subobj().
 */
static void
dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    uint64_t obj)
{
	if (dle->dle_bpobj.bpo_object ==
	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)
		return;

	bpobj_prefetch_subobj(&dle->dle_bpobj, obj);
}

void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
Expand Down Expand Up @@ -810,6 +822,27 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
dle_enqueue_subobj(dl, dle, obj, tx);
}

/*
 * Prefetch metadata required for dsl_deadlist_insert_bpobj().
 *
 * Looks up the deadlist entry keyed by 'birth' (or, when there is no
 * exact match, the nearest entry before it) and prefetches what is
 * needed to enqueue 'obj' into that entry.  The caller must hold
 * dl->dl_lock.
 */
static void
dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)
{
	dsl_deadlist_entry_t key;
	dsl_deadlist_entry_t *target;
	avl_index_t pos;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	dsl_deadlist_load_tree(dl);

	key.dle_mintxg = birth;
	target = avl_find(&dl->dl_tree, &key, &pos);
	if (target == NULL) {
		/* No exact match: fall back to the preceding entry. */
		target = avl_nearest(&dl->dl_tree, pos, AVL_BEFORE);
	}
	dle_prefetch_subobj(dl, target, obj);
}

static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
Expand All @@ -826,12 +859,12 @@ dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
zap_cursor_t zc;
zap_attribute_t za;
zap_cursor_t zc, pzc;
zap_attribute_t za, pza;
dmu_buf_t *bonus;
dsl_deadlist_phys_t *dlp;
dmu_object_info_t doi;
int error;
int error, perror, i;

VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
if (doi.doi_type == DMU_OT_BPOBJ) {
Expand All @@ -843,15 +876,32 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
}

mutex_enter(&dl->dl_lock);
/*
* Prefetch up to 128 deadlists first and then more as we progress.
* The limit is a balance between ARC use and diminishing returns.
*/
for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
(perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128;
amotin marked this conversation as resolved.
Show resolved Hide resolved
zap_cursor_advance(&pzc), i++) {
dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
zfs_strtonum(pza.za_name, NULL));
}
for (zap_cursor_init(&zc, dl->dl_os, obj);
(error = zap_cursor_retrieve(&zc, &za)) == 0;
zap_cursor_advance(&zc)) {
uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
if (perror == 0) {
dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
zfs_strtonum(pza.za_name, NULL));
zap_cursor_advance(&pzc);
perror = zap_cursor_retrieve(&pzc, &pza);
}
}
VERIFY3U(error, ==, ENOENT);
zap_cursor_fini(&zc);
zap_cursor_fini(&pzc);

VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
dlp = bonus->db_data;
Expand All @@ -869,8 +919,9 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx)
{
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_entry_t *dle;
dsl_deadlist_entry_t *dle, *pdle;
avl_index_t where;
int i;

ASSERT(!dl->dl_oldfmt);

Expand All @@ -882,11 +933,23 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
/*
* Prefetch up to 128 deadlists first and then more as we progress.
* The limit is a balance between ARC use and diminishing returns.
*/
for (pdle = dle, i = 0; pdle && i < 128; ) {
bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
pdle = AVL_NEXT(&dl->dl_tree, pdle);
}
while (dle) {
uint64_t used, comp, uncomp;
dsl_deadlist_entry_t *dle_next;

bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
if (pdle) {
bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
pdle = AVL_NEXT(&dl->dl_tree, pdle);
}

VERIFY0(bpobj_space(&dle->dle_bpobj,
&used, &comp, &uncomp));
Expand Down