Skip to content

Commit 8951cb8

Browse files
Alex Reece authored and behlendorf committed
Illumos 4873 - zvol unmap calls can take a very long time for larger datasets
4873 zvol unmap calls can take a very long time for larger datasets

Author: Alex Reece <alex@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com>
Reviewed by: Basil Crow <basil.crow@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Robert Mustacchi <rm@joyent.com>

References:
  https://www.illumos.org/issues/4873
  illumos/illumos-gate@0f6d88a

Porting Notes:
  dbuf_free_range():
  - reduce stack usage using kmem_alloc()
  - the sorted AVL tree will handle the spill block case correctly
    without all the special handling in the for() loop

Ported-by: Chris Dunlop <chris@onthe.net.au>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
1 parent 58c4aa0 commit 8951cb8

File tree

7 files changed

+138
-49
lines changed

7 files changed

+138
-49
lines changed

include/sys/avl.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
* Use is subject to license terms.
2424
*/
2525

26+
/*
27+
* Copyright (c) 2014 by Delphix. All rights reserved.
28+
*/
29+
2630
#ifndef _AVL_H
2731
#define _AVL_H
2832

@@ -259,6 +263,11 @@ extern boolean_t avl_update(avl_tree_t *, void *);
259263
extern boolean_t avl_update_lt(avl_tree_t *, void *);
260264
extern boolean_t avl_update_gt(avl_tree_t *, void *);
261265

266+
/*
267+
* Swaps the contents of the two trees.
268+
*/
269+
extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
270+
262271
/*
263272
* Return the number of nodes in the tree
264273
*/

include/sys/dbuf.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2013 by Delphix. All rights reserved.
23+
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
2424
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
2525
*/
2626

@@ -213,11 +213,14 @@ typedef struct dmu_buf_impl {
213213
/* pointer to most recent dirty record for this buffer */
214214
dbuf_dirty_record_t *db_last_dirty;
215215

216+
/* Creation time of dbuf (see comment in dbuf_compare). */
217+
hrtime_t db_creation;
218+
216219
/*
217220
* Our link on the owner dnode's dn_dbufs AVL tree.
218221
* Protected by its dn_dbufs_mtx.
219222
*/
220-
list_node_t db_link;
223+
avl_node_t db_link;
221224

222225
/* Data which is unique to data (leaf) blocks: */
223226

include/sys/dnode.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ typedef struct dnode {
233233
refcount_t dn_holds;
234234

235235
kmutex_t dn_dbufs_mtx;
236-
list_t dn_dbufs; /* descendent dbufs */
236+
avl_tree_t dn_dbufs; /* descendent dbufs */
237237

238238
/* protected by dn_struct_rwlock */
239239
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */

module/avl/avl.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
* Use is subject to license terms.
2424
*/
2525

26+
/*
27+
* Copyright (c) 2014 by Delphix. All rights reserved.
28+
*/
29+
2630
/*
2731
* AVL - generic AVL tree implementation for kernel use
2832
*
@@ -85,6 +89,12 @@
8589
* is a modified "avl_node_t *". The bottom bit (normally 0 for a
8690
* pointer) is set to indicate that the new node has a value greater
8791
* than the value of the indicated "avl_node_t *".
92+
*
93+
* Note - in addition to userland (e.g. libavl and libutil) and the kernel
94+
* (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
95+
* which each have their own compilation environments and subsequent
96+
* requirements. Each of these environments must be considered when adding
97+
* dependencies from avl.c.
8898
*/
8999

90100
#include <sys/types.h>
@@ -863,6 +873,24 @@ avl_update(avl_tree_t *t, void *obj)
863873
return (B_FALSE);
864874
}
865875

876+
void
877+
avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
878+
{
879+
avl_node_t *temp_node;
880+
ulong_t temp_numnodes;
881+
882+
ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
883+
ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
884+
ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
885+
886+
temp_node = tree1->avl_root;
887+
temp_numnodes = tree1->avl_numnodes;
888+
tree1->avl_root = tree2->avl_root;
889+
tree1->avl_numnodes = tree2->avl_numnodes;
890+
tree2->avl_root = temp_node;
891+
tree2->avl_numnodes = temp_numnodes;
892+
}
893+
866894
/*
867895
* initialize a new AVL tree
868896
*/
@@ -1058,6 +1086,8 @@ EXPORT_SYMBOL(avl_first);
10581086
EXPORT_SYMBOL(avl_last);
10591087
EXPORT_SYMBOL(avl_nearest);
10601088
EXPORT_SYMBOL(avl_add);
1089+
EXPORT_SYMBOL(avl_swap);
1090+
EXPORT_SYMBOL(avl_is_empty);
10611091
EXPORT_SYMBOL(avl_remove);
10621092
EXPORT_SYMBOL(avl_numnodes);
10631093
EXPORT_SYMBOL(avl_destroy_nodes);

module/zfs/dbuf.c

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,9 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
9393
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
9494
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
9595
refcount_create(&db->db_holds);
96-
list_link_init(&db->db_link);
96+
97+
db->db_creation = gethrtime();
98+
9799
return (0);
98100
}
99101

@@ -386,7 +388,7 @@ dbuf_verify(dmu_buf_impl_t *db)
386388
ASSERT3U(db->db_level, <, dn->dn_nlevels);
387389
ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
388390
db->db_blkid == DMU_SPILL_BLKID ||
389-
!list_is_empty(&dn->dn_dbufs));
391+
!avl_is_empty(&dn->dn_dbufs));
390392
}
391393
if (db->db_blkid == DMU_BONUS_BLKID) {
392394
ASSERT(dn != NULL);
@@ -866,23 +868,34 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
866868
* receive; see comment below for details.
867869
*/
868870
void
869-
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
871+
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
872+
dmu_tx_t *tx)
870873
{
871-
dmu_buf_impl_t *db, *db_next;
874+
dmu_buf_impl_t *db, *db_next, *db_search;
872875
uint64_t txg = tx->tx_txg;
876+
avl_index_t where;
873877
boolean_t freespill =
874-
(start == DMU_SPILL_BLKID || end == DMU_SPILL_BLKID);
878+
(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID);
879+
880+
if (end_blkid > dn->dn_maxblkid && !freespill)
881+
end_blkid = dn->dn_maxblkid;
882+
dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
875883

876-
if (end > dn->dn_maxblkid && !freespill)
877-
end = dn->dn_maxblkid;
878-
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
884+
db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
885+
db_search->db_level = 0;
886+
db_search->db_blkid = start_blkid;
887+
db_search->db_creation = 0;
879888

880889
mutex_enter(&dn->dn_dbufs_mtx);
881-
if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz &&
882-
!freespill) {
890+
if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) {
883891
/* There can't be any dbufs in this range; no need to search. */
884-
mutex_exit(&dn->dn_dbufs_mtx);
885-
return;
892+
#ifdef DEBUG
893+
db = avl_find(&dn->dn_dbufs, db_search, &where);
894+
ASSERT3P(db, ==, NULL);
895+
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
896+
ASSERT(db == NULL || db->db_level > 0);
897+
#endif
898+
goto out;
886899
} else if (dmu_objset_is_receiving(dn->dn_objset)) {
887900
/*
888901
* If we are receiving, we expect there to be no dbufs in
@@ -894,19 +907,18 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
894907
atomic_inc_64(&zfs_free_range_recv_miss);
895908
}
896909

897-
for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
898-
db_next = list_next(&dn->dn_dbufs, db);
910+
db = avl_find(&dn->dn_dbufs, db_search, &where);
911+
ASSERT3P(db, ==, NULL);
912+
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
913+
914+
for (; db != NULL; db = db_next) {
915+
db_next = AVL_NEXT(&dn->dn_dbufs, db);
899916
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
900917

901-
/* Skip indirect blocks. */
902-
if (db->db_level != 0)
903-
continue;
904-
/* Skip direct blocks outside the range. */
905-
if (!freespill && (db->db_blkid < start || db->db_blkid > end))
906-
continue;
907-
/* Skip all direct blocks, only free spill blocks. */
908-
if (freespill && (db->db_blkid != DMU_SPILL_BLKID))
909-
continue;
918+
if (db->db_level != 0 || db->db_blkid > end_blkid) {
919+
break;
920+
}
921+
ASSERT3U(db->db_blkid, >=, start_blkid);
910922

911923
/* found a level 0 buffer in the range */
912924
mutex_enter(&db->db_mtx);
@@ -968,6 +980,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
968980

969981
mutex_exit(&db->db_mtx);
970982
}
983+
984+
out:
985+
kmem_free(db_search, sizeof (dmu_buf_impl_t));
971986
mutex_exit(&dn->dn_dbufs_mtx);
972987
}
973988

@@ -1657,7 +1672,7 @@ dbuf_clear(dmu_buf_impl_t *db)
16571672
dn = DB_DNODE(db);
16581673
dndb = dn->dn_dbuf;
16591674
if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1660-
list_remove(&dn->dn_dbufs, db);
1675+
avl_remove(&dn->dn_dbufs, db);
16611676
atomic_dec_32(&dn->dn_dbufs_count);
16621677
membar_producer();
16631678
DB_DNODE_EXIT(db);
@@ -1829,7 +1844,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
18291844
mutex_exit(&dn->dn_dbufs_mtx);
18301845
return (odb);
18311846
}
1832-
list_insert_head(&dn->dn_dbufs, db);
1847+
avl_add(&dn->dn_dbufs, db);
18331848
if (db->db_level == 0 && db->db_blkid >=
18341849
dn->dn_unlisted_l0_blkid)
18351850
dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
@@ -1888,7 +1903,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
18881903
DB_DNODE_ENTER(db);
18891904
dn = DB_DNODE(db);
18901905
mutex_enter(&dn->dn_dbufs_mtx);
1891-
list_remove(&dn->dn_dbufs, db);
1906+
avl_remove(&dn->dn_dbufs, db);
18921907
atomic_dec_32(&dn->dn_dbufs_count);
18931908
mutex_exit(&dn->dn_dbufs_mtx);
18941909
DB_DNODE_EXIT(db);
@@ -1906,7 +1921,6 @@ dbuf_destroy(dmu_buf_impl_t *db)
19061921
db->db_parent = NULL;
19071922
db->db_buf = NULL;
19081923

1909-
ASSERT(!list_link_active(&db->db_link));
19101924
ASSERT(db->db.db_data == NULL);
19111925
ASSERT(db->db_hash_next == NULL);
19121926
ASSERT(db->db_blkptr == NULL);

module/zfs/dnode.c

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,43 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
6262
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
6363
#endif /* _KERNEL */
6464

65+
static int
66+
dbuf_compare(const void *x1, const void *x2)
67+
{
68+
const dmu_buf_impl_t *d1 = x1;
69+
const dmu_buf_impl_t *d2 = x2;
70+
71+
if (d1->db_level < d2->db_level) {
72+
return (-1);
73+
} else if (d1->db_level > d2->db_level) {
74+
return (1);
75+
}
76+
77+
if (d1->db_blkid < d2->db_blkid) {
78+
return (-1);
79+
} else if (d1->db_blkid > d2->db_blkid) {
80+
return (1);
81+
}
82+
83+
/*
84+
* If a dbuf is being evicted while dn_dbufs_mtx is not held, we set
85+
* the db_state to DB_EVICTING but do not remove it from dn_dbufs. If
86+
* another thread creates a dbuf of the same blkid before the dbuf is
87+
* removed from dn_dbufs, we can reach a state where there are two
88+
* dbufs of the same blkid and level in dn_dbufs. To maintain the avl
89+
* invariant that there cannot be duplicate items, we distinguish
90+
* between these two dbufs based on the time they were created.
91+
*/
92+
if (d1->db_creation < d2->db_creation) {
93+
return (-1);
94+
} else if (d1->db_creation > d2->db_creation) {
95+
return (1);
96+
} else {
97+
ASSERT3P(d1, ==, d2);
98+
return (0);
99+
}
100+
}
101+
65102
/* ARGSUSED */
66103
static int
67104
dnode_cons(void *arg, void *unused, int kmflag)
@@ -116,7 +153,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
116153

117154
dn->dn_dbufs_count = 0;
118155
dn->dn_unlisted_l0_blkid = 0;
119-
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
156+
avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
120157
offsetof(dmu_buf_impl_t, db_link));
121158

122159
dn->dn_moved = 0;
@@ -169,7 +206,7 @@ dnode_dest(void *arg, void *unused)
169206

170207
ASSERT0(dn->dn_dbufs_count);
171208
ASSERT0(dn->dn_unlisted_l0_blkid);
172-
list_destroy(&dn->dn_dbufs);
209+
avl_destroy(&dn->dn_dbufs);
173210
}
174211

175212
void
@@ -503,7 +540,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
503540
ASSERT0(dn->dn_assigned_txg);
504541
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
505542
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
506-
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
543+
ASSERT(avl_is_empty(&dn->dn_dbufs));
507544

508545
for (i = 0; i < TXG_SIZE; i++) {
509546
ASSERT0(dn->dn_next_nblkptr[i]);
@@ -689,8 +726,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
689726
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
690727
ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
691728
refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
692-
ASSERT(list_is_empty(&ndn->dn_dbufs));
693-
list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
729+
ASSERT(avl_is_empty(&ndn->dn_dbufs));
730+
avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
694731
ndn->dn_dbufs_count = odn->dn_dbufs_count;
695732
ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
696733
ndn->dn_bonus = odn->dn_bonus;
@@ -724,7 +761,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
724761
*/
725762
odn->dn_dbuf = NULL;
726763
odn->dn_handle = NULL;
727-
list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
764+
avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
728765
offsetof(dmu_buf_impl_t, db_link));
729766
odn->dn_dbufs_count = 0;
730767
odn->dn_unlisted_l0_blkid = 0;
@@ -1238,7 +1275,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
12381275
return;
12391276
}
12401277

1241-
ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
1278+
ASSERT(!refcount_is_zero(&dn->dn_holds) ||
1279+
!avl_is_empty(&dn->dn_dbufs));
12421280
ASSERT(dn->dn_datablksz != 0);
12431281
ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
12441282
ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
@@ -1311,7 +1349,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
13111349
int
13121350
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
13131351
{
1314-
dmu_buf_impl_t *db, *db_next;
1352+
dmu_buf_impl_t *db;
13151353
int err;
13161354

13171355
if (size == 0)
@@ -1334,9 +1372,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
13341372
goto fail;
13351373

13361374
mutex_enter(&dn->dn_dbufs_mtx);
1337-
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
1338-
db_next = list_next(&dn->dn_dbufs, db);
1339-
1375+
for (db = avl_first(&dn->dn_dbufs); db != NULL;
1376+
db = AVL_NEXT(&dn->dn_dbufs, db)) {
13401377
if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
13411378
db->db_blkid != DMU_SPILL_BLKID) {
13421379
mutex_exit(&dn->dn_dbufs_mtx);

0 commit comments

Comments
 (0)